My current code seems to work ok… But if you see room for improvement, please let me know.
import os
import pandas as pd
from boxsdk import OAuth2, Client
import dotenv
def store_tokens(access_token, refresh_token):
    """Callback function to store new tokens.

    Handed to ``OAuth2`` as its ``store_tokens`` callback. Despite the name,
    this implementation does not persist anything: it only echoes both
    values to standard output.

    Args:
        access_token: The new access token.
        refresh_token: The new refresh token.
    """
    # NOTE(review): this writes credentials to stdout in clear text; consider
    # persisting them somewhere safe instead of printing them.
    print(f"New access token: {access_token}")
    print(f"New refresh token: {refresh_token}")
def authenticate_box_client():
    """Authenticate and return Box client.

    Loads a ``.env`` file via ``dotenv.load_dotenv()``, then reads
    ``BOX_CLIENT_ID``, ``BOX_CLIENT_SECRET`` and ``BOX_ACCESS_TOKEN`` from
    the environment and builds an ``OAuth2`` object from them.

    Returns:
        A ``Client`` wrapping the ``OAuth2`` object.
    """
    dotenv.load_dotenv()
    # os.getenv returns None for an unset variable; the values are passed
    # through to OAuth2 unchanged.
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')
    auth = OAuth2(
        client_id=client_id,
        client_secret=client_secret,
        access_token=access_token,
        store_tokens=store_tokens,
    )
    return Client(auth)
def find_and_download_file(client, filename, folder_id, download_path, df, index, max_items=2):
    """Find and download file from Box.

    Runs a search for ``filename`` and inspects the first ``max_items``
    results for an item whose name equals ``filename`` exactly. The outcome
    is recorded in the ``'Status'`` column of ``df`` at row ``index``:
    ``'Downloaded'`` once the download has finished, ``'Not Found'`` when no
    inspected result matched.

    Args:
        client: Object whose ``search().query(...)`` returns an iterable of
            items exposing a ``name`` attribute.
        filename: Exact file name to look for.
        folder_id: Folder id passed to the search as the only entry of
            ``ancestor_folder_ids``.
        download_path: Local directory handed to ``download_file``.
        df: DataFrame whose ``'Status'`` cell is updated in place.
        index: Row label of ``df`` to update.
        max_items: Maximum number of search results to inspect before
            giving up. At least one result is always inspected.

    Returns:
        None.
    """
    search_results = client.search().query(
        query=filename,
        limit=200,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf"],
        type="file",
    )

    match = None
    for inspected, item in enumerate(search_results, start=1):
        print(f" - {item.name}")
        if item.name == filename:
            print(f"Found file {item.name}.")
            match = item
            break
        # Stop early instead of walking the whole result set.
        if inspected >= max_items:
            break

    if match is None:
        print(f"File {filename} not found. Updating DataFrame and moving to next file.")
        df.at[index, 'Status'] = 'Not Found'
        return

    download_file(match, download_path)
    # Mark the row only after the download returned, so a failed download
    # is never recorded as 'Downloaded'.
    df.at[index, 'Status'] = 'Downloaded'
def download_file(file_to_download, download_path):
    """Download file from Box to local system.

    The content is written to ``download_path`` joined with the item's
    ``name``; an existing file at that path is overwritten.

    Args:
        file_to_download: Item exposing ``name`` and ``download_to(stream)``.
        download_path: Existing local directory to write into.
    """
    print(f"Found file {file_to_download.name}. Downloading …")
    item_download_path = os.path.join(download_path, file_to_download.name)
    # Binary mode: the stream receives the raw file content.
    with open(item_download_path, 'wb') as f:
        file_to_download.download_to(f)
    print(f"Download completed for {file_to_download.name}.")
def main():
    """Download every file listed in ``filenames.xlsx`` from Box.

    Reads the ``'Filename'`` column of ``filenames.xlsx``, tries to find and
    download each file into ``download_folder`` (created if missing), then
    writes the DataFrame back to ``filenames.xlsx``. Any exception is caught
    and reported on stdout together with the file name and folder id that
    were being processed.
    """
    # Bound before the try block so the except handler can always print
    # them, even when the failure happens before the loop starts.
    filename = None
    folder_id = 'myfolderID'
    try:
        client = authenticate_box_client()
        df = pd.read_excel('filenames.xlsx')
        download_path = 'download_folder'
        os.makedirs(download_path, exist_ok=True)
        for index, row in df.iterrows():
            filename = row['Filename']
            print(f"Searching for {filename} ...")
            find_and_download_file(client, filename, folder_id, download_path, df, index)
        # Written once after the loop; an exception above skips this save.
        df.to_excel('filenames.xlsx', index=False)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):
            print("Context Info:", e.context_info)
        print("Debug Information:")
        print(f"Filename: {filename}")
        print(f"Folder ID: {folder_id}")
# Run the script only when executed directly, not when imported.
if __name__ == '__main__':
    main()
I have adapted my script and it looks like it is working now!
Hi @edztra, I guess I was too late in answering your other question.
My only suggestion for your find_and_download_file function, on top of what you already have, is to limit the search to only the name of the file. You should get even fewer false positives.
Something like this:
def simple_search(query: str, content_types: Iterable[str] = None) -> Iterable["Item"]:
    """Search by query in any Box content.

    Uses the module-level ``client``. ``content_types`` is forwarded to the
    search unchanged.
    """
    search_endpoint = client.search()
    matches = search_endpoint.query(query=query, content_types=content_types)
    return matches
# Match the query against item names only
search_results = simple_search("ananas", content_types=["name"])
print_search_results(search_results)
By default, the search looks for matches in the name, description, tags, comments, and the first 10k bytes of the file's content. Limiting the search to the name only should give you better results.
Best regards