Skip to main content

I am using a Python script to search for files in a specific folder. The folder contains over 25K files.


I have an Excel file with the filenames that the script needs to search for.


Part of the script looks like this:



# Start searching and downloading
# NOTE: this excerpt relies on names defined elsewhere in the script:
# client, folder_id, file_names_to_download, df, download_path,
# found_files and failed_downloads.
for file_name in file_names_to_download:
    print(f"Searching for {file_name} …")

    search_results = client.search().query(
        query=f"{file_name}",
        limit=10,
        ancestor_folder_ids=[folder_id],
    )
    # The search can return items that merely contain the query, so keep
    # only the items whose name is exactly the requested file name.
    exact_matches = [item for item in search_results if item.name == f"{file_name}"]

    if not exact_matches:
        print(f"No exact match found for {file_name}.")
        # Rows are matched on the first column, which holds the file names.
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Not Found'
        continue

    if len(exact_matches) > 1:
        print(f"More than one exact match found for {file_name}. Using the first match.")

    item_to_download = exact_matches[0]
    print(f"Found {file_name}. Downloading ...")

    found_files.append(item_to_download.name)
    item_download_path = os.path.join(download_path, item_to_download.name)

    try:
        with open(item_download_path, 'wb') as f:
            item_to_download.download_to(f)
        print(f"Download completed for {item_to_download.name}.")
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Downloaded'
    except Exception as e:
        # Record the failure and carry on with the next file name.
        print(f"Failed to download {item_to_download.name}: {e}")
        failed_downloads.append(item_to_download.name)
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Failed'



I have noticed it takes a very long time for the API to return a result. Is there a way to do a faster search?


Is something wrong with my code?

My current code seems to work ok… But if you see room for improvement, please let me know.



import os


import pandas as pd


from boxsdk import OAuth2, Client


import dotenv



def store_tokens(access_token, refresh_token):
    """Callback function to store new tokens.

    Passed to ``OAuth2`` as its ``store_tokens`` callback. It currently only
    prints both tokens; nothing is persisted.

    NOTE(review): printing tokens exposes secrets in console output and
    logs -- consider persisting them securely instead.

    Args:
        access_token: The new access token.
        refresh_token: The new refresh token.
    """
    print(f"New access token: {access_token}")
    print(f"New refresh token: {refresh_token}")



def authenticate_box_client():
    """Authenticate and return Box client.

    Loads environment variables via ``dotenv.load_dotenv()`` and reads
    ``BOX_CLIENT_ID``, ``BOX_CLIENT_SECRET`` and ``BOX_ACCESS_TOKEN`` from
    the environment.

    Returns:
        A ``Client`` built on an ``OAuth2`` object that uses ``store_tokens``
        as its token-storage callback.
    """
    dotenv.load_dotenv()
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')

    auth = OAuth2(
        client_id=client_id,
        client_secret=client_secret,
        access_token=access_token,
        store_tokens=store_tokens,
    )
    return Client(auth)



def find_and_download_file(client, filename, folder_id, download_path, df, index):
    """Find and download file from Box.

    Searches ``folder_id`` for PDF files matching ``filename``, downloads the
    first inspected result whose name equals ``filename`` exactly, and
    records the outcome in ``df.at[index, 'Status']``.

    Args:
        client: Box client used to run the search.
        filename: Exact name of the file to look for.
        folder_id: ID of the Box folder to search in.
        download_path: Local directory the file is downloaded into.
        df: DataFrame whose 'Status' column is updated in place.
        index: Row label in ``df`` that belongs to ``filename``.
    """
    search_results = client.search().query(
        query=filename,
        limit=200,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf"],
        type="file",
    )

    found_files = []
    counter = 0  # Number of search results inspected so far

    for item in search_results:
        counter += 1
        print(f" - {item.name}")
        if item.name == filename:
            found_files.append(item)
            print(f"Found file {item.name}.")
            break
        # Only the first 2 results are inspected; stop instead of iterating
        # through the remaining search results.
        if counter >= 2:
            break

    if not found_files:
        print(f"File {filename} not found. Updating DataFrame and moving to next file.")
        df.at[index, 'Status'] = 'Not Found'
        return

    file_to_download = found_files[0]
    download_file(file_to_download, download_path)
    # Mark the row only after the download has actually completed.
    df.at[index, 'Status'] = 'Downloaded'



def download_file(file_to_download, download_path):
    """Download file from Box to local system.

    The file is written to ``download_path`` under its own Box name
    (``file_to_download.name``).

    Args:
        file_to_download: Box item providing ``name`` and ``download_to``.
        download_path: Local directory to write the file into.
    """
    print(f"Found file {file_to_download.name}. Downloading …")
    item_download_path = os.path.join(download_path, file_to_download.name)
    with open(item_download_path, 'wb') as f:
        file_to_download.download_to(f)
    print(f"Download completed for {file_to_download.name}.")



def main():
    """Download every file listed in 'filenames.xlsx' from a Box folder.

    Reads the 'Filename' column of the spreadsheet, searches and downloads
    each file into 'download_folder', then writes the DataFrame (including
    the 'Status' values set along the way) back to 'filenames.xlsx'.
    Any exception is caught and reported together with debug information.
    """
    # Bound before the try block so the error handler below can always
    # print them, even if the failure happens before the loop starts.
    filename = None
    folder_id = 'myfolderID'

    try:
        client = authenticate_box_client()
        df = pd.read_excel('filenames.xlsx')
        download_path = 'download_folder'
        os.makedirs(download_path, exist_ok=True)

        for index, row in df.iterrows():
            filename = row['Filename']
            print(f"Searching for {filename} ...")
            find_and_download_file(client, filename, folder_id, download_path, df, index)

        df.to_excel('filenames.xlsx', index=False)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):
            print("Context Info:", e.context_info)
        print("Debug Information:")
        print(f"Filename: {filename}")
        print(f"Folder ID: {folder_id}")



# Run main() only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()


I have adapted my script and it looks like it is working now!


Hi @edztra , I guess I was too late in answering your other question.



My only suggestion for your find_and_download_file function, on top of what you already have, is to limit the search to only the name of the file. You should get even fewer false positives.


Something like this:



def simple_search(query: str, content_types: Iterable[str] = None) -> Iterable["Item"]:
    """Search by query in any Box content.

    Relies on a module-level ``client`` for the Box connection.

    Args:
        query: The text to search for.
        content_types: Optional item parts the search is restricted to,
            e.g. ``["name"]``; passed through unchanged to the query.

    Returns:
        The result of ``client.search().query(...)``.
    """
    return client.search().query(query=query, content_types=content_types)



# Restrict matching to the item name only
search_results = simple_search("ananas", content_types=["name"])
print_search_results(search_results)



By default, the search looks for matches in the name, description, tags, comments, and the first 10k bytes of the file. Limiting the search to look only in the name should give you better results.



Best regards


Reply