Skip to main content

Hi,


a partner is sharing files with us through a box.com account. The files are stored in a folder tree. There are over 25K files stored. We receive an xls file containing only filenames. We need to download files based on a filename, but I do not know in which folder each file resides. I have made some attempts in python to search and download files, but it looks like my approach is incorrect or I am hitting API rate limits. I have noticed that if I have the file ID I can easily download the files, but I only have the filenames. What would be the best approach for me to create an automated solution using python and the box.com API? Would it be easy for the partner to give me the file_ids? Is there an easy way to retrieve those from all files that are uploaded? Or is there a fast way to search based on filename? I would love to hear what your approach would be.

Hi @edztra



The most efficient option is for the partner to include the file_id or the shared link in the xls file.


You can directly download using a file_id:


For a shared link, you first convert it to a file id and then download it.



Another option is to use search. You can search by file name and only in the file name


There are some disadvantages here: the search index may not be up to date (although in your use case that might be unlikely), and the search is always fuzzy, meaning it may return results that are not an exact match — make sure you check that the file name is exactly the same as what you’re looking for.



Check out the search workshop:



The last option is to crawl the folders and look for the specified file.


This is the most inefficient, will take a long time, and will consume a lot of API calls.



Let us know if this helps, and if you want to discuss a particular option.



Cheers


thanks Rui!


The partner is only providing the name (without an extension). I created the script below, which works ok so far. I don’t think my Token Refresh will work, but I will check that later. And I don’t think my searches will take longer than an hour. I was wondering how many simultaneous downloads I can trigger — I am now running 10 threads. Is that within the rate limits? Thanks for your help.



My script



#!/usr/bin/env python3



import os

import pandas as pd

from pandas import isna

from boxsdk import OAuth2, Client

import dotenv

from datetime import datetime

import logging

import traceback

import threading

import sys

from threading import Semaphore

from tqdm import tqdm



# Global lock guarding concurrent updates to the shared DataFrame.
df_lock = threading.Lock()  # Lock for thread safety when updating DataFrame

# Root logger: DEBUG overall; INFO on both console and a per-day log file.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

current_date = datetime.now().strftime("%d_%m_%Y")
log_filename = f"log_{current_date}.log"

# Console handler first, then file handler — both at INFO with the same format.
for _handler in (logging.StreamHandler(), logging.FileHandler(log_filename)):
    _handler.setLevel(logging.INFO)
    _handler.setFormatter(_formatter)
    logger.addHandler(_handler)

# Quiet the chatty Box SDK internals below WARNING.
logging.getLogger('boxsdk').setLevel(logging.WARNING)

download_count = 0  # Running total of files downloaded this session
download_count_lock = threading.Lock()  # Guards download_count across threads



def authenticate_box_client():
    """Build an authenticated Box client from OAuth2 credentials in the environment.

    Reads BOX_CLIENT_ID, BOX_CLIENT_SECRET, BOX_ACCESS_TOKEN and
    BOX_REFRESH_TOKEN. Refreshed token pairs are persisted by the SDK via
    the store_tokens() callback.

    Returns:
        Client: an authenticated boxsdk Client.

    Raises:
        Exception: re-raises whatever the SDK raised, after logging it.
    """
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')
    refresh_token = os.getenv('BOX_REFRESH_TOKEN')  # Load the refresh token

    try:
        auth = OAuth2(
            client_id=client_id,
            client_secret=client_secret,
            access_token=access_token,
            refresh_token=refresh_token,
            # Invoked by the SDK whenever it refreshes the token pair.
            store_tokens=store_tokens,
        )
        return Client(auth)
    except Exception as e:
        logging.error(f"Authentication failed: {e}")
        # Bare raise preserves the original traceback (raise e would re-anchor it).
        raise



def store_tokens(access_token, refresh_token):
    """Persist a refreshed Box token pair back into the project's .env file.

    Passed to OAuth2(store_tokens=...) so the SDK can save new tokens
    whenever it performs a refresh.

    Args:
        access_token: the new short-lived access token.
        refresh_token: the new refresh token.
    """
    dotenv_path = dotenv.find_dotenv()
    if not dotenv_path:
        # Without a .env file there is nowhere to persist the tokens; warn
        # so the operator knows the refreshed pair will be lost on restart.
        logging.warning("No .env file found.")
        return

    dotenv.set_key(dotenv_path, 'BOX_ACCESS_TOKEN', access_token)
    dotenv.set_key(dotenv_path, 'BOX_REFRESH_TOKEN', refresh_token)
    # Plain string: the original f-string had no placeholders.
    logging.debug("New access token and refresh token have been stored securely.")



def check_required_environment_vars():
    """Fail fast when any required configuration variable is missing.

    Raises:
        EnvironmentError: naming the first missing variable.
    """
    # NOTE(review): BOX_REFRESH_TOKEN is read elsewhere but not required
    # here — confirm whether it should be added to this list.
    required_vars = ['BOX_CLIENT_ID', 'BOX_CLIENT_SECRET', 'BOX_ACCESS_TOKEN',
                     'FOLDER_ID', 'EXCEL_FILE_PATH']
    for var in required_vars:
        if os.getenv(var) is None:
            logging.critical(f"Missing '{var}' in environment.")
            raise EnvironmentError(f"Missing '{var}' in environment.")



def search_file(client, filename, folder_id):
    """Search Box for an exact (case-insensitive) file-name match.

    Box search is fuzzy, so results are post-filtered for an exact name
    match. Only the first couple of results are inspected to avoid paging
    through extra API calls.

    Args:
        client: authenticated boxsdk Client.
        filename: full file name to match, including extension.
        folder_id: restrict the search to this ancestor folder.

    Returns:
        list: at most one matching Box item; empty when no exact match is
        found among the inspected results.
    """
    search_results = client.search().query(
        query=filename,
        limit=10,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf", "jpg", "png"],
        type="file",
        content_types="name",
    )
    found_files = []
    # An exact-name hit should rank near the top; stop after two results
    # so a miss does not burn API calls paging the fuzzy tail.
    for counter, item in enumerate(search_results, start=1):
        if item.name.lower() == filename.lower():
            found_files.append(item)
            break
        if counter >= 2:
            break
    return found_files



def update_dataframe(df, index, status, excel_file_path=None, acquire_lock=True):
    """Set the 'Status' cell for one row, optionally persisting to Excel.

    Args:
        df: the shared pandas DataFrame.
        index: row label to update.
        status: new value for the 'Status' column.
        excel_file_path: when given, the whole DataFrame is rewritten to
            this file after the update.
        acquire_lock: pass False when the caller already holds df_lock.

    Exceptions are logged and swallowed so a failed save never kills a
    worker thread.
    """
    try:
        if acquire_lock:
            df_lock.acquire()
        try:
            df.at[index, 'Status'] = status
            if excel_file_path:
                # Persist after every update so progress survives a crash.
                df.to_excel(excel_file_path, index=False)
        finally:
            # finally guarantees exactly one release per acquire, even when
            # the update or the save raises.
            if acquire_lock:
                df_lock.release()
    except Exception as e:
        logging.error(f"Error in update_dataframe(): {e}")
        print(f"Error in update_dataframe(): {e}")





def find_and_download_file(client, filename, folder_id, download_path):
    """Search for a file by exact name and download the first match.

    Args:
        client: authenticated boxsdk Client.
        filename: file name to look up.
        folder_id: ancestor folder to search under.
        download_path: local directory to save the file into.

    Returns:
        str: "Downloaded", "Not Found", or "Download Error".
    """
    found_files = search_file(client, filename, folder_id)
    if not found_files:
        logging.error(f"File {filename} not found.")
        return "Not Found"
    # search_file() returns at most one exact match.
    file_to_download = found_files[0]
    download_successful = download_file(client, file_to_download, download_path)
    if download_successful:
        return "Downloaded"
    else:
        return "Download Error"



def download_file(client, file_to_download, download_path):
    """Download one Box file into download_path and bump the global counter.

    Args:
        client: authenticated boxsdk Client (unused directly; kept so all
            download helpers share a uniform signature).
        file_to_download: Box file item exposing .name and .download_to().
        download_path: local directory the file is written into.

    Returns:
        bool: True on success, False on any failure (already logged).
    """
    global download_count  # rebound below, so the declaration is required

    try:
        logging.info(f"Found file {file_to_download.name}. Downloading ...")
        item_download_path = os.path.join(download_path, file_to_download.name)
        # with-block guarantees the file handle is closed even if the
        # download raises mid-stream.
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)
        logging.info(f"Download completed for {file_to_download.name}.")
        # download_count_lock is only read (never rebound), so no global
        # statement is needed for it.
        with download_count_lock:
            download_count += 1
        return True
    except Exception as e:
        logging.error(f"Failed to download file: {e}")
        return False





def process_single_file(client, filename, folder_id, download_path, df, index, semaphore, pbar, pbar_lock):
    """Worker body for one file: search, download, record status, tick the bar.

    Concurrency is capped by the shared semaphore; DataFrame and progress
    bar updates are serialised with their respective locks.

    Args:
        client: authenticated boxsdk Client.
        filename: file name to process.
        folder_id: ancestor folder to search under.
        download_path: local directory for downloads.
        df: shared results DataFrame.
        index: row label in df for this file.
        semaphore: limits the number of concurrently active workers.
        pbar: shared tqdm progress bar.
        pbar_lock: serialises pbar updates.
    """
    with semaphore:
        try:
            logging.info(f"Thread for {filename} started.")
            downloaded = find_and_download_file(client, filename, folder_id, download_path)
            # Normalise anything unexpected to "Download Error".
            status = downloaded if downloaded in ("Not Found", "Downloaded") else "Download Error"
            with df_lock:
                update_dataframe(df, index, status,
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)

            with pbar_lock:
                pbar.update(1)

        except Exception as e:
            logging.error(f"Thread for {filename} encountered an error: {e}")
            with df_lock:
                update_dataframe(df, index, "Error",
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)





from tqdm import tqdm



def main():
    """Drive the whole run: load config, spawn one worker thread per file,
    wait for completion, and persist the status DataFrame back to Excel.

    Only rows whose 'Status' is NaN or 'Not Found' are (re)processed, so
    the script can be re-run to resume an interrupted session.
    """
    # Set before the try-block so the summary logging at the end can never
    # NameError, even when an exception fires early.
    start_time = datetime.now()
    try:
        dotenv.load_dotenv()
        global download_count  # Use the global download_count variable
        download_count = 0  # Reset the counter before starting
        check_required_environment_vars()
        client = authenticate_box_client()
        folder_id = os.getenv('FOLDER_ID')
        excel_file_path = os.getenv('EXCEL_FILE_PATH')

        df = pd.read_excel(excel_file_path)
        # Filter to rows where 'Status' is NaN or 'Not Found' (resume support).
        filtered_df = df[(isna(df['Status'])) | (df['Status'] == 'Not Found')]

        current_time = datetime.now().strftime("%Y-%m-%d")
        download_path = f"Download_folder_{current_time}"
        os.makedirs(download_path, exist_ok=True)

        max_threads = 10  # Maximum number of simultaneous worker threads
        semaphore = Semaphore(max_threads)

        # Initialize tqdm progress bar
        pbar = tqdm(total=len(filtered_df), desc="Processing files", dynamic_ncols=True)
        pbar_lock = threading.Lock()

        threads = []
        for index, row in filtered_df.iterrows():
            filename = row['Filename']
            t = threading.Thread(target=process_single_file,
                                 args=(client, filename, folder_id, download_path,
                                       df, index, semaphore, pbar, pbar_lock))
            t.daemon = True
            # Remember the filename for the timeout message below —
            # threading.Thread has no such attribute by default, so reading
            # t.filename without this assignment would AttributeError.
            t.filename = filename
            threads.append(t)
            t.start()

        for t in threads:
            t.join(timeout=120)
            if t.is_alive():
                logging.error(f"Thread for {t.filename} did not complete in time.")

        # Close the progress bar
        pbar.close()

        # Final save plus a read-back sanity check.
        try:
            df.to_excel(excel_file_path, index=False)
            df_saved = pd.read_excel(excel_file_path)
            if not df.equals(df_saved):
                logging.warning("DataFrame might not have saved correctly.")
            else:
                logging.info("DataFrame saved correctly.")
        except Exception as e:
            logging.error(f"Could not write to Excel file: {e}")

    except FileNotFoundError as e:
        logging.error(f"A file was not found: {e}")
        traceback.print_exc()
    except KeyboardInterrupt:
        logging.info("Exiting due to manual interruption.")
        sys.exit(0)
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
        # Box SDK exceptions carry extra diagnostic context worth logging.
        if hasattr(e, 'context_info'):
            logging.critical(f"Context Info: {e.context_info}")
        traceback.print_exc()

    end_time = datetime.now()
    elapsed_time = end_time - start_time
    logging.info(f"Total time elapsed: {elapsed_time}")
    logging.info(f"Total number of files downloaded: {download_count}")



if __name__ == '__main__':
    # Script entry point: run the batch job, then exit with success status.
    main()
    sys.exit(0)


Reply