Hi,
a partner is sharing files with us through a box.com account. The files are stored in a folder tree. There are over 25K files stored. We receive an xls file containing only filenames. We need to download files based on a filename, but I do not know in which folder each file resides. I have made some attempts in Python to search and download files, but it looks like my approach is incorrect or I am hitting API rate limits. I have noticed that if I have the file ID I can easily download the files, but I only have the filenames. What would be the best approach for me to create an automated solution using Python and the box.com API? Would it be easy for the partner to give me the file_ids? Is there an easy way to retrieve those for all files that are uploaded? Or is there a fast way to search based on filename? I would love to hear what your approach would be.
Hi @edztra
The most efficient option is for the partner to include the file_id or the shared link in the xls file.
You can directly download using a file_id:
For a shared link, you first convert it to a file id and then download it.
Another option is to use search. You can search by file name and restrict the match to the file name only.
There are some disadvantages here: the search index may not be up to date (although in your use case that is probably unlikely), and the search is always fuzzy, meaning it may return results that are not an exact match — make sure you check that the file name is exactly the same as what you're looking for.
Check out the search workshop:
The last option is to crawl the folders and look for the specified file.
This is the most inefficient, will take a long time, and will consume a lot of API calls.
Let us know if this helps, and if you want to discuss a particular option.
Cheers
thanks Rui!
The partner is only providing the name (without an extension). I created the script below, which works OK so far. I don't think my token refresh will work, but I will check that later. And I don't think my searches will take longer than an hour. I was wondering how many simultaneous downloads I can trigger; I am now running 10 threads. Is that within the rate limits? Thanks for your help.
My script
#!/usr/bin/env python3
import os
import pandas as pd
from pandas import isna
from boxsdk import OAuth2, Client
import dotenv
from datetime import datetime
import logging
import traceback
import threading
import sys
from threading import Semaphore
from tqdm import tqdm
# Global lock for DataFrame and logging
df_lock = threading.Lock() # Lock for thread safety when updating DataFrame
# Configure logging: root logger accepts everything (DEBUG); handlers filter to INFO.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# Console handler: INFO and above.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
# File handler: one log file per day (dd_mm_yyyy), INFO and above.
current_date = datetime.now().strftime("%d_%m_%Y")
log_filename = f"log_{current_date}.log"
fh = logging.FileHandler(log_filename)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
# Silence the chatty Box SDK logger below WARNING.
logging.getLogger('boxsdk').setLevel(logging.WARNING)
download_count = 0 # A counter for downloaded files, shared across worker threads
download_count_lock = threading.Lock() # Lock for thread safety when updating the counter
def authenticate_box_client():
    """Build an authenticated Box ``Client`` from environment variables.

    Reads BOX_CLIENT_ID, BOX_CLIENT_SECRET, BOX_ACCESS_TOKEN and
    BOX_REFRESH_TOKEN from the environment and wires ``store_tokens``
    in as the token-persistence callback so refreshed tokens survive
    across runs.

    Returns:
        boxsdk.Client: an authenticated client.

    Raises:
        Exception: re-raised (after logging) if OAuth2 setup fails.
    """
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')
    refresh_token = os.getenv('BOX_REFRESH_TOKEN')  # needed for automatic token refresh
    try:
        auth = OAuth2(
            client_id=client_id,
            client_secret=client_secret,
            access_token=access_token,
            refresh_token=refresh_token,
            store_tokens=store_tokens,  # persists refreshed tokens back to .env
        )
        return Client(auth)
    except Exception as e:
        logging.error(f"Authentication failed: {e}")
        raise  # bare raise keeps the original traceback
def store_tokens(access_token, refresh_token):
    """Persist refreshed Box tokens back into the project's .env file.

    Used as the ``store_tokens`` callback of ``boxsdk.OAuth2`` so a
    token refresh survives process restarts. If no .env file exists the
    tokens are NOT saved and only a warning is logged.
    """
    dotenv_path = dotenv.find_dotenv()
    if not dotenv_path:
        logging.warning("No .env file found.")
        return
    dotenv.set_key(dotenv_path, 'BOX_ACCESS_TOKEN', access_token)
    dotenv.set_key(dotenv_path, 'BOX_REFRESH_TOKEN', refresh_token)
    # No placeholders here, so a plain string (the original used a needless f-string).
    logging.debug("New access token and refresh token have been stored securely.")
def check_required_environment_vars():
    """Fail fast if any required environment variable is missing.

    Raises:
        EnvironmentError: naming the first missing variable.
    """
    required_vars = ['BOX_CLIENT_ID', 'BOX_CLIENT_SECRET', 'BOX_ACCESS_TOKEN',
                     'FOLDER_ID', 'EXCEL_FILE_PATH']
    # NOTE(review): BOX_REFRESH_TOKEN is read by authenticate_box_client but not
    # validated here — confirm whether it should be required too.
    for var in required_vars:
        if os.getenv(var) is None:
            logging.critical(f"Missing '{var}' in environment.")
            raise EnvironmentError(f"Missing '{var}' in environment.")
def search_file(client, filename, folder_id):
    """Search Box for an exact (case-insensitive) file-name match.

    Uses the Box search API restricted to file names under the given
    ancestor folder. Box search is fuzzy, so results are filtered for an
    exact name match, and only the first two hits are inspected to keep
    the number of API calls down.

    Args:
        client: authenticated ``boxsdk.Client``.
        filename: exact file name to look for.
        folder_id: Box folder id the search is restricted to.

    Returns:
        list: at most one matching search-result item (empty if no exact match).
    """
    search_results = client.search().query(
        query=filename,
        limit=10,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf", "jpg", "png"],
        type="file",
        content_types=["name"],  # match against file names only, not content
    )
    found_files = []
    # Accept only an exact (case-insensitive) name match; stop after two results.
    for counter, item in enumerate(search_results, start=1):
        if item.name.lower() == filename.lower():
            found_files.append(item)
            break
        if counter >= 2:
            break
    return found_files
def update_dataframe(df, index, status, excel_file_path=None, acquire_lock=True):
    """Thread-safely set ``df.at[index, 'Status']`` and optionally save to Excel.

    Args:
        df: shared pandas DataFrame.
        index: row label to update.
        status: new value for the 'Status' column.
        excel_file_path: if given, the whole DataFrame is re-written to this
            Excel file after the update (checkpoint for resumable runs).
        acquire_lock: pass False when the caller already holds ``df_lock``.

    Errors are logged and swallowed so a failed save never kills a worker thread.
    """
    try:
        if acquire_lock:
            df_lock.acquire()
        try:
            df.at[index, 'Status'] = status
            if excel_file_path:
                df.to_excel(excel_file_path, index=False)
        finally:
            # try/finally guarantees exactly one release per acquire
            # (the original released in two places and could mispair them).
            if acquire_lock:
                df_lock.release()
    except Exception as e:
        logging.error(f"Error in update_dataframe(): {e}")
        print(f"Error in update_dataframe(): {e}")
def find_and_download_file(client, filename, folder_id, download_path):
    """Search for *filename* under *folder_id* and download the first hit.

    Returns:
        str: one of the status strings written back into the Excel sheet:
        "Downloaded", "Not Found" or "Download Error".
    """
    found_files = search_file(client, filename, folder_id)
    if not found_files:
        logging.error(f"File {filename} not found.")
        return "Not Found"
    file_to_download = found_files[0]
    if download_file(client, file_to_download, download_path):
        return "Downloaded"
    return "Download Error"
def download_file(client, file_to_download, download_path):
    """Download a single Box file into *download_path*.

    Increments the shared ``download_count`` (under its lock) on success.

    Args:
        client: authenticated ``boxsdk.Client`` (unused here; kept for a
            uniform call signature across helpers).
        file_to_download: Box file item exposing ``.name`` and ``.download_to()``.
        download_path: local directory to write the file into.

    Returns:
        bool: True on success, False if the download failed.
    """
    global download_count  # assigned below, so the global statement is required
    try:
        logging.info(f"Found file {file_to_download.name}. Downloading ...")
        item_download_path = os.path.join(download_path, file_to_download.name)
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)
        logging.info(f"Download completed for {file_to_download.name}.")
        with download_count_lock:  # guard the shared counter across threads
            download_count += 1
        return True
    except Exception as e:
        logging.error(f"Failed to download file: {e}")
        return False
def process_single_file(client, filename, folder_id, download_path, df, index, semaphore, pbar, pbar_lock):
    """Worker-thread body: search/download one file and record the result.

    The semaphore caps how many workers do real work concurrently; the
    DataFrame row *index* receives the resulting status string and the
    progress bar is advanced exactly once per file.
    """
    with semaphore:
        try:
            logging.info(f"Thread for {filename} started.")
            result = find_and_download_file(client, filename, folder_id, download_path)
            status = result if result in ("Not Found", "Downloaded") else "Download Error"
            with df_lock:
                update_dataframe(df, index, status,
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)
        except Exception as e:
            logging.error(f"Thread for {filename} encountered an error: {e}")
            with df_lock:
                update_dataframe(df, index, "Error",
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)
        finally:
            # Advance the bar even when the file errored — the original only
            # updated it on success, so one failure stalled the bar forever.
            with pbar_lock:
                pbar.update(1)
from tqdm import tqdm
def main():
    """Drive the bulk download: read the Excel sheet, spawn worker threads,
    and write the per-file statuses back to the same Excel file."""
    # Hoisted out of the try so the timing epilogue below can never hit a
    # NameError when an early exception is raised.
    start_time = datetime.now()
    try:
        dotenv.load_dotenv()
        global download_count
        download_count = 0  # reset the shared counter before a new run
        check_required_environment_vars()
        client = authenticate_box_client()
        folder_id = os.getenv('FOLDER_ID')
        excel_file_path = os.getenv('EXCEL_FILE_PATH')
        df = pd.read_excel(excel_file_path)
        # Only (re)process rows with no status yet or that were not found before,
        # so interrupted runs can resume where they left off.
        filtered_df = df[(isna(df['Status'])) | (df['Status'] == 'Not Found')]
        current_time = datetime.now().strftime("%Y-%m-%d")
        download_path = f"Download_folder_{current_time}"
        os.makedirs(download_path, exist_ok=True)
        max_threads = 10  # cap on simultaneous downloads (mind Box API rate limits)
        semaphore = Semaphore(max_threads)
        pbar = tqdm(total=len(filtered_df), desc="Processing files", dynamic_ncols=True)
        pbar_lock = threading.Lock()
        threads = []
        for index, row in filtered_df.iterrows():
            filename = row['Filename']
            t = threading.Thread(target=process_single_file,
                                 args=(client, filename, folder_id, download_path,
                                       df, index, semaphore, pbar, pbar_lock))
            t.daemon = True
            # Record the filename alongside the thread: Thread objects have no
            # .filename attribute (the original crashed on t.filename below).
            threads.append((t, filename))
            t.start()
        for t, name in threads:
            t.join(timeout=120)
            if t.is_alive():
                logging.error(f"Thread for {name} did not complete in time.")
        pbar.close()
        # Final save, then read it back to verify the write round-trips.
        try:
            df.to_excel(excel_file_path, index=False)
            df_saved = pd.read_excel(excel_file_path)
            if not df.equals(df_saved):
                logging.warning("DataFrame might not have saved correctly.")
            else:
                logging.info("DataFrame saved correctly.")
        except Exception as e:
            logging.error(f"Could not write to Excel file: {e}")
    except FileNotFoundError as e:
        logging.error(f"A file was not found: {e}")
        traceback.print_exc()
    except KeyboardInterrupt:
        # KeyboardInterrupt derives from BaseException, so the handler below
        # would not catch it; exit cleanly on Ctrl-C.
        logging.info("Exiting due to manual interruption.")
        sys.exit(0)
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):  # boxsdk exceptions carry extra context
            logging.critical(f"Context Info: {e.context_info}")
        traceback.print_exc()
    elapsed_time = datetime.now() - start_time
    logging.info(f"Total time elapsed: {elapsed_time}")
    logging.info(f"Total number of files downloaded: {download_count}")
if __name__ == '__main__':
    main()
    sys.exit(0)
Reply
Enter your E-mail address. We'll send you an e-mail with instructions to reset your password.