Hi,
a partner is sharing files with us through a box.com account. The files are stored in a folder tree. There are over 25K files stored. We receive an xls file containing only filenames. We need to download files based on a filename, but I do not know in which folder each file resides. I have made some attempts in Python to search and download files, but it looks like my approach is incorrect or I am hitting API rate limits. I have noticed that if I have the file ID I can easily download the files, but I only have the filenames. What would be the best approach for me to create an automated solution using Python and the box.com API? Would it be easy for the partner to give me the file_ids? Is there an easy way to retrieve those for all files that are uploaded? Or is there a fast way to search based on filename? I would love to hear what your approach would be.
Hi @edztra
The most efficient option is for the partner to include the file_id or the shared link in the xls file.
You can directly download using a file_id:
For a shared link, you first convert it to a file id and then download it.
Another option is to use search. You can search by file name and restrict the match to the file name only.
There are some disadvantages here: the search index may not be up to date (although in your use case that is probably unlikely), and the search is always fuzzy, meaning it may return results that are not an exact match — make sure you check that the file name is exactly the same as what you're looking for.
Check out the search workshop:
The last option is to crawl the folders and look for the specified file.
This is the most inefficient, will take a long time, and will consume a lot of API calls.
Let us know if this helps, and if you want to discuss a particular option.
Cheers
thanks Rui!
The partner is only providing the name (without an extension). I created the script below, which works OK so far. I don't think my token refresh will work, but I will check that later. And I don't think my searches will take longer than an hour. I was wondering how many simultaneous downloads I can trigger; I am now running 10 threads. Is that within the rate limits? Thanks for your help.
My script
#!/usr/bin/env python3
import os
import pandas as pd
from pandas import isna
from boxsdk import OAuth2, Client
import dotenv
from datetime import datetime
import logging
import traceback
import threading
import sys
from threading import Semaphore
from tqdm import tqdm
# Global lock for DataFrame and logging
df_lock = threading.Lock() # Lock for thread safety when updating DataFrame
# Configure logging: root logger accepts everything (DEBUG); handlers filter to INFO.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
# Console handler: INFO and above.
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
logger.addHandler(ch)
# File handler: one log file per day (dd_mm_yyyy), INFO and above.
current_date = datetime.now().strftime("%d_%m_%Y")
log_filename = f"log_{current_date}.log"
fh = logging.FileHandler(log_filename)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
logger.addHandler(fh)
# Silence the chatty Box SDK logger below WARNING.
logging.getLogger('boxsdk').setLevel(logging.WARNING)
download_count = 0 # A counter for downloaded files, shared across worker threads
download_count_lock = threading.Lock() # Lock for thread safety when updating the counter
def authenticate_box_client():
    """Build an authenticated Box ``Client`` from environment variables.

    Reads BOX_CLIENT_ID, BOX_CLIENT_SECRET, BOX_ACCESS_TOKEN and
    BOX_REFRESH_TOKEN from the environment and wires ``store_tokens``
    in as the token-persistence callback so refreshed tokens survive
    across runs.

    Returns:
        boxsdk.Client: an authenticated client.

    Raises:
        Exception: re-raised (after logging) if OAuth2 setup fails.
    """
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')
    refresh_token = os.getenv('BOX_REFRESH_TOKEN')  # needed for automatic token refresh
    try:
        auth = OAuth2(
            client_id=client_id,
            client_secret=client_secret,
            access_token=access_token,
            refresh_token=refresh_token,
            store_tokens=store_tokens,  # persists refreshed tokens back to .env
        )
        return Client(auth)
    except Exception as e:
        logging.error(f"Authentication failed: {e}")
        raise  # bare raise keeps the original traceback
def store_tokens(access_token, refresh_token):
    """Persist refreshed Box tokens back into the project's .env file.

    Used as the ``store_tokens`` callback of ``boxsdk.OAuth2`` so a
    token refresh survives process restarts. If no .env file exists the
    tokens are NOT saved and only a warning is logged.
    """
    dotenv_path = dotenv.find_dotenv()
    if not dotenv_path:
        logging.warning("No .env file found.")
        return
    dotenv.set_key(dotenv_path, 'BOX_ACCESS_TOKEN', access_token)
    dotenv.set_key(dotenv_path, 'BOX_REFRESH_TOKEN', refresh_token)
    # No placeholders here, so a plain string (the original used a needless f-string).
    logging.debug("New access token and refresh token have been stored securely.")
def check_required_environment_vars():
    """Fail fast if any required environment variable is missing.

    Raises:
        EnvironmentError: naming the first missing variable.
    """
    required_vars = ['BOX_CLIENT_ID', 'BOX_CLIENT_SECRET', 'BOX_ACCESS_TOKEN',
                     'FOLDER_ID', 'EXCEL_FILE_PATH']
    # NOTE(review): BOX_REFRESH_TOKEN is read by authenticate_box_client but not
    # validated here — confirm whether it should be required too.
    for var in required_vars:
        if os.getenv(var) is None:
            logging.critical(f"Missing '{var}' in environment.")
            raise EnvironmentError(f"Missing '{var}' in environment.")
def search_file(client, filename, folder_id):
    """Search Box for an exact (case-insensitive) file-name match.

    Uses the Box search API restricted to file names under the given
    ancestor folder. Box search is fuzzy, so results are filtered for an
    exact name match, and only the first two hits are inspected to keep
    the number of API calls down.

    Args:
        client: authenticated ``boxsdk.Client``.
        filename: exact file name to look for.
        folder_id: Box folder id the search is restricted to.

    Returns:
        list: at most one matching search-result item (empty if no exact match).
    """
    search_results = client.search().query(
        query=filename,
        limit=10,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf", "jpg", "png"],
        type="file",
        content_types=["name"],  # match against file names only, not content
    )
    found_files = []
    # Accept only an exact (case-insensitive) name match; stop after two results.
    for counter, item in enumerate(search_results, start=1):
        if item.name.lower() == filename.lower():
            found_files.append(item)
            break
        if counter >= 2:
            break
    return found_files
def update_dataframe(df, index, status, excel_file_path=None, acquire_lock=True):
    """Thread-safely set ``df.at[index, 'Status']`` and optionally save to Excel.

    Args:
        df: shared pandas DataFrame.
        index: row label to update.
        status: new value for the 'Status' column.
        excel_file_path: if given, the whole DataFrame is re-written to this
            Excel file after the update (checkpoint for resumable runs).
        acquire_lock: pass False when the caller already holds ``df_lock``.

    Errors are logged and swallowed so a failed save never kills a worker thread.
    """
    try:
        if acquire_lock:
            df_lock.acquire()
        try:
            df.at[index, 'Status'] = status
            if excel_file_path:
                df.to_excel(excel_file_path, index=False)
        finally:
            # try/finally guarantees exactly one release per acquire
            # (the original released in two places and could mispair them).
            if acquire_lock:
                df_lock.release()
    except Exception as e:
        logging.error(f"Error in update_dataframe(): {e}")
        print(f"Error in update_dataframe(): {e}")
def find_and_download_file(client, filename, folder_id, download_path):
    """Search for *filename* under *folder_id* and download the first hit.

    Returns:
        str: one of the status strings written back into the Excel sheet:
        "Downloaded", "Not Found" or "Download Error".
    """
    found_files = search_file(client, filename, folder_id)
    if not found_files:
        logging.error(f"File {filename} not found.")
        return "Not Found"
    file_to_download = found_files[0]
    if download_file(client, file_to_download, download_path):
        return "Downloaded"
    return "Download Error"
def download_file(client, file_to_download, download_path):
    """Download a single Box file into *download_path*.

    Increments the shared ``download_count`` (under its lock) on success.

    Args:
        client: authenticated ``boxsdk.Client`` (unused here; kept for a
            uniform call signature across helpers).
        file_to_download: Box file item exposing ``.name`` and ``.download_to()``.
        download_path: local directory to write the file into.

    Returns:
        bool: True on success, False if the download failed.
    """
    global download_count  # assigned below, so the global statement is required
    try:
        logging.info(f"Found file {file_to_download.name}. Downloading ...")
        item_download_path = os.path.join(download_path, file_to_download.name)
        with open(item_download_path, 'wb') as f:
            file_to_download.download_to(f)
        logging.info(f"Download completed for {file_to_download.name}.")
        with download_count_lock:  # guard the shared counter across threads
            download_count += 1
        return True
    except Exception as e:
        logging.error(f"Failed to download file: {e}")
        return False
def process_single_file(client, filename, folder_id, download_path, df, index, semaphore, pbar, pbar_lock):
    """Worker-thread body: search/download one file and record the result.

    The semaphore caps how many workers do real work concurrently; the
    DataFrame row *index* receives the resulting status string and the
    progress bar is advanced exactly once per file.
    """
    with semaphore:
        try:
            logging.info(f"Thread for {filename} started.")
            result = find_and_download_file(client, filename, folder_id, download_path)
            status = result if result in ("Not Found", "Downloaded") else "Download Error"
            with df_lock:
                update_dataframe(df, index, status,
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)
        except Exception as e:
            logging.error(f"Thread for {filename} encountered an error: {e}")
            with df_lock:
                update_dataframe(df, index, "Error",
                                 excel_file_path=os.getenv('EXCEL_FILE_PATH'),
                                 acquire_lock=False)
        finally:
            # Advance the bar even when the file errored — the original only
            # updated it on success, so one failure stalled the bar forever.
            with pbar_lock:
                pbar.update(1)
from tqdm import tqdm
def main():
    """Drive the bulk download: read the Excel sheet, spawn worker threads,
    and write the per-file statuses back to the same Excel file."""
    # Hoisted out of the try so the timing epilogue below can never hit a
    # NameError when an early exception is raised.
    start_time = datetime.now()
    try:
        dotenv.load_dotenv()
        global download_count
        download_count = 0  # reset the shared counter before a new run
        check_required_environment_vars()
        client = authenticate_box_client()
        folder_id = os.getenv('FOLDER_ID')
        excel_file_path = os.getenv('EXCEL_FILE_PATH')
        df = pd.read_excel(excel_file_path)
        # Only (re)process rows with no status yet or that were not found before,
        # so interrupted runs can resume where they left off.
        filtered_df = df[(isna(df['Status'])) | (df['Status'] == 'Not Found')]
        current_time = datetime.now().strftime("%Y-%m-%d")
        download_path = f"Download_folder_{current_time}"
        os.makedirs(download_path, exist_ok=True)
        max_threads = 10  # cap on simultaneous downloads (mind Box API rate limits)
        semaphore = Semaphore(max_threads)
        pbar = tqdm(total=len(filtered_df), desc="Processing files", dynamic_ncols=True)
        pbar_lock = threading.Lock()
        threads = []
        for index, row in filtered_df.iterrows():
            filename = row['Filename']
            t = threading.Thread(target=process_single_file,
                                 args=(client, filename, folder_id, download_path,
                                       df, index, semaphore, pbar, pbar_lock))
            t.daemon = True
            # Record the filename alongside the thread: Thread objects have no
            # .filename attribute (the original crashed on t.filename below).
            threads.append((t, filename))
            t.start()
        for t, name in threads:
            t.join(timeout=120)
            if t.is_alive():
                logging.error(f"Thread for {name} did not complete in time.")
        pbar.close()
        # Final save, then read it back to verify the write round-trips.
        try:
            df.to_excel(excel_file_path, index=False)
            df_saved = pd.read_excel(excel_file_path)
            if not df.equals(df_saved):
                logging.warning("DataFrame might not have saved correctly.")
            else:
                logging.info("DataFrame saved correctly.")
        except Exception as e:
            logging.error(f"Could not write to Excel file: {e}")
    except FileNotFoundError as e:
        logging.error(f"A file was not found: {e}")
        traceback.print_exc()
    except KeyboardInterrupt:
        # KeyboardInterrupt derives from BaseException, so the handler below
        # would not catch it; exit cleanly on Ctrl-C.
        logging.info("Exiting due to manual interruption.")
        sys.exit(0)
    except Exception as e:
        logging.critical(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):  # boxsdk exceptions carry extra context
            logging.critical(f"Context Info: {e.context_info}")
        traceback.print_exc()
    elapsed_time = datetime.now() - start_time
    logging.info(f"Total time elapsed: {elapsed_time}")
    logging.info(f"Total number of files downloaded: {download_count}")
if __name__ == '__main__':
    main()
    sys.exit(0)
Reply
Enter your E-mail address. We'll send you an e-mail with instructions to reset your password.