
Search for files starting in a specific folder and searching all subfolders using Python


  • New Participant
  • 3 replies

I am using a Python script to search for files in a specific folder; the folder contains over 25K files.

I have an Excel file with the filenames that the script needs to search for.

Part of the script looks like this:


# Start searching and downloading
for file_name in file_names_to_download:
    print(f"Searching for {file_name} ...")
    search_results = client.search().query(query=f"{file_name}", limit=10, ancestor_folder_ids=[folder_id])
    exact_matches = [item for item in search_results if item.name == f"{file_name}"]

    if not exact_matches:
        print(f"No exact match found for {file_name}.")
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Not Found'
        continue

    if len(exact_matches) > 1:
        print(f"More than one exact match found for {file_name}. Using the first match.")

    item_to_download = exact_matches[0]
    print(f"Found {file_name}. Downloading ...")

    found_files.append(item_to_download.name)
    item_download_path = os.path.join(download_path, item_to_download.name)

    try:
        with open(item_download_path, 'wb') as f:
            item_to_download.download_to(f)
        print(f"Download completed for {item_to_download.name}.")
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Downloaded'
    except Exception as e:
        print(f"Failed to download {item_to_download.name}: {e}")
        failed_downloads.append(item_to_download.name)
        df.loc[df.iloc[:, 0] == file_name, 'Status'] = 'Failed'


I have noticed that it takes a very long time for the API to return a result. Is there a way to do a faster search?

Is something wrong with my code?

3 replies

  • Author
  • New Participant
  • 3 replies
  • September 1, 2023

My current code seems to work OK, but if you see room for improvement, please let me know.


import os

import pandas as pd
from boxsdk import OAuth2, Client
import dotenv


def store_tokens(access_token, refresh_token):
    """Callback function to store new tokens."""
    print(f"New access token: {access_token}")
    print(f"New refresh token: {refresh_token}")


def authenticate_box_client():
    """Authenticate and return Box client."""
    dotenv.load_dotenv()
    client_id = os.getenv('BOX_CLIENT_ID')
    client_secret = os.getenv('BOX_CLIENT_SECRET')
    access_token = os.getenv('BOX_ACCESS_TOKEN')

    auth = OAuth2(
        client_id=client_id,
        client_secret=client_secret,
        access_token=access_token,
        store_tokens=store_tokens
    )
    return Client(auth)


def find_and_download_file(client, filename, folder_id, download_path, df, index):
    """Find and download file from Box."""
    search_results = client.search().query(
        query=filename,
        limit=200,
        ancestor_folder_ids=[folder_id],
        file_extensions=["pdf"],
        type="file"
    )
    found_files = []

    counter = 0  # Add a counter to track the number of items iterated

    for item in search_results:
        counter += 1  # Increment counter
        print(f" - {item.name}")
        if item.name == filename:
            found_files.append(item)
            print(f"Found file {item.name}.")
            df.at[index, 'Status'] = 'Downloaded'
            break
        if counter >= 2:  # Stop after inspecting the first 2 results
            break

    if not found_files:
        print(f"File {filename} not found. Updating DataFrame and moving to next file.")
        df.at[index, 'Status'] = 'Not Found'
        return

    file_to_download = found_files[0]
    download_file(file_to_download, download_path)


def download_file(file_to_download, download_path):
    """Download file from Box to local system."""
    print(f"Found file {file_to_download.name}. Downloading ...")
    item_download_path = os.path.join(download_path, file_to_download.name)
    with open(item_download_path, 'wb') as f:
        file_to_download.download_to(f)
    print(f"Download completed for {file_to_download.name}.")


def main():
    try:
        client = authenticate_box_client()
        df = pd.read_excel('filenames.xlsx')
        download_path = 'download_folder'
        os.makedirs(download_path, exist_ok=True)
        folder_id = 'myfolderID'

        for index, row in df.iterrows():
            filename = row['Filename']
            print(f"Searching for {filename} ...")
            find_and_download_file(client, filename, folder_id, download_path, df, index)

        df.to_excel('filenames.xlsx', index=False)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        if hasattr(e, 'context_info'):
            print("Context Info:", e.context_info)
        print("Debug Information:")
        print(f"Filename: {filename}")
        print(f"Folder ID: {folder_id}")


if __name__ == '__main__':
    main()


  • Author
  • New Participant
  • 3 replies
  • September 4, 2023

I have adapted my script and it looks like it is working now!


rbarbosa Box
  • Developer Advocate
  • 553 replies
  • September 5, 2023

Hi @edztra, I guess I was too late in answering your other question.


My only suggestion for your find_and_download_file method, on top of what you already have, is to limit the search to only the name of the file. You should get even fewer false positives.

Something like this:


from typing import Iterable


def simple_search(query: str, content_types: Iterable[str] = None) -> Iterable["Item"]:
    """Search by query in any Box content"""
    return client.search().query(query=query, content_types=content_types)


# Search only in name
search_results = simple_search(
    "ananas",
    content_types=[
        "name",
    ],
)
print_search_results(search_results)
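
(print_search_results is not shown above; a minimal version, assuming you just want to list each returned item, could be something like this:)

def print_search_results(items: Iterable["Item"]) -> None:
    """Minimal sketch: print the type, id, and name of each search result."""
    print("--- Search Results ---")
    for item in items:
        print(f"{item.type} {item.id} {item.name}")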


The search will look for matches in the name, description, tags, comments, and the first 10k bytes of the file. Limiting the search to look only in name should give you better results.
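
Applied to your find_and_download_file, that presumably just means adding the content_types parameter to the query call you already have, reusing the variables from your script:

# Same query as in your script, restricted to matches on the file name only
search_results = client.search().query(
    query=filename,
    limit=200,
    ancestor_folder_ids=[folder_id],
    file_extensions=["pdf"],
    type="file",
    content_types=["name"],
)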


Best regards

