Skip to main content

A government agency publishes a public folder to share a data set. I have no relationship with the publisher, thus no credentials.


With the web browser interface, no authentication is required, works really well, can download every file.



The Python SDK is getting super difficult for the task of accessing a public box folder.


I can retrieve a list of the file and folder names.


When I try to get anything with the file object file(file_id=item.id).content … download_url(), download…



I get a 404 error, Not found.



The assumptions I’m making:





  1. I need a developer account


  2. I need to create a custom app


  3. Select an auth method (e.g. OAuth)


    3.5 Download the 3 secrets (client_id, client_secret, token) from custom app.


  4. Build a client object with the OAuth


    5 With either the URL or the folderID get an object to the root folder.


  5. Iterate over objects in the root_folder




Based on one post from Box engineer, I get the sense these developer accounts and credentials don’t give you “Permissions” on public folders. No info was given on how to proceed.



Is there a solution to automating downloads of public folders?


Thanks!

Code



# Classic Box SDK snippet from the question; `auth` and `download_folder`
# are defined outside of this excerpt.
client = Client(auth)
SHARED_URL = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

# Resolve the public shared link to its root folder (no password: '').
root_folder = client.get_shared_item(SHARED_URL, '')
print(root_folder.name)  # prints CXR8

items = root_folder.get_items()

for item in items:
    print('{0} {1} is named "{2}"'.format(item.type.capitalize(), item.id, item.name))
    # prints: File 939374043869 is named "AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf"
    if item.type == 'file':
        file_name = F"{download_folder}/{item.name}"
        print(item.id)  # prints: 939374043869
        # NOTE(review): presumably client.file(item.id) builds a fresh object
        # that no longer carries the shared-link context of `item`, so the
        # lookup happens in the caller's own Box account - confirm.
        download_url = client.file(item.id).get_download_url()  # fail 404 error


Hi @dmoore247 , welcome to the forum!



Yes this can be a little confusing…



I’m not sure which Python SDK you are using, but with the Next Gen one, the issue is that you need to send the original shared link to download the files.



I have prepared an example for you:



import os

import shutil

from io import BufferedIOBase

import dotenv

from box_sdk_gen import (

BoxCCGAuth,

CCGConfig,

BoxClient as Client,

BoxAPIError,

Items,

)

from box_sdk_gen import DownloadsManager



# Name of the dotenv file that holds the application credentials.
ENV_CCG = ".ccg.env"


class ConfigCCG:
    """Application settings for Client Credentials Grant (CCG) authentication.

    The dotenv file named by ``ENV_CCG`` is loaded first, then each setting is
    read from the process environment. A variable that is not set is stored as
    ``None``, except ``cache_file`` which falls back to ``".ccg.tk"``.
    """

    def __init__(self) -> None:
        dotenv.load_dotenv(ENV_CCG)

        # Common configurations
        self.client_id = os.getenv("CLIENT_ID")
        self.client_secret = os.getenv("CLIENT_SECRET")

        # CCG configurations
        self.enterprise_id = os.getenv("ENTERPRISE_ID")
        self.ccg_user_id = os.getenv("CCG_USER_ID")

        self.cache_file = os.getenv("CACHE_FILE", ".ccg.tk")





def main():
    """Download the contents of the public NIHCC shared folder into ./downloads.

    Authenticates with CCG, resolves the shared link to a folder, lists its
    items and hands them to ``download_items``. A ``BoxAPIError`` raised while
    resolving or downloading is printed rather than propagated.
    """
    conf = ConfigCCG()

    ccg_conf = CCGConfig(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    auth = BoxCCGAuth(ccg_conf)
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"
    # The shared link is sent with every request so it can act as the
    # security context for items that live outside the caller's own account.
    shared_link_header = "shared_link=" + web_link_url

    user = client.users.get_user_me()
    print(f"User: {user.id}:{user.name}")

    # Remember where we started so the working directory is always restored,
    # even if a request fails half-way through the folder walk.
    start_dir = os.getcwd()
    try:
        shared_folder = client.shared_links_folders.find_folder_for_shared_link(
            boxapi=shared_link_header
        )
        print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
        print("#" * 80)

        print("Type\tID\t\tName")
        # Create the target directory on first run instead of crashing.
        os.makedirs("downloads", exist_ok=True)
        os.chdir("downloads")
        items = client.folders.get_folder_items(
            shared_folder.id, boxapi=shared_link_header
        )
        download_items(client, items, web_link_url)
    except BoxAPIError as e:
        print(f"Error: {e}")
    finally:
        os.chdir(start_dir)





def download_items(client: Client, items: Items, web_link_url, target_dir="."):
    """Recursively download the entries of a shared Box folder.

    Folders are recreated as sub-directories of ``target_dir`` and walked
    recursively; files are streamed to disk. Files whose name ends in
    ``.tar.gz`` are skipped.

    Args:
        client: Authenticated Box client used for listing and downloading.
        items: Folder listing whose ``entries`` are processed.
        web_link_url: Shared link URL, sent with every request as the
            ``shared_link=`` header value.
        target_dir: Local directory to write into. Defaults to the current
            working directory.

    NOTE(review): only the entries present in ``items`` are processed; if the
    listing is paginated, later pages are not fetched here - confirm whether
    that matters for large folders.
    """
    shared_link_header = "shared_link=" + web_link_url

    # `entries` may be absent on an empty listing; treat that as "nothing to do".
    for item in items.entries or []:
        if item.type == "folder":
            folder_path = os.path.join(target_dir, item.name)
            os.makedirs(folder_path, exist_ok=True)

            # print the folder name
            print("-" * 80)
            print(f"\n\n{item.type.value}\t{item.id}\t{item.name}")
            print("-" * 80)

            # Use a separate name: rebinding `items` here would shadow the
            # listing that is currently being iterated.
            child_items = client.folders.get_folder_items(
                item.id, boxapi=shared_link_header
            )
            download_items(client, child_items, web_link_url, folder_path)

        elif item.type == "file":
            print(f"{item.type.value}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue

            file_content_stream: BufferedIOBase = client.downloads.download_file(
                item.id, boxapi=shared_link_header
            )
            with open(os.path.join(target_dir, item.name), "wb") as f:
                shutil.copyfileobj(file_content_stream, f)

            print("\tdone")





# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    print("Done")





The result is (I’ve skipped the big .tar.gz files):



User: 20706451735:CCG

Shared Folder: 36938765345:CXR8

################################################################################

Type ID Name

--------------------------------------------------------------------------------





folder 37178474737 images

--------------------------------------------------------------------------------

file 371647823217 batch_download_zips.py done

file 219764235225 images_001.tar.gz .tar.gz skipped

file 219767703471 images_002.tar.gz .tar.gz skipped

file 219770039352 images_003.tar.gz .tar.gz skipped

file 221185642661 images_004.tar.gz .tar.gz skipped

file 219776556743 images_005.tar.gz .tar.gz skipped

file 219777758783 images_006.tar.gz .tar.gz skipped

file 220610700915 images_007.tar.gz .tar.gz skipped

file 219776273384 images_008.tar.gz .tar.gz skipped

file 219782291318 images_009.tar.gz .tar.gz skipped

file 219781375034 images_010.tar.gz .tar.gz skipped

file 219777519815 images_011.tar.gz .tar.gz skipped

file 219778785923 images_012.tar.gz .tar.gz skipped

--------------------------------------------------------------------------------





folder 174256157515 LongTailCXR

--------------------------------------------------------------------------------

file 1022679833262 nih-cxr-lt_image_ids.csv done

file 1022664274647 nih-cxr-lt_single-label_balanced-test.csv done

file 1022634602877 nih-cxr-lt_single-label_balanced-val.csv done

file 1022664681945 nih-cxr-lt_single-label_test.csv done

file 1022681300213 nih-cxr-lt_single-label_train.csv done

file 1022683738717 README.txt done

--------------------------------------------------------------------------------





folder 223604149466 PruneCXR

--------------------------------------------------------------------------------

file 1292084530974 miccai2023_nih-cxr-lt_labels_test.csv done

file 1292081161269 miccai2023_nih-cxr-lt_labels_train.csv done

file 1292096337058 miccai2023_nih-cxr-lt_labels_val.csv done

file 1292097450400 README.txt done

file 939374043869 AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf done

file 1001272740624 AAA Physician AI Research Opportunity!!!.pdf done

file 906187165990 AAA Postdoctoral Fellowship Opportunity!!! - NIH Medical Image Analysis Postdoc.pdf done

file 256057377774 ARXIV_V5_CHESTXRAY.pdf done

file 219760940956 BBox_List_2017.csv done

file 219760887468 Data_Entry_2017_v2020.csv done

file 249502714403 FAQ_CHESTXRAY.pdf done

file 249505703122 LOG_CHESTXRAY.pdf done

file 220660789610 README_CHESTXRAY.pdf done

file 256055473534 test_list.txt done

file 256056636701 train_val_list.txt done

Done



The trick in this Gen SDK is this line:



items = client.folders.get_folder_items(

item.id, boxapi="shared_link=" + web_link_url

)



It sends the shared link information, so it can be used as security context, otherwise it is looking for the folder/file in your own box instance, as opposed to NIHCC.



Let me know if this helps.



Best regards


Hi @dmoore247



I was looking into the differences between the classic and next gen sdk.



So here is the same example for the classic SDK:



"""demo to download files from a box web link"""



import os

from boxsdk import JWTAuth, Client





def main():
    """Download the contents of the public NIHCC shared folder into ./downloads.

    Authenticates with JWT using ``.jwt.config.json``, resolves the shared
    link to a folder and hands its items to ``download_items``.
    """
    auth = JWTAuth.from_settings_file(".jwt.config.json")
    auth.authenticate_instance()
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    # Resolve the shared link (no password: "") to its folder object.
    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)

    print("Type\tID\t\tName")
    # Create the target directory on first run, and always return to the
    # starting directory even if a download fails.
    start_dir = os.getcwd()
    os.makedirs("downloads", exist_ok=True)
    try:
        os.chdir("downloads")
        items = shared_folder.get_items()
        download_items(items)
    finally:
        os.chdir(start_dir)





def download_items(items, target_dir="."):
    """Recursively download Box items into ``target_dir``.

    Folder items are recreated as sub-directories and walked through their
    ``get_items()``; file items are written with ``download_to()``. Files
    whose name ends in ``.tar.gz`` are skipped.

    Args:
        items: Iterable of items exposing ``type``, ``id`` and ``name``.
        target_dir: Local directory to write into. Defaults to the current
            working directory. The process working directory is not changed.
    """
    for item in items:
        if item.type == "folder":
            folder_path = os.path.join(target_dir, item.name)
            os.makedirs(folder_path, exist_ok=True)

            # print the folder name
            print("-" * 80)
            print(f"\n\n{item.type}\t{item.id}\t{item.name}")
            print("-" * 80)

            download_items(item.get_items(), folder_path)

        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue

            with open(os.path.join(target_dir, item.name), "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")





# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    print("Done")





This one ends up being simpler, because it already has the context of the shared folder, and automatically handles that extra parameter/header.



Let us know if this helps.


@rbarbosa Many thanks!


Very detailed examples, and they run in your environment!



One thing I struggle with in these examples is where to obtain the dependencies:





  1. .jwt.config.json


  2. What %pip installs to run (I eventually figured this out for the first attempt)


  3. I did figure out CLIENT_ID, CLIENT_SECRET came from the dev console → OAUTH app …


  4. I don’t know where to start to find the .ccg.env and what might go in there


  5. CCG_USER_ID ? (20706451735)


  6. The .ccg.tk CACHE_FILE? Does the api library create and manage this?




I eventually just manually downloaded, uploaded, unzipped the file with the miscellaneous. The publisher provided a simple python script with python requests calls for the .tar.gz files.



Thanks again,


Douglas


Hi





The examples I’ve sent were adapted from previous questions, and that is why you find multiple security models, my apologies for the inconsistency.



Hopefully they run in your environment also 🙂, let’s make sure that happens by taking a step back.



There are 3 types of authentication modes for Box applications:





  • OAuth 2.0 - Requires user to manually authorize the application


  • Client credential grants (CCG) - Requires client id, client secret, enterprise id or user id


  • JSON Web Tokens (JWT) - Requires client id, client secret, enterprise or user id, private key, private key passphrase, key id.




To use in a script my recommendation is to go for CCG, it is the easiest to set up and move forward, but you will need to activate your developer account. I’m not sure if you are using a corporate account, a free account or a free developer account.



For this exercise I recommend you create a free developer account, if you haven’t done so yet, and if you need to, later apply this to your other box account.



To create a CCG app, go to your developer console, click create app, and select custom app:





On the second dialog select Server authentication (Client Credentials Grant)





This next step depends on what you need your app to do, but considering just downloads from a shared link, you need this:





Next you need to go through the authorization process. Flip to the authorization tab and press review and submit. This will submit the request to your box administrator, which in this free developer account is you.




Now go back to your box.com app and open your administrator console, select apps on the right side menu, and you should see your app pending authorization. Authorize the app.







Note: Remember to do this process (submit+authorize) every time you change the application configurations.





While you are in your administration console, go to account and billing and take note of your enterprise id. In my case:





Go back to the developer console and take note of the client id and client secret:





You now have everything you need to instantiate a CCG Client using the box SDK.


In my examples I tend to use a .ccg.env or just a .env file, and then use the python-dotenv to import these. You can also use them directly in your script, this is considered less secure, but it is up to you to evaluate that.



The .env file looks like this (macOS/Linux):



# Common settings

CLIENT_ID = YOUR_CLIENT_ID

CLIENT_SECRET = YOUR_CLIENT_SECRET



# CCG Settings

ENTERPRISE_ID = YOUR_ENTERPRISE_ID

CCG_USER_ID = THE_USER_ID



You can ignore the CCG_USER_ID, since it is only used if you want your script to act as a user as opposed to a service account.



You can also ignore the .ccg.tk cache file. It is used to cache the token and re-use it if still within the 60-minute window, but for this application I’m assuming getting a new token every time won’t be an issue.





I’ve ignored the .tar.gz files because they were too big for a simple demo. You can remove the if statement to download everything.



Here is a revised version for the classic SDK:



"""demo to download files from a box web link"""



import os

import dotenv

from boxsdk import CCGAuth, Client



# Name of the dotenv file that holds the application credentials.
ENV_CCG = ".ccg.env"


class ConfigCCG:
    """Application settings for Client Credentials Grant (CCG) authentication.

    The dotenv file named by ``ENV_CCG`` is loaded first, then each setting is
    read from the process environment. A variable that is not set is stored as
    ``None``.
    """

    def __init__(self) -> None:
        dotenv.load_dotenv(ENV_CCG)

        # Common configurations
        self.client_id = os.getenv("CLIENT_ID")
        self.client_secret = os.getenv("CLIENT_SECRET")

        # CCG configurations
        self.enterprise_id = os.getenv("ENTERPRISE_ID")





def main():
    """Download the contents of the public NIHCC shared folder into ./downloads.

    Authenticates with CCG using the values from ``ConfigCCG``, resolves the
    shared link to a folder and hands its items to ``download_items``.
    """
    conf = ConfigCCG()
    auth = CCGAuth(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    # Resolve the shared link (no password: "") to its folder object.
    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)

    print("Type\tID\t\tName")
    # Create the target directory on first run, and always return to the
    # starting directory even if a download fails.
    start_dir = os.getcwd()
    os.makedirs("downloads", exist_ok=True)
    try:
        os.chdir("downloads")
        items = shared_folder.get_items()
        download_items(items)
    finally:
        os.chdir(start_dir)





def download_items(items, target_dir="."):
    """Recursively download Box items into ``target_dir``.

    Folder items are recreated as sub-directories and walked through their
    ``get_items()``; file items are written with ``download_to()``. Files
    whose name ends in ``.tar.gz`` are skipped.

    Args:
        items: Iterable of items exposing ``type``, ``id`` and ``name``.
        target_dir: Local directory to write into. Defaults to the current
            working directory. The process working directory is not changed.
    """
    for item in items:
        if item.type == "folder":
            folder_path = os.path.join(target_dir, item.name)
            os.makedirs(folder_path, exist_ok=True)

            # print the folder name
            print("-" * 80)
            print(f"\n\n{item.type}\t{item.id}\t{item.name}")
            print("-" * 80)

            download_items(item.get_items(), folder_path)

        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")

            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            # comment the above block to download all files

            with open(os.path.join(target_dir, item.name), "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")





# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    print("Done")





To install the Box SDKs, although optional, you should create a virtual environment first.


For Box classic python SDK:





  • pip install boxsdk


    or


  • pip install "boxsdk[jwt]" - to include JWT support.




For the Next Gen Box SDK:





  • pip install box_sdk_gen


    or


  • pip install "box_sdk_gen[jwt]" - to include JWT support.




Let us know if this helps



Best regards


Reply