Code
client = Client(auth)
SHARED_URL = "https://nihcc.app.box.com/v/ChestXray-NIHCC"
# Resolve the shared link to the folder object it points at.
root_folder = client.get_shared_item(SHARED_URL, '')
print(root_folder.name)  # prints CXR8
items = root_folder.get_items()
for item in items:
    print('{0} {1} is named "{2}"'.format(item.type.capitalize(), item.id, item.name))
    # prints: File 939374043869 is named "AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf"
    if item.type == 'file':
        file_name = F"{download_folder}/{item.name}"
        print(item.id)  # prints: 939374043869
        # The original call, client.file(item.id).get_download_url(), failed
        # with a 404: a file object rebuilt from the bare id is looked up in
        # the caller's own Box account instead of through the shared link.
        # Reuse the `item` returned by the shared folder listing instead.
        # NOTE(review): assumes listed items keep the shared-link context in
        # the classic SDK — confirm against the boxsdk documentation.
        download_url = item.get_download_url()
Hi @dmoore247 , welcome to the forum!
Yes this can be a little confusing…
I’m not sure which Python SDK you are using, but with the Next Gen one, the issue is that you need to send the original shared link to download the files.
I have prepared an example for you:
import os
import shutil
from io import BufferedIOBase
import dotenv
from box_sdk_gen import (
BoxCCGAuth,
CCGConfig,
BoxClient as Client,
BoxAPIError,
Items,
)
from box_sdk_gen import DownloadsManager
ENV_CCG = ".ccg.env"
class ConfigCCG:
    """Application configuration read from the dotenv file and the environment.

    Every attribute is the raw result of ``os.getenv`` and is therefore
    ``None`` when the corresponding variable is not set (except
    ``cache_file``, which has a default).
    """

    def __init__(self) -> None:
        # Load the variables declared in the file named by ENV_CCG.
        dotenv.load_dotenv(ENV_CCG)

        # Common configurations
        self.client_id = os.getenv("CLIENT_ID")
        self.client_secret = os.getenv("CLIENT_SECRET")

        # CCG configurations
        self.enterprise_id = os.getenv("ENTERPRISE_ID")
        self.ccg_user_id = os.getenv("CCG_USER_ID")

        # Token cache file name; falls back to ".ccg.tk" when CACHE_FILE is unset.
        self.cache_file = os.getenv("CACHE_FILE", ".ccg.tk")
def main():
    """Download the contents of the NIHCC shared folder into ``./downloads``.

    Authenticates with Client Credentials Grant using the values from
    ``ConfigCCG``, resolves the shared link to its folder, and hands the
    folder listing to ``download_items``. Box API errors raised while
    resolving or downloading are printed rather than propagated.
    """
    conf = ConfigCCG()
    ccg_conf = CCGConfig(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    auth = BoxCCGAuth(ccg_conf)
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"
    # Every call that touches shared content must carry the shared link in
    # the `boxapi` argument.
    shared_link_header = "shared_link=" + web_link_url

    user = client.users.get_user_me()
    print(f"User: {user.id}:{user.name}")

    # Remember where we started so the working directory is restored even if
    # a download fails part-way through a nested folder.
    start_dir = os.getcwd()
    try:
        shared_folder = client.shared_links_folders.find_folder_for_shared_link(
            boxapi=shared_link_header
        )
        print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
        print("#" * 80)
        print("Type\tID\t\tName")

        # Create the target directory on first run instead of crashing.
        os.makedirs("downloads", exist_ok=True)
        os.chdir("downloads")

        items = client.folders.get_folder_items(
            shared_folder.id, boxapi=shared_link_header
        )
        download_items(client, items, web_link_url)
    except BoxAPIError as e:
        print(f"Error: {e}")
    finally:
        os.chdir(start_dir)
def download_items(client: Client, items: Items, web_link_url):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into; files
    are streamed to disk under their Box name. Files whose name ends with
    ``.tar.gz`` are skipped.

    Args:
        client: Box client used for the listing and download calls.
        items: Folder listing whose ``entries`` are processed.
        web_link_url: Shared link URL, passed with every call so the items
            are resolved through the shared link.
    """
    shared_link_header = "shared_link=" + web_link_url

    # NOTE(review): only the entries of the listing that was passed in are
    # processed; if the API pages large folders, later pages are not
    # fetched here — confirm against the SDK documentation.
    for item in items.entries:
        if item.type == "folder":
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type.value}\t{item.id}\t{item.name}")
                print("-" * 80)
                child_items = client.folders.get_folder_items(
                    item.id, boxapi=shared_link_header
                )
                download_items(client, child_items, web_link_url)
            finally:
                # Always step back out, even when a nested download fails.
                os.chdir(parent_dir)
        elif item.type == "file":
            print(f"{item.type.value}\t{item.id}\t{item.name}", end="")
            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            file_content_stream: BufferedIOBase = client.downloads.download_file(
                item.id, boxapi=shared_link_header
            )
            # Close the download stream as well as the local file.
            with file_content_stream, open(item.name, "wb") as f:
                shutil.copyfileobj(file_content_stream, f)
            print("\tdone")
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
    print("Done")
The result is (I’ve skipped the big .tar.gz files):
User: 20706451735:CCG
Shared Folder: 36938765345:CXR8
################################################################################
Type ID Name
--------------------------------------------------------------------------------
folder 37178474737 images
--------------------------------------------------------------------------------
file 371647823217 batch_download_zips.py done
file 219764235225 images_001.tar.gz .tar.gz skipped
file 219767703471 images_002.tar.gz .tar.gz skipped
file 219770039352 images_003.tar.gz .tar.gz skipped
file 221185642661 images_004.tar.gz .tar.gz skipped
file 219776556743 images_005.tar.gz .tar.gz skipped
file 219777758783 images_006.tar.gz .tar.gz skipped
file 220610700915 images_007.tar.gz .tar.gz skipped
file 219776273384 images_008.tar.gz .tar.gz skipped
file 219782291318 images_009.tar.gz .tar.gz skipped
file 219781375034 images_010.tar.gz .tar.gz skipped
file 219777519815 images_011.tar.gz .tar.gz skipped
file 219778785923 images_012.tar.gz .tar.gz skipped
--------------------------------------------------------------------------------
folder 174256157515 LongTailCXR
--------------------------------------------------------------------------------
file 1022679833262 nih-cxr-lt_image_ids.csv done
file 1022664274647 nih-cxr-lt_single-label_balanced-test.csv done
file 1022634602877 nih-cxr-lt_single-label_balanced-val.csv done
file 1022664681945 nih-cxr-lt_single-label_test.csv done
file 1022681300213 nih-cxr-lt_single-label_train.csv done
file 1022683738717 README.txt done
--------------------------------------------------------------------------------
folder 223604149466 PruneCXR
--------------------------------------------------------------------------------
file 1292084530974 miccai2023_nih-cxr-lt_labels_test.csv done
file 1292081161269 miccai2023_nih-cxr-lt_labels_train.csv done
file 1292096337058 miccai2023_nih-cxr-lt_labels_val.csv done
file 1292097450400 README.txt done
file 939374043869 AAA Job Opportunity!!! Cloud Computing - Masters or PhD Degree.pdf done
file 1001272740624 AAA Physician AI Research Opportunity!!!.pdf done
file 906187165990 AAA Postdoctoral Fellowship Opportunity!!! - NIH Medical Image Analysis Postdoc.pdf done
file 256057377774 ARXIV_V5_CHESTXRAY.pdf done
file 219760940956 BBox_List_2017.csv done
file 219760887468 Data_Entry_2017_v2020.csv done
file 249502714403 FAQ_CHESTXRAY.pdf done
file 249505703122 LOG_CHESTXRAY.pdf done
file 220660789610 README_CHESTXRAY.pdf done
file 256055473534 test_list.txt done
file 256056636701 train_val_list.txt done
Done
The trick in this Gen SDK is this line:
items = client.folders.get_folder_items(
item.id, boxapi="shared_link=" + web_link_url
)
It sends the shared link information so it can be used as the security context; otherwise the API looks for the folder/file in your own Box instance rather than in NIHCC's.
Let me know if this helps.
Best regards
Hi @dmoore247
I was looking into the differences between the classic and next gen sdk.
So here is the same example for the classic SDK:
"""demo to download files from a box web link"""
import os
from boxsdk import JWTAuth, Client
def main():
    """Download the contents of the NIHCC shared folder into ``./downloads``.

    Authenticates with JWT using ``.jwt.config.json``, resolves the shared
    link to its folder, and passes the folder listing to ``download_items``.
    """
    auth = JWTAuth.from_settings_file(".jwt.config.json")
    auth.authenticate_instance()
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)
    print("Type\tID\t\tName")

    # Create the target directory on first run, and restore the original
    # working directory even if a download fails in a nested folder.
    start_dir = os.getcwd()
    os.makedirs("downloads", exist_ok=True)
    os.chdir("downloads")
    try:
        items = shared_folder.get_items()
        download_items(items)
    finally:
        os.chdir(start_dir)
def download_items(items):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into via
    ``item.get_items()``; files are written with ``item.download_to``.
    Files whose name ends with ``.tar.gz`` are skipped.

    Args:
        items: Iterable of Box items exposing ``type``, ``id`` and ``name``.
    """
    for item in items:
        if item.type == "folder":
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type}\t{item.id}\t{item.name}")
                print("-" * 80)
                download_items(item.get_items())
            finally:
                # Always step back out, even when a nested download fails.
                os.chdir(parent_dir)
        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")
            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            with open(item.name, "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
    print("Done")
This one ends up being simpler, because it already has the context of the shared folder, and automatically handles that extra parameter/header.
Let us know if this helps.
@rbarbosa Many thanks!
Very detailed examples, and they run in your environment!
There are a few things I struggle with in these examples, mainly where to obtain the dependencies:
.jwt.config.json
- What %pip installs to run (I eventually figured this out for the first attempt)
- I did figure out CLIENT_ID, CLIENT_SECRET came from the dev console → OAUTH app …
- I don’t know where to start to find the
.ccg.env
and what might go in there
CCG_USER_ID
? (20706451735)
- The
.ccg.tk
CACHE_FILE? Does the api library create and manage this?
I eventually just manually downloaded, uploaded, unzipped the file with the miscellaneous. The publisher provided a simple python script with python requests
calls for the .tar.gz files.
Thanks again,
Douglas
Hi
The examples I’ve sent were adapted from previous questions, and that is why you find multiple security models, my apologies for the inconsistency.
Hopefully they run in your environment also 🙂, let’s make sure that happens by taking a step back.
There are 3 types of authentication modes for Box applications:
- OAuth 2.0 - Requires user to manually authorize the application
- Client credential grants (CCG) - Requires client id, client secret, enterprise id or user id
- JSON Web Tokens (JWT) - Requires client id, client secret, enterprise or user id, private key, private key passphrase, key id.
To use in a script my recommendation is to go for CCG, it is the easiest to set up and move forward, but you will need to activate your developer account. I’m not sure if you are using a corporate account, a free account or a free developer account.
For this exercise I recommend you create a free developer account, if you haven’t done so yet, and if you need to, later apply this to your other box account.
To create a CCG app, go to your developer console, click create app, and select custom app:
On the second dialog select Server authentication (Client Credentials Grant)
This next step depends on what you need your app to do, but considering just downloads from a shared link, you need this:
Next you need to go through the authorization process. Flip to the authorization tab and press review and submit. This will submit the request to your box administrator, which in this free developer account is you.
Now go back to your box.com app and open your administrator console, select apps on the right side menu, and you should see your app pending authorization. Authorize the app.
Note: Remember to do this process (submit+authorize) every time you change the application configurations.
While you are in your administration console, go to account and billing and take note of your enterprise id. In my case:
Go back to the developer console and take note of the client id and client secret:
You now have everything you need to instantiate a CCG Client using the box SDK.
In my examples I tend to use a .ccg.env
or just a .env
file, and then use python-dotenv to load these values. You can also put them directly in your script; this is considered less secure, but it is up to you to evaluate that.
The .env file looks like this (macOS/Linux):
# Common settings
CLIENT_ID = YOUR_CLIENT_ID
CLIENT_SECRET = YOUR_CLIENT_SECRET
# CCG Settings
ENTERPRISE_ID = YOUR_ENTERPRISE_ID
CCG_USER_ID = THE_USER_ID
(You can ignore the CCG_USER_ID, since it is only used if you want your script to act as a user as opposed to a service account.)
You can also ignore the .ccg.tk cache file. It is used to cache the token and re-use it while still within the 60-minute window, but for this application I'm assuming getting a new token every time won't be an issue.
I’ve ignored the .tar.gz
files because they were too big for a simple demo. You can remove the if
statement to download everything.
Here is a revised version for the classic SDK:
"""demo to download files from a box web link"""
import os
import dotenv
from boxsdk import CCGAuth, Client
ENV_CCG = ".ccg.env"
class ConfigCCG:
    """Application configuration read from the dotenv file and the environment.

    Every attribute is the raw result of ``os.getenv`` and is therefore
    ``None`` when the corresponding variable is not set.
    """

    def __init__(self) -> None:
        # Load the variables declared in the file named by ENV_CCG.
        dotenv.load_dotenv(ENV_CCG)

        # Common configurations
        self.client_id = os.getenv("CLIENT_ID")
        self.client_secret = os.getenv("CLIENT_SECRET")

        # CCG configurations
        self.enterprise_id = os.getenv("ENTERPRISE_ID")
def main():
    """Download the contents of the NIHCC shared folder into ``./downloads``.

    Authenticates with Client Credentials Grant using the values from
    ``ConfigCCG``, resolves the shared link to its folder, and passes the
    folder listing to ``download_items``.
    """
    conf = ConfigCCG()
    auth = CCGAuth(
        client_id=conf.client_id,
        client_secret=conf.client_secret,
        enterprise_id=conf.enterprise_id,
    )
    client = Client(auth)

    web_link_url = "https://nihcc.app.box.com/v/ChestXray-NIHCC"

    user = client.user().get()
    print(f"User: {user.id}:{user.name}")

    shared_folder = client.get_shared_item(web_link_url, "")
    print(f"Shared Folder: {shared_folder.id}:{shared_folder.name}")
    print("#" * 80)
    print("Type\tID\t\tName")

    # Create the target directory on first run, and restore the original
    # working directory even if a download fails in a nested folder.
    start_dir = os.getcwd()
    os.makedirs("downloads", exist_ok=True)
    os.chdir("downloads")
    try:
        items = shared_folder.get_items()
        download_items(items)
    finally:
        os.chdir(start_dir)
def download_items(items):
    """Recursively download ``items`` into the current working directory.

    Folders are mirrored as local sub-directories and descended into via
    ``item.get_items()``; files are written with ``item.download_to``.
    Files whose name ends with ``.tar.gz`` are skipped.

    Args:
        items: Iterable of Box items exposing ``type``, ``id`` and ``name``.
    """
    for item in items:
        if item.type == "folder":
            os.makedirs(item.name, exist_ok=True)
            parent_dir = os.getcwd()
            os.chdir(item.name)
            try:
                # print the folder name
                print("-" * 80)
                print(f"\n\n{item.type}\t{item.id}\t{item.name}")
                print("-" * 80)
                download_items(item.get_items())
            finally:
                # Always step back out, even when a nested download fails.
                os.chdir(parent_dir)
        elif item.type == "file":
            print(f"{item.type}\t{item.id}\t{item.name}", end="")
            # check if item name ends with .tar.gz
            if item.name.endswith(".tar.gz"):
                print("\t .tar.gz skipped")
                continue
            # comment the above block to download all files
            with open(item.name, "wb") as download_file:
                item.download_to(download_file)
            print("\tdone")
if __name__ == "__main__":
    # Run the demo only when executed as a script, not when imported.
    main()
    print("Done")
To install the Box SDKs, you should first create a virtual environment (this is optional, but recommended).
For Box classic python SDK:
pip install boxsdk
or
pip install "boxsdk[jwt]"
- to include JWT support.
For the Next Gen Box SDK:
pip install box_sdk_gen
or
pip install "box_sdk_gen[jwt]"
- to include JWT support.
Let us know if this helps
Best regards