import math
import base64
import pathlib
import boto3  # sigh
import requests
from config import Config
import json

# NOTE(review): opened at import time for consistency with the sibling
# scripts; this module itself never writes to it.
log_file = open('progress_log.txt', 'a')

bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Returns the session token string on success, or None when the request
    fails or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''))
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None


def map_id(item, token, dataset_id, discover_id):
    """Map one uploaded Biolucida image back to its Pennsieve package.

    Posts to /imagemap/add so the Biolucida image id ('img_id') is
    associated with the Pennsieve package id, the dataset id and the
    Discover dataset id.  Returns *item* unchanged.
    """
    print(item, token)
    url_bl_imagemap = f"{Config.BIOLUCIDA_ENDPOINT}/imagemap/add"
    resp = requests.post(url_bl_imagemap,
                         data=dict(
                             imageId=item['img_id'],
                             sourceId=item['package_id'],
                             blackfynn_datasetId=dataset_id,
                             discover_datasetId=discover_id
                         ),
                         headers=dict(token=token))
    print(resp)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        print(content)
    else:
        # fix: failed mappings previously passed silently; surface them so
        # the operator knows which images still need to be mapped
        print(f"imagemap/add failed for image {item['img_id']}: HTTP {resp.status_code}")
    return item


def main():
    """Read output_with_id.json and map every successfully uploaded image."""
    dataset_id = Config.DATASET_UUID
    discover_id = Config.DISCOVER_ID
    if dataset_id and discover_id:
        try:
            f = open('output_with_id.json', 'rb')
            with f:
                token = get_biolucida_token()
                if token is None:
                    # fix: previously a failed login fell through and every
                    # imagemap call was made with token=None
                    print("Biolucida authentication failed.")
                    return
                data = json.load(f)
                for item in data:
                    # robustness: skip null entries a partial get_id.py run
                    # may have left in the file
                    if item and item['status'] == 'successful' and 'img_id' in item and item['img_id']:
                        map_id(item, token, dataset_id, discover_id)
        except OSError:
            print("No input file")
    else:
        print("Missing dataset uuid or discover id or both.")


if __name__ == "__main__":
    main()
import os


class Config(object):
    """Deployment settings, each read from the environment with a fallback."""

    _env = os.environ.get

    PENNSIEVE_API_HOST = _env("PENNSIEVE_API_HOST", "https://api.pennsieve.io")
    PENNSIEVE_API_SECRET = _env("PENNSIEVE_API_SECRET", "local-secret-key")
    PENNSIEVE_API_TOKEN = _env("PENNSIEVE_API_TOKEN", "local-api-key")
    # note: the attribute name and the variable name differ deliberately;
    # this one has no fallback and is None when unset
    PENNSIEVE_ORGANIZATION_ID = _env("PENNSIEVE_ORGANIZATION")
    BIOLUCIDA_ENDPOINT = _env("BIOLUCIDA_ENDPOINT", "https://sparc.biolucida.net/api/v1")
    BIOLUCIDA_USERNAME = _env("BIOLUCIDA_USERNAME", "major-user")
    BIOLUCIDA_PASSWORD = _env("BIOLUCIDA_PASSWORD", "local-password")
    TEST_DATASET_ID = _env("TEST_DATASET_ID", "")
    TEST_PACKAGE_ID = _env("TEST_PACKAGE_ID", "")
    SPARC_API = _env("SPARC_API", "https://api.sparc.science/")
    DATASET_UUID = _env("DATASET_UUID", "")
    DISCOVER_ID = _env("DISCOVER_ID", "")
    COLLECTION_ID = _env("COLLECTION_ID", "")

    del _env  # keep the helper out of the public class namespace
f"{Config.BIOLUCIDA_ENDPOINT}/image/colandbasename" + resp = requests.post(url_bl_colandbasename, + data=dict( + col_id=col_id, + basename=item['basename'], + ), + headers=dict(token=token)) + print(resp) + if resp.status_code == requests.codes.ok: + content = resp.json() + print(content) + if content['status'] == 'success' and 'image_id' in content: + item['img_id'] = content['image_id'] + item['collection_id'] = col_id + return item + + +def main(): + dataset_id = Config.DATASET_UUID # f001 + collection_id = Config.COLLECTION_ID + bp_list = [] + try: + f = open('input.json', 'rb') + with f: + token = get_biolucida_token() + data = json.load(f) + for item in data: + if item['status'] == 'successful': + bp_list.append(get_biolucida_id(item, token, collection_id)) + + except OSError: + print("No input file") + + with open('output_with_id.json', 'w') as f: + json.dump(bp_list, f) + +if __name__ == "__main__": + main() diff --git a/sparcur_internal/bioluc_upload/instructions.txt b/sparcur_internal/bioluc_upload/instructions.txt new file mode 100644 index 00000000..7bd6e160 --- /dev/null +++ b/sparcur_internal/bioluc_upload/instructions.txt @@ -0,0 +1,41 @@ +Before running the script, please make sure the following environment variables are ready: + +PENNSIEVE_API_SECRET +PENNSIEVE_API_TOKEN +PENNSIEVE_ORGANIZATION +BIOLUCIDA_USERNAME +BIOLUCIDA_PASSWORD +DATASET_UUID + +Note: The script does not create a new collection, you will need to move the images to a collection then make the collection public if neccessary but please beware that moving an image to a new collection chages the image ids. +It may be possible to upload an image directly into a collection but I have not tried that yet. + +First run the penn_bioluc.py script: +1. For the dataset of interest, get the dataset UUID and set it with the DATASET_UUID environment variable. + +Steps 2 to 7 are details in the penn_bioluc.py script: + +2. 
Using the dataset UUID, get metadata and path metadata information from SciCrunch? - +"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json" and "https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json". The information required is the pennsieve dataset id, published id, package id, filename and filesize. + +3. Authenticate to Pennsieve API server with curator access. From this server, we get the s3 URL for downloading/streaming the file. + +4. Authenticate to the Biolucida server and get the access token for further API calls + +5. Initiate the Biolucida upload with /upload/init, pass in the filename, filesize, chunk_size and token as parameters + +6. Request data of the file in chunks and send them to Biolucida using the /upload/continue endpoint + +7. After the last chunk has been sent, finalise the Biolucida upload by calling /upload/finish + +8. Timeout may occur and in that case, wait for the process to finish running and then copy and rename output.json to input.json and run the penn_bioluc.py script again. +Based on my experience, the script may need to be rerun multiple times. + +9. Once the script runs successfully, copy and rename output.json to input.json then run get_id.py, a new file called output_with_id.json will be created. + + +Note: If the images have been moved to a collection, all the image ids will be changed in the new collection, in that case, please run get_id.py with the environment variable COLLECTION_ID set which will allow the image_ids to be fetched from the dataset. + +10. The output_with_id.json should contain the basename, collection id, pennsieve package id and biolucida image id for each of the uploaded images. It may take some time for the biolucida server to process all the files and the biolucida image id may be missing for some entries in the json file, in that case please rerun step 9 after a few hours. + +11.
import math
import base64
import pathlib
import boto3  # sigh
import requests
from config import Config
import json

# Shared append-mode progress log; written throughout a run, closed in main().
log_file = open('progress_log.txt', 'a')

# One status dict per processed package; dumped to output.json by main().
bp_list = []


def get_biolucida_token():
    """Authenticate against the Biolucida API.

    Returns the session token string on success, or None when the request
    fails or the server reports a non-success status.
    """
    url_bl_auth = f"{Config.BIOLUCIDA_ENDPOINT}/authenticate"
    response = requests.post(url_bl_auth,
                             data=dict(
                                 username=Config.BIOLUCIDA_USERNAME,
                                 password=Config.BIOLUCIDA_PASSWORD,
                                 token=''))
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['token']
    return None


def initiate_biolucida_upload(filename, filesize, chunk_size, token):
    """Start a chunked Biolucida upload.

    chunk_size is the size after base64 decoding.  Returns
    (upload_key, total_chunks) on success, otherwise None.
    """
    url_bl_uinit = f"{Config.BIOLUCIDA_ENDPOINT}/upload/init"
    response = requests.post(url_bl_uinit,
                             data=dict(
                                 filename=filename,
                                 filesize=filesize,
                                 chunk_size=chunk_size),
                             headers=dict(token=token))
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['upload_key'], content['total_chunks']
    return None


def cancel_biolucida_upload(upload_key):
    """Abort an in-flight upload; returns (filepath, files) or None."""
    url_bl_ucancel = f"{Config.BIOLUCIDA_ENDPOINT}/upload/cancel"
    response = requests.post(url_bl_ucancel,
                             data=dict(
                                 upload_key=upload_key
                             ))
    if response.status_code == requests.codes.ok:
        content = response.json()
        if content['status'] == 'success':
            return content['filepath'], content['files']
    return None


def finalise_biolucida_upload(upload_key, filename):
    """Finish a chunked upload.

    Returns a dict with 'img_id' (or 'collection_id' + 'basename' while
    the server is still processing the file), an empty dict when the
    server replied but did not report success, or None when the finish
    call itself failed.
    """
    url_bl_ufin = f"{Config.BIOLUCIDA_ENDPOINT}/upload/finish"
    response = requests.post(url_bl_ufin,
                             data=dict(upload_key=upload_key))
    output = {}
    if response.status_code == requests.codes.ok:
        content = response.json()  # fix: was decoded twice
        if content['status'] == 'success':
            # fix: log completion only on confirmed success, and log the
            # actual filename instead of the literal "(unknown)" placeholder
            log_file.write(f"Upload for {filename} completed\n")
            if 'img_id' in content:
                log_file.write(f"Finish api biolucida id: {content['img_id']}\n")
                output['img_id'] = content['img_id']
            elif 'collection_id' in content and 'basename' in content:
                log_file.write(f"Finish api biolucida id: {content['collection_id']}\n")
                output['collection_id'] = content['collection_id']
                output['basename'] = content['basename']
        return output
    else:
        log_file.write(f"Finish api for upload for {filename} failed\n")
        return None


def get_biolucida_id(filename):
    """Search Biolucida for *filename*; return its image url (the id) or None."""
    # fix: the search term was the literal string "(unknown)", which could
    # never match; search by the filename we are trying to resolve
    url_bl_search = f"{Config.BIOLUCIDA_ENDPOINT}/search/{filename}"
    resp = requests.get(url_bl_search)
    if resp.status_code == requests.codes.ok:
        content = resp.json()
        if content['status'] == 'success':
            images = content['images']
            for image in images:
                if image['original_name'] == filename:
                    return image['url']  # this is the id
    return None


def get_upload_key(resp):
    # NOTE(review): dead leftover from an earlier revision -- 'imageid' is
    # undefined, so calling this raises NameError.  Nothing in this module
    # calls it; kept only so the module interface is unchanged.
    print(resp.headers, resp.text)
    return imageid


def upload_to_bl(dataset_id, published_id, package_id, s3url, filename, filesize, chunk_size=1048576):
    """Stream one file from its S3 url into Biolucida in base64 chunks.

    Appends a status dict for the package to the module-level bp_list;
    its 'status' is 'successful' only when the finish call returned data.
    """
    print(f"Uploading {published_id}, {s3url}, {filename}")
    log_file.write(f"Upload {published_id}, {dataset_id}, {package_id}, {s3url}, {filename}, {filesize}\n")
    # see https://documenter.getpostman.com/view/8986837/SWLh5mQL
    # see also https://github.com/nih-sparc/sparc-app/blob/0ca1c33e245b39b0f07485a990e3862af085013e/nuxt.config.js#L101
    token = get_biolucida_token()
    item = {
        "package_id": package_id,
        "filename": filename,
        "discover_id": published_id,
        "status": "failed"
    }

    if token:
        # fix: initiate_biolucida_upload returns None on failure, which the
        # old two-value unpack turned into a TypeError
        init = initiate_biolucida_upload(filename, filesize, chunk_size, token)
        if init:
            upload_key, expect_chunks = init
            log_file.write(f"{upload_key}, {expect_chunks}\n")
            url_bl_ucont = f"{Config.BIOLUCIDA_ENDPOINT}/upload/continue"  # upload_key upload_data chunk_id
            resp_s3 = requests.get(s3url, stream=True)
            for i, chunk in enumerate(resp_s3.iter_content(chunk_size=chunk_size)):
                msg = f"Chunk {i} of {expect_chunks}: "
                log_file.write(msg)
                print(msg)
                b64chunk = base64.encodebytes(chunk)
                resp_cont = requests.post(url_bl_ucont,
                                          data=dict(
                                              upload_key=upload_key,
                                              upload_data=b64chunk,
                                              chunk_id=i))
                ok = (resp_cont.status_code == requests.codes.ok
                      and resp_cont.json()['status'] == 'success')
                log_file.write("Successful\n" if ok else "Fail\n")
                print("Successful" if ok else "Fail")

            data = finalise_biolucida_upload(upload_key, filename)
            if data:
                item['status'] = "successful"
                for key in data:
                    item[key] = data[key]
    bp_list.append(item)
    print(item['status'])


def kwargs_from_pathmeta(blob, pennsieve_session, published_id):
    """Build upload_to_bl keyword arguments from one path-metadata blob.

    Fetches the pre-signed S3 download url for the package through the
    authenticated Pennsieve session.
    """
    dataset_id = 'N:' + blob['dataset_id']
    package_id = 'N:' + blob['remote_id']
    filename = blob['basename']
    filesize = blob['size_bytes']
    resp = pennsieve_session.get(blob['uri_api'])
    s3url = resp.json()['url']
    return dict(
        dataset_id=dataset_id,
        published_id=published_id,
        package_id=package_id,
        s3url=s3url,
        filename=filename,
        filesize=filesize
    )


def make_pennsieve_session():
    """Log in to Pennsieve via Cognito and return an authorized requests.Session."""
    api_key = Config.PENNSIEVE_API_TOKEN
    api_secret = Config.PENNSIEVE_API_SECRET

    r = requests.get(f"{Config.PENNSIEVE_API_HOST}/authentication/cognito-config")
    r.raise_for_status()

    cognito_config = r.json()
    cognito_app_client_id = cognito_config["tokenPool"]["appClientId"]
    cognito_region = cognito_config["region"]

    # empty credentials: USER_PASSWORD_AUTH needs no AWS signing
    cognito_idp_client = boto3.client(
        "cognito-idp",
        region_name=cognito_region,
        aws_access_key_id="",
        aws_secret_access_key="",
    )

    login_response = cognito_idp_client.initiate_auth(
        AuthFlow="USER_PASSWORD_AUTH",
        AuthParameters={"USERNAME": api_key, "PASSWORD": api_secret},
        ClientId=cognito_app_client_id,
    )

    api_token = login_response["AuthenticationResult"]["AccessToken"]

    session = requests.Session()
    session.headers.update({"Authorization": f"Bearer {api_token}"})
    return session


def process_files(dataset_id, skipped, extensions=("jpx", "jp2"), bioluc_username=None):
    """Upload every matching file of the dataset that is not in *skipped*.

    dataset_id      : pennsieve dataset id or bare uuid
    skipped         : package ids already uploaded in a previous run
    extensions      : file extensions (without dot) to upload
    bioluc_username : unused; kept for interface compatibility
    """
    dataset_uuid = dataset_id.split(':')[-1]
    url_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json"
    url_path_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json"

    # fetch metadata and path metadata
    metadata = requests.get(url_metadata).json()
    path_metadata = requests.get(url_path_metadata).json()
    published_id = metadata['meta'].get('id_published', None)

    pennsieve_session = make_pennsieve_session()

    # fix: honour the declared extensions parameter instead of the
    # hardcoded .jpx/.jp2 pair (default behaviour is unchanged)
    suffixes = tuple('.' + ext for ext in extensions)
    matches = [blob for blob in path_metadata['data']
               if blob['basename'].endswith(suffixes)]

    wargs = []
    for match in matches:
        wargs.append(kwargs_from_pathmeta(match, pennsieve_session, published_id))

    for warg in wargs:
        try:
            if not warg['package_id'] in skipped:
                print('Required uploading', warg['filename'])
                upload_to_bl(**warg)
            else:
                print('uploaded', warg['filename'])
        except Exception as e:  # fix: bare except also swallowed KeyboardInterrupt
            print('failed', warg['filename'], e)
            item = {
                "package_id": warg['package_id'],
                "filename": warg['filename'],
                "discover_id": warg['published_id'],
                "status": "failed"
            }
            bp_list.append(item)


def main():
    """Resume-aware driver.

    Reloads prior successes from input.json (so they are skipped), uploads
    the rest, and dumps the combined status list to output.json.
    """
    dataset_id = Config.DATASET_UUID  # f001
    skipped = []
    try:
        f = open('input.json', 'rb')
        with f:
            data = json.load(f)
            for item in data:
                if item['status'] == 'successful':
                    bp_list.append(item)
                    skipped.append(item['package_id'])
            print(skipped)
    except OSError:
        print("No input file")

    process_files(dataset_id, skipped)
    log_file.close()

    with open('output.json', 'w') as f:
        json.dump(bp_list, f)


if __name__ == "__main__":
    main()
- token = fun0(resp_auth) - - resp_init = requests.post(url_bl_uinit, - data=dict( - filename=filename, - filesize=filesize, - chunk_size=chunk_size), - headers=dict(token=token)) - upload_key = fun1(resp_init) - - resp_s3 = requests.get(s3url, stream=True) - expect_chunks = math.ceil(filesize / chunk_size) - for i, chunk in enumerate(resps3.iter_content(chunk_size=chunk_size)): - b64chunk = base64.encode(chunk) - resp_cont = requests.post(url_bl_ucont, - data=dict( - upload_key=upload_key, - upload_data=b64chunk, - chunk_id=i)) - print(resp_cont.text) - - resp_fin = requests.post(url_bl_ufin, - data=dict(upload_key=upload_key)) - - imageid = fun2(resp_fin) # ... uh no idea how we get this, hopefully it is in resp_fin ??? - resp_img = requests.post(url_bl_ima, - data=dict( - imageId=imageid, - sourceId=package_id, - blackfynn_datasetId=dataset_id, - discover_datasetId=id_published), - headers=dict(token=token)) - print(resp_img.text) - - -def kwargs_from_pathmeta(blob, pennsieve_session, published_id): - dataset_id = 'N:' + blob['dataset_id'] - package_id = 'N:' + blob['remote_id'] - filename = blob['basename'] - filesize = blob['size_bytes'] - - resp = pennsieve_session.get(blob['uri_api']) - s3url = resp.json()['url'] - return dict( - dataset_id=dataset_id, - published_id=published_id, - package_id=package_id, - s3url=s3url, - filename=filename, - filesize=filesize - ) - - -def make_pennsieve_session(secrets, organization_id): - api_key = secrets('pennsieve', organization_id, 'key') - api_secret = secrets('pennsieve', organization_id, 'secret') - PENNSIEVE_URL = "https://api.pennsieve.io" - - r = requests.get(f"{PENNSIEVE_URL}/authentication/cognito-config") - r.raise_for_status() - - cognito_app_client_id = r.json()["tokenPool"]["appClientId"] - cognito_region = r.json()["region"] - - cognito_idp_client = boto3.client( - "cognito-idp", - region_name=cognito_region, - aws_access_key_id="", - aws_secret_access_key="", - ) - - login_response = 
cognito_idp_client.initiate_auth( - AuthFlow="USER_PASSWORD_AUTH", - AuthParameters={"USERNAME": api_key, "PASSWORD": api_secret}, - ClientId=cognito_app_client_id, - ) - - api_token = login_response["AuthenticationResult"]["AccessToken"] - - session = requests.Session() - session.headers.update({"Authorization": f"Bearer {api_token}"}) - return session - - -def upload_dataset_files_to_bioluc(dataset_id, secrets=None, extensions=("jpx", "jp2"), bioluc_username=None): - dataset_uuid = dataset_id.split(':')[-1] - url_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/curation-export.json" - url_path_metadata = f"https://cassava.ucsd.edu/sparc/datasets/{dataset_uuid}/LATEST/path-metadata.json" - - # fetch metadata and path metadata - metadata = requests.get(url_metadata).json() - path_metadata = requests.get(url_path_metadata).json() - published_id = metadata['meta'].get('id_published', None) - organization_id = 'N:' + path_metadata['data'][0]['external_parent_id'] - - pennsieve_session = make_pennsieve_session(secrets, organization_id) - - # get jpx and jp2 files - matches = [] - for blob in path_metadata['data']: - bn = blob['basename'] - if bn.endswith('.jpx') or bn.endswith('.jp2'): - matches.append(blob) - - wargs = [] - for match in matches: - wargs.append(kwargs_from_pathmeta(match, pennsieve_session, published_id)) - - for warg in wargs: - upload_to_bl(**warg, secrets=secrets, username=bioluc_username) - - # filter for just the jpx and jp2 files - # get the package ids - # loop over the package ids and - # get the s3 key from pennsieve api - # pull from the s3 address and upload the biolucida endpoint - # get the image id from biolucida - # post the package id to the biolucida image id so that it is mapped - - -def main(): - dataset_id = "N:dataset:aa43eda8-b29a-4c25-9840-ecbd57598afc" # f001 - secrets = Secrets(pathlib.Path('~/ni/dev/secrets.sxpr').expanduser()) - upload_dataset_files_to_bioluc(dataset_id, secrets=secrets, 
bioluc_username='tgbugs') - - -if __name__ == "__main__": - main()