diff --git a/README.md b/README.md index 2d3c98a..946f3fc 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ To simulate selenium and production integration tests: One of the key steps in ReDU is the updating of the database to include the latest identifications for files within ReDU. These are the following steps: 1. Download batch template for GNPS at ```https://redu.ucsd.edu/metabatchdump``` -1. Run Batch Workflow for Spectral Library Search +1. Run [Batch Workflow for Spectral Library Search](https://github.com/mwang87/ReDU-MS2-GNPS/blob/master/code/search_all_data.py) 1. Get the set of tasks as tsv and save to [here](https://github.com/mwang87/ReDU-MS2-GNPS/blob/master/database/global_tasks.tsv). 1. Remove database [here](https://github.com/mwang87/ReDU-MS2-GNPS/tree/master/database) 1. Remove all untracked files in temp, this will be for the global pca diff --git a/code/search_all_data.py b/code/search_all_data.py index 77e9984..7fcedc4 100644 --- a/code/search_all_data.py +++ b/code/search_all_data.py @@ -7,29 +7,26 @@ import credentials from models import * -def main(): - all_filenames = list(Filename.select()) +def parse_metabatch_dump(): + metabatch_filename = '../database/metabatchdump.tsv' + all_filenames = pd.read_table(metabatch_filename) + task_id_list = [] + parallelism = len(all_filenames) - PARALLISM = 20 - for i in range(PARALLISM): - partition_filenames = all_filenames[i::PARALLISM] + for index, row in all_filenames.iterrows(): + filenames = row.filename + id = row.id - filenames_list = [filename.filepath for filename in partition_filenames] - - print("Searching %d Files", len(filenames_list)) - - taskid = util.launch_GNPS_librarysearchworkflow(filenames_list, "ReDU-MS2 Global Analysis Populate %d of %d" % (i, PARALLISM), \ - credentials.USERNAME, credentials.PASSWORD, "miw023@ucsd.edu") - - print(taskid) - + taskid = util.launch_GNPS_librarysearchworkflow(filenames, "ReDU-MS2 Global Analysis Populate %d of %d" % (id, parallelism), \ + credentials.USERNAME, credentials.PASSWORD, "christineaceves22@gmail.com") + task_id_list.append(taskid) - + df = pd.DataFrame() df["taskid"] = task_id_list - df.to_csv("./database/global_tasks.tsv", sep="\t", index=False) + df.to_csv("../database/global_tasks.tsv", sep="\t", index=False) if __name__ == '__main__': - main() + parse_metabatch_dump() diff --git a/code/util.py b/code/util.py new file mode 100644 index 0000000..ee2a68f --- /dev/null +++ b/code/util.py @@ -0,0 +1,396 @@ +import os +from app import app +import ftputil +import credentials +import json +import requests +from werkzeug.utils import secure_filename + +ALLOWED_EXTENSIONS = set(['mgf', 'mzxml', 'mzml', 'csv', 'txt', 'raw', 'msp']) + + +def allowed_file(filename): + return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS + + +def thermoconvert_localfile(input_filename, save_dir): + extension = input_filename.rsplit('.', 1)[-1].lower() + print(extension) + + """Do Nothing""" + if extension != "raw": + return input_filename + + """Perform Conversion""" + cmd = "mono /src/bin/x64/Debug/ThermoRawFileParser.exe -i=%s -o=%s -f=1" % (input_filename, save_dir) + os.system(cmd) + os.remove(input_filename) + output_filename = os.path.join(save_dir, os.path.basename(input_filename).replace(".raw", ".mzML")) + return output_filename + +def upload_single_file(request, group): + sessionid = request.cookies.get('sessionid') + + filename = "" + + if 'file' not in request.files: + return "{}" + request_file = request.files['file'] + + return upload_single_file_push(request_file, sessionid, group) + +def upload_single_file_push(request_file, uuid_folder, collection_name): + if request_file.filename == '': + return "{}" + if request_file and allowed_file(request_file.filename): + filename = secure_filename(request_file.filename) + save_dir = os.path.join(app.config['UPLOAD_FOLDER'], uuid_folder, collection_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + local_filename = os.path.join(save_dir, filename) + request_file.save(local_filename) + + """If we need to convert raw file, we do it here""" + local_filename = thermoconvert_localfile(local_filename, save_dir) + + #Uploading to FTP + upload_to_gnps(local_filename, uuid_folder, collection_name) + + #Remove local file + os.remove(local_filename) + else: + print("not allowed") + return json.dumps({"status": "Invalid File Type"}) + + return json.dumps({"filename": filename}) + + +def check_ftp_folders(username): + url = "ccms-ftp01.ucsd.edu" + present_folders = [] + + with ftputil.FTPHost(url, credentials.USERNAME, credentials.PASSWORD) as ftp_host: + names = ftp_host.listdir(ftp_host.curdir) + if not username in names: + return present_folders + + ftp_host.chdir(username) + + return ftp_host.listdir(ftp_host.curdir) + + return present_folders + + +def upload_to_gnps(input_filename, folder_for_spectra, group_name, username=credentials.USERNAME, password=credentials.PASSWORD): + url = "ccms-ftp01.ucsd.edu" + + with ftputil.FTPHost(url, username, password) as ftp_host: + names = ftp_host.listdir(ftp_host.curdir) + try: + if not folder_for_spectra in names: + print("MAKING DIR") + ftp_host.mkdir(folder_for_spectra) + except: + print("Cannot Make Folder", folder_for_spectra) + + ftp_host.chdir(folder_for_spectra) + try: + if not group_name in ftp_host.listdir(ftp_host.curdir): + print("MAKING Group DIR") + ftp_host.mkdir(group_name) + except: + print("Cannot Make Folder", group_name) + ftp_host.chdir(group_name) + + ftp_host.upload(input_filename, os.path.basename(input_filename)) + +def get_classic_networking_lowres_parameters(): + invokeParameters = {} + invokeParameters["workflow"] = "METABOLOMICS-SNETS-V2" + invokeParameters["protocol"] = "None" + invokeParameters["workflow_version"] = "release_22" + invokeParameters["library_on_server"] = "d.speclibs;" + invokeParameters["tolerance.PM_tolerance"] = "2.0" + invokeParameters["tolerance.Ion_tolerance"] = "0.5" + invokeParameters["PAIRS_MIN_COSINE"] = "0.70" + invokeParameters["MIN_MATCHED_PEAKS"] = "6" + invokeParameters["TOPK"] = "10" + invokeParameters["CLUSTER_MIN_SIZE"] = "2" + invokeParameters["RUN_MSCLUSTER"] = "on" + invokeParameters["MAXIMUM_COMPONENT_SIZE"] = "100" + invokeParameters["MIN_MATCHED_PEAKS_SEARCH"] = "6" + invokeParameters["SCORE_THRESHOLD"] = "0.7" + invokeParameters["ANALOG_SEARCH"] = "0" + invokeParameters["MAX_SHIFT_MASS"] = "100.0" + invokeParameters["FILTER_STDDEV_PEAK_datasetsINT"] = "0.0" + invokeParameters["MIN_PEAK_INT"] = "0.0" + invokeParameters["FILTER_PRECURSOR_WINDOW"] = "1" + invokeParameters["FILTER_LIBRARY"] = "1" + invokeParameters["WINDOW_FILTER"] = "1" + invokeParameters["CREATE_CLUSTER_BUCKETS"] = "1" + invokeParameters["CREATE_ILI_OUTPUT"] = "0" + invokeParameters["FILTER_G6_BLANKS"] = "0" + + + return invokeParameters + +def get_classic_networking_highres_parameters(): + invokeParameters = {} + invokeParameters["workflow"] = "METABOLOMICS-SNETS-V2" + invokeParameters["protocol"] = "None" + invokeParameters["workflow_version"] = "release_22" + invokeParameters["library_on_server"] = "d.speclibs;" + invokeParameters["tolerance.PM_tolerance"] = "0.05" + invokeParameters["tolerance.Ion_tolerance"] = "0.05" + invokeParameters["PAIRS_MIN_COSINE"] = "0.70" + invokeParameters["MIN_MATCHED_PEAKS"] = "6" + invokeParameters["TOPK"] = "10" + invokeParameters["CLUSTER_MIN_SIZE"] = "2" + invokeParameters["RUN_MSCLUSTER"] = "on" + invokeParameters["MAXIMUM_COMPONENT_SIZE"] = "100" + invokeParameters["MIN_MATCHED_PEAKS_SEARCH"] = "6" + invokeParameters["SCORE_THRESHOLD"] = "0.7" + invokeParameters["ANALOG_SEARCH"] = "0" + invokeParameters["MAX_SHIFT_MASS"] = "100.0" + invokeParameters["FILTER_STDDEV_PEAK_datasetsINT"] = "0.0" + invokeParameters["MIN_PEAK_INT"] = "0.0" + invokeParameters["FILTER_PRECURSOR_WINDOW"] = "1" + invokeParameters["FILTER_LIBRARY"] = "1" + invokeParameters["WINDOW_FILTER"] = "1" + invokeParameters["CREATE_CLUSTER_BUCKETS"] = "1" + invokeParameters["CREATE_ILI_OUTPUT"] = "0" + invokeParameters["FILTER_G6_BLANKS"] = "0" + + return invokeParameters + +def launch_GNPS_workflow(ftp_path, job_description, username, password, groups_present, email, preset): + invokeParameters = {} + + if preset == "LOWRES": + invokeParameters = get_classic_networking_lowres_parameters() + elif preset == "HIGHRES": + invokeParameters = get_classic_networking_highres_parameters() + else: + return "Error No Preset" + + invokeParameters["desc"] = job_description + invokeParameters["spec_on_server"] = "d." + ftp_path + "/G1;" + if "G2" in groups_present: + invokeParameters["spec_on_server_group2"] = "d." + ftp_path + "/G2;" + if "G3" in groups_present: + invokeParameters["spec_on_server_group3"] = "d." + ftp_path + "/G3;" + + invokeParameters["email"] = email + + + task_id = invoke_workflow("gnps.ucsd.edu", invokeParameters, username, password) + + return task_id + +def launch_GNPS_featurenetworking_workflow(ftp_path, job_description, username, password, email, featuretool, present_folders, preset): + invokeParameters = {} + + if preset == "LOWRES": + invokeParameters = get_featurenetworking_lowres_parameters() + elif preset == "HIGHRES": + invokeParameters = get_featurenetworking_highres_parameters() + else: + return "Error No Preset" + + #Specific Parameters Update + invokeParameters["desc"] = job_description + + invokeParameters["quantification_table"] = "d." + ftp_path + "/featurequantification;" + invokeParameters["spec_on_server"] = "d." + ftp_path + "/featurems2;" + if "samplemetadata" in present_folders: + invokeParameters["metadata_table"] = "d." + ftp_path + "/samplemetadata;" + + #Quant + invokeParameters["QUANT_TABLE_SOURCE"] = featuretool + + #Additional Pairs + if "additionalpairs" in present_folders: + invokeParameters["additional_pairs"] = "d." + ftp_path + "/additionalpairs;" + + invokeParameters["email"] = email + + task_id = invoke_workflow("gnps.ucsd.edu", invokeParameters, username, password) + + return task_id + +def get_featurenetworking_lowres_parameters(): + invokeParameters = {} + invokeParameters["workflow"] = "FEATURE-BASED-MOLECULAR-NETWORKING" + invokeParameters["protocol"] = "None" + invokeParameters["workflow_version"] = "release_27" + invokeParameters["desc"] = "Job Description" + invokeParameters["library_on_server"] = "d.speclibs;" + + #Networking + invokeParameters["tolerance.PM_tolerance"] = "2.0" + invokeParameters["tolerance.Ion_tolerance"] = "0.5" + invokeParameters["PAIRS_MIN_COSINE"] = "0.70" + invokeParameters["MIN_MATCHED_PEAKS"] = "6" + invokeParameters["TOPK"] = "10" + invokeParameters["MAX_SHIFT"] = "500" + + #Network Pruning + invokeParameters["MAXIMUM_COMPONENT_SIZE"] = "100" + + #Library Search + invokeParameters["MIN_MATCHED_PEAKS_SEARCH"] = "6" + invokeParameters["SCORE_THRESHOLD"] = "0.7" + invokeParameters["TOP_K_RESULTS"] = "1" + invokeParameters["ANALOG_SEARCH"] = "0" + invokeParameters["MAX_SHIFT_MASS"] = "100.0" + invokeParameters["FILTER_STDDEV_PEAK_datasetsINT"] = "0.0" + invokeParameters["MIN_PEAK_INT"] = "0.0" + invokeParameters["FILTER_PRECURSOR_WINDOW"] = "1" + invokeParameters["FILTER_LIBRARY"] = "1" + invokeParameters["WINDOW_FILTER"] = "1" + + #Quant + invokeParameters["QUANT_TABLE_SOURCE"] = "" + invokeParameters["GROUP_COUNT_AGGREGATE_METHOD"] = "Mean" + invokeParameters["QUANT_FILE_NORM"] = "RowSum" + + # Stats + invokeParameters["RUN_STATS"] = "No" + invokeParameters["METADATA_COLUMN"] = "None" + invokeParameters["METADATA_COLUMN_FACET"] = "None" + invokeParameters["METADATA_CONDITION_ONE"] = "None" + invokeParameters["METADATA_CONDITION_TWO"] = "None" + + #External tools + invokeParameters["RUN_DEREPLICATOR"] = "1" + + # Qiime2 + invokeParameters["QIIME2_PCOA_DISTANCE"] = "cosine" + + # Metadata + invokeParameters["googlesheetsmetadata"] = "None" + + invokeParameters["email"] = "ccms.web@gmail.com" + invokeParameters["uuid"] = "1DCE40F7-1211-0001-979D-15DAB2D0B500" + + return invokeParameters + +def get_featurenetworking_highres_parameters(): + invokeParameters = {} + invokeParameters["workflow"] = "FEATURE-BASED-MOLECULAR-NETWORKING" + invokeParameters["protocol"] = "None" + invokeParameters["workflow_version"] = "release_27" + invokeParameters["desc"] = "Job Description" + invokeParameters["library_on_server"] = "d.speclibs;" + + #Networking + invokeParameters["tolerance.PM_tolerance"] = "0.05" + invokeParameters["tolerance.Ion_tolerance"] = "0.05" + invokeParameters["PAIRS_MIN_COSINE"] = "0.70" + invokeParameters["MIN_MATCHED_PEAKS"] = "6" + invokeParameters["TOPK"] = "10" + invokeParameters["MAX_SHIFT"] = "500" + + #Network Pruning + invokeParameters["MAXIMUM_COMPONENT_SIZE"] = "100" + + #Library Search + invokeParameters["MIN_MATCHED_PEAKS_SEARCH"] = "6" + invokeParameters["SCORE_THRESHOLD"] = "0.7" + invokeParameters["TOP_K_RESULTS"] = "1" + invokeParameters["ANALOG_SEARCH"] = "0" + invokeParameters["MAX_SHIFT_MASS"] = "100.0" + invokeParameters["FILTER_STDDEV_PEAK_datasetsINT"] = "0.0" + invokeParameters["MIN_PEAK_INT"] = "0.0" + invokeParameters["FILTER_PRECURSOR_WINDOW"] = "1" + invokeParameters["FILTER_LIBRARY"] = "1" + invokeParameters["WINDOW_FILTER"] = "1" + + #Quant + invokeParameters["QUANT_TABLE_SOURCE"] = "" + invokeParameters["GROUP_COUNT_AGGREGATE_METHOD"] = "Mean" + invokeParameters["QUANT_FILE_NORM"] = "RowSum" + + # Stats + invokeParameters["RUN_STATS"] = "No" + invokeParameters["METADATA_COLUMN"] = "None" + invokeParameters["METADATA_COLUMN_FACET"] = "None" + invokeParameters["METADATA_CONDITION_ONE"] = "None" + invokeParameters["METADATA_CONDITION_TWO"] = "None" + + #External tools + invokeParameters["RUN_DEREPLICATOR"] = "1" + + # Qiime2 + invokeParameters["QIIME2_PCOA_DISTANCE"] = "cosine" + + # Metadata + invokeParameters["googlesheetsmetadata"] = "None" + + invokeParameters["email"] = "ccms.web@gmail.com" + invokeParameters["uuid"] = "1DCE40F7-1211-0001-979D-15DAB2D0B500" + + return invokeParameters + +#set for redu library search +def get_librarysearch_parameters(): + invokeParameters = {} + invokeParameters["ANALOG_SEARCH"] = "0" + invokeParameters["FILTER_LIBRARY"] = "1" + invokeParameters["FILTER_PRECURSOR_WINDOW"] = "1" + invokeParameters["FILTER_SNR_PEAK_INT"] = "0.0" + invokeParameters["FILTER_STDDEV_PEAK_INT"] = "0.0" + invokeParameters["MAX_SHIFT_MASS"] = "100.0" + invokeParameters["MIN_MATCHED_PEAKS"]= "6" + invokeParameters["MIN_PEAK_INT"] = "0.0" + invokeParameters["SCORE_THRESHOLD"] = "0.7" + invokeParameters["SEARCH_LIBQUALITY"]="3" + invokeParameters["TOP_K_RESULTS"]="1" + invokeParameters["WINDOW_FILTER"]="1" + invokeParameters["library_on_server"] = "d.speclibs;" + invokeParameters["reanalyzed_datasets"]="" + invokeParameters["tolerance.Ion_tolerance"] = "0.5" + invokeParameters["tolerance.PM_tolerance"]="2.0" + invokeParameters["workflow"] = "MOLECULAR-LIBRARYSEARCH-V2" + invokeParameters["workflow_version"] = "release_10.1" + + return(invokeParameters) + +def launch_GNPS_librarysearchworkflow(filenames_list, description, username, password, email): + invokeParameters = {} + invokeParameters = get_librarysearch_parameters() + + invokeParameters["desc"] = description + invokeParameters["spec_on_server"] = filenames_list + invokeParameters["email"] = email + + task_id = invoke_workflow("gnps.ucsd.edu", invokeParameters, username, password) + + return task_id + + +def invoke_workflow(base_url, parameters, login, password): + username = login + password = password + + s = requests.Session() + + payload = { + 'user' : username, + 'password' : password, + 'login' : 'Sign in' + } + + r = s.post('https://' + base_url + '/ProteoSAFe/user/login.jsp', data=payload, verify=False) + r = s.post('https://' + base_url + '/ProteoSAFe/InvokeTools', data=parameters, verify=False) + task_id = r.text + + import sys + print(r.text, file=sys.stderr, flush=True) + + if len(task_id) > 4 and len(task_id) < 60: + print("Launched Task: : " + r.text) + return task_id + else: + print(task_id) + return None diff --git a/database/global_tasks.tsv b/database/global_tasks.tsv index aca2ff5..6e83c45 100644 --- a/database/global_tasks.tsv +++ b/database/global_tasks.tsv @@ -1,42 +1,47 @@ taskid -ed84d61e46784541a96f6abf95dcaa99 -3145a858f95f4e8cb4587d1808a157a2 -8cc0334c53404374ae68e27fba0851e5 -d23f32e91b054d39a2520eaf03e16e0f -22963ee0fc6f4478aa30e724565b16eb -54244ae106ed4997a4ff2f7c3066bb8a -95235e2f21e7415ba35b6578712460c2 -75fda147b62f447f8d05a31b40ce95ac -e4690d072a4c47af805a1c16bbaff7b3 -422960a5b5174455a9a1f94ce19d8831 -00c6b75d153541dda3862485776f2b9b -4314039619fd4f4eabffb62b3e8bedb3 -5af0186046484ba7b1c54d900c989781 -d2680660eea8464fb463b718cc46e4a2 -819903afed3747c8888848a156077594 -d5cfcfd5d1484aecb0d8ae102541ab14 -4ad7f71eca6840239e5d5f6cc9a57fb0 -bf75124cd2984431b301ed7cb5d012b4 -509d7561d1fc4e6e99fa0f2deb2a3a8e -96929b4d8fbe428284a179dfe2644670 -1062d943a67a470e8e49cacbe198cb78 -5c19fef49a504e21b5e7402930ecf6a0 -d7ae82f1d2bc429395e08eac087cd002 -364de829b0c44009a574ec01a424fe79 -715c0edf2c0346288707a341dbf030d2 -b2de244db93b4e7bba98e61a4a409780 -65bd3f2301694ab6b00a4e7854e7939f -67f2fd340aa842a5829cc5bdb7a8c12f -96a2fe954dae413dac3b179ccf61908c -f5ed45429db24531a64c50df57c7daa9 -ce4a373914744423871a58111ac4cb2c -0d90469436ad44dd921a1457d32f727b -f0249caee4da40c7987ed30e6764b091 -432b3ac467204b37b45966f2903efafd -0774c3a5840b4bd3af796e4e450a3a84 -616cdd14484b46ce8f03d63505adb17e -7ab781bff3824eda8daa819fd601cb41 -0188282a9c5c446fa5f3f75baa7617c3 -56cd01b19281448899855eaf118fa254 -69220799e6cb40a58b5b7d263bd45894 -1b84ff0cb3d94e1680c4249450222d3f \ No newline at end of file +67769681881643509a2bad20e2877d71 +b21ac7596dd64bd1822b67981a5cf93d +e0379c6deef046408e1b2dbdb024d022 +2d75a36d6028473e80044176035e84c8 +48937b8deb37463c82852c8ea29385ba +7ff012016c32499aaf7333c0dc0baabf +0bab99e4f2724ed8b3f0b6d4249307f5 +44a7a6157333419ab8d428181a981579 +39b8280a74594b5ab10dfd53db9f7d59 +e4acbcc58112466ab1602daafd4652cd +ea30a36a79aa4c5fa410905ced18d2cd +3c6b13a76f024888b72985869a89b6c3 +d500055517bd4c16bfa7df0a13f0b5b4 +ef47f63e4eff4f408cec3694bd6eedc1 +f32c67d3e9164120bc0f362600db08dd +96822c9eb005477a9d03db10e32566f9 +40cc9ae5faf84bc78aedad32fd440806 +4ef545e6576d4e2fa0c8756573a46ce1 +50aedeebb9874442bca9d328199c797d +21ce388dbd1a433a9bf68a6bd6b2ec35 +64ef1afd092f4ca0a988c40b1fe4a1cb +c6f0c5f7145045f7870bf17230f06528 +2b3b198deaf349639375291ed1be7bc7 +c33cedc6773f4a1681974a63c47bf2b8 +ba83773e23f34acfb6806eeaa25c624c +253ee8a33a1443bc98d21c5cf7fa12a4 +96446a7b9ac84df0a7b19e6ca1edaddb +eac60cf67d9a4f67841527f44aefa3a7 +8c3c6083678f4e69ad18a6e709ac4344 +a820966db13c4275a0c3de43b08836a8 +84a1bfc513d943a6913dbf66fe9b00fa +a46cdb6292fb435dbf6c3e4c5c42b70f +9dec7dbc74714e1ab336bd3dc03d0c52 +a76c12b04bae4643a0a6ec6e161c24f0 +210144ee2ffb44dba93668c8ff8baa1d +8c355aed8c28484da78de18d4665e098 +e3544909887c4596a7edea2a586ff436 +f2c15c91fa764f4e84bd868d4fa01108 +c53a8ee7d93f40dfb520f50ee686615f +c319852ab0064ff2b56cb72fb92ecf3e +bd5fc96dc94a46d5b5c5b61cd93eed22 +4817387d2cc643748f3ef08b52d68e92 +01cab54f6d8442538538b9a40419b6e5 +85eab2fc87894323804b29bb6bf9a58d +e08a3c2f6812427a810f58707a888fd3 +78a43204dd4b45f08961a966c003c357