Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ To simulate selenium and production integration tests:
One of the key steps in ReDU is updating the database to include the latest identifications for files within ReDU. The steps are as follows:

1. Download the batch template from GNPS at ```https://redu.ucsd.edu/metabatchdump```
1. Run Batch Workflow for Spectral Library Search
1. Run [Batch Workflow for Spectral Library Search](https://github.com/mwang87/ReDU-MS2-GNPS/blob/master/code/search_all_data.py)
1. Get the set of tasks as tsv and save to [here](https://github.com/mwang87/ReDU-MS2-GNPS/blob/master/database/global_tasks.tsv).
1. Remove database [here](https://github.com/mwang87/ReDU-MS2-GNPS/tree/master/database)
1. Remove all untracked files in temp, this will be for the global pca
Expand Down
31 changes: 14 additions & 17 deletions code/search_all_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,26 @@
import credentials
from models import *

def parse_metabatch_dump():
    """Submit one GNPS library-search workflow per file in the metabatch dump.

    Reads ``../database/metabatchdump.tsv`` (expected to have at least
    ``filename`` and ``id`` columns — TODO confirm schema against the
    /metabatchdump endpoint), launches one workflow per row via
    ``util.launch_GNPS_librarysearchworkflow``, and writes the collected
    task IDs to ``../database/global_tasks.tsv`` as a one-column TSV.
    """
    metabatch_filename = '../database/metabatchdump.tsv'
    all_filenames = pd.read_table(metabatch_filename)

    task_id_list = []
    # Total number of rows; used only to label each job "N of M".
    parallelism = len(all_filenames)

    # Row index from iterrows() is unused, so discard it explicitly.
    for _, row in all_filenames.iterrows():
        filenames = row.filename
        file_id = row.id  # renamed from `id` to avoid shadowing the builtin

        taskid = util.launch_GNPS_librarysearchworkflow(
            filenames,
            "ReDU-MS2 Global Analysis Populate %d of %d" % (file_id, parallelism),
            credentials.USERNAME,
            credentials.PASSWORD,
            "[email protected]")

        print(taskid)

        task_id_list.append(taskid)

    # Persist all launched task IDs so they can be committed to
    # database/global_tasks.tsv (see README update steps).
    df = pd.DataFrame()
    df["taskid"] = task_id_list
    df.to_csv("../database/global_tasks.tsv", sep="\t", index=False)

if __name__ == '__main__':
    # Script entry point: process the metabatch dump end to end.
    parse_metabatch_dump()
Loading