diff --git a/.gitignore b/.gitignore index b1cb160..767acfe 100644 --- a/.gitignore +++ b/.gitignore @@ -157,5 +157,5 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/
diff --git a/flask_app/app.py b/flask_app/app.py
new file mode 100644
index 0000000..d64e92b
--- /dev/null
+++ b/flask_app/app.py
@@ -0,0 +1,98 @@
+import os,sys
+
+sys.path.append('../')
+sys.path.append('../library_spectra_validation')
+
+from flask import Flask
+from flask import request, render_template, redirect, url_for
+
+import pandas as pd
+import numpy as np
+from matplotlib.figure import Figure
+
+from library_spectra_validation.library_handler import LibraryHandler
+
+
+app = Flask(__name__)
+app.config.from_pyfile("config.py")
+
+
+def _safe_upload_name(filename):
+    """Return the bare file name of a client-supplied upload name, or ''.
+
+    Directory components (with either separator) are dropped so the saved
+    file cannot be written outside UPLOAD_FOLDER. Names that are empty or
+    that denote a directory ('.', '..') yield an empty string.
+    """
+    name = os.path.basename((filename or '').replace('\\', '/'))
+    return '' if name in ('', '.', '..') else name
+
+
+@app.route('/')
+def index():
+    """Render the landing page."""
+    return render_template('index.html')
+
+
+@app.route('/upload', methods=['GET','POST'])
+def upload():
+    """Show the upload form, or store an uploaded library and preview it.
+
+    GET, or a POST without a usable file, renders the upload form. A POST
+    with a file saves it under UPLOAD_FOLDER, loads it with LibraryHandler
+    and renders its spectra as an HTML table in preview.html.
+    """
+    if request.method == 'POST':
+        file = request.files.get('file')
+        # The client controls file.filename; never join it to a path unchecked.
+        filename = _safe_upload_name(file.filename) if file else ''
+        if filename:
+            fpath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
+            file.save(fpath)
+            library_handler = LibraryHandler(fpath)
+            df_spectra = pd.DataFrame({"spectrum": library_handler.spectra}) #TODO
+            # preview.html renders the template variables `name` and `data`.
+            return render_template("preview.html", name=filename, data=df_spectra.to_html())
+    return render_template('upload_data.html')
+
+
+@app.route('/preview', methods=['POST'])
+def preview():
+    """Render one sheet of an uploaded Excel workbook as an HTML table."""
+    sheet_name = request.form['sheet']
+    df = pd.read_excel(request.files['file'], sheet_name=sheet_name)
+    # preview.html renders `name` and `data`; it does not read any other variables.
+    return render_template('preview.html', name=sheet_name, data=df.to_html())
+
+# @app.route('/plot_spectrum', methods=['POST'])
+# def plot_spectrum():
+# # TODO plotly??
+# cmp_selector = request.form['compound_name']
+# cmp_id = cmp_list.index(cmp_selector)
+# cmp_smile = df_spectra.loc[cmp_id]["smiles"]
+
+# plt_spectrum = spectra[cmp_id]
+
+# fig, axs = plt.subplots(1, 2, figsize=(12.8, 4.2), gridspec_kw={'width_ratios': [2, 5]}, sharey=False)
+# cmp_img = Chem.Draw.MolToImage(Chem.MolFromSmiles(cmp_smile), ax=axs[0])
+
+# axs[0].grid(False)
+# axs[0].tick_params(axis='both', bottom=False, labelbottom=False, left=False, labelleft=False)
+# axs[0].set_title(cmp_smile)
+# axs[0].imshow(cmp_img)
+# axs[0].axis("off")
+
+# plot_spectrum(plt_spectrum, axs[1])
+
+# # Save the plot to a temporary file or convert it to a base64 string to embed in HTML
+# # Example: plt.savefig('static/plot.png')
+# # Pass the path or base64 string to the template
+# return render_template('plot_spectrum.html', plot_path='static/plot.png')
+
+@app.route('/about')
+def about():
+    """Render the about page."""
+    return render_template('about.html')
+
+if __name__ == '__main__':
+    app.run(debug=True)
\ No newline at end of file
diff --git a/flask_app/config.py b/flask_app/config.py new file mode 100644 index 0000000..96f9a31 --- /dev/null +++ b/flask_app/config.py @@ -0,0 +1 @@ +UPLOAD_FOLDER = 'resources' \ No newline at end of file diff --git a/flask_app/resources/Broken_records.txt b/flask_app/resources/Broken_records.txt new file mode 100644 index 0000000..9df55e1 --- /dev/null +++ b/flask_app/resources/Broken_records.txt @@ -0,0 +1,19 @@ +I clean all the 12 mass spectra in the "test_case_correct.mgf", in terms of +--> adding inchikey +--> cleaning the inchi +--> adding formula + +So all mass spectra in the "test_case_wrong.mgf" are missing formula and inchikey + +mass spectrum 1: no change +mass spectrum 2: wrong adduct +mass spectrum 3: wrong pepmass (precursor) +mass spectrum 4: wrong smiles +mass spectrum 5: missing adduct +mass spectrum 6: no change (share the same compound name as mass spectrum 5, but different adduct) +mass spectrum 7: missing adduct +mass spectrum 8: missing 
adduct +mass spectrum 9: no change +mass spectrum 10: missing compound name and adduct +mass spectrum 11: no change +mass spectrum 12: wrong inchi \ No newline at end of file diff --git a/flask_app/resources/test_case_correct.mgf b/flask_app/resources/test_case_correct.mgf new file mode 100644 index 0000000..0f48a7f --- /dev/null +++ b/flask_app/resources/test_case_correct.mgf @@ -0,0 +1,725 @@ +BEGIN IONS +ID=1 +PEPMASS=181.051 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/negative/GNPS00001_E3_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME=HYDROXYPHENYLLACTATE +ADDUCT=M-H +FORMULA=C9H10O4 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=OC(CC1=CC=C(O)C=C1)C(O)=O +INCHI="1S/C9H10O4/c10-7-3-1-6(2-4-7)5-8(11)9(12)13/h1-4,8,10-11H,5H2,(H,12,13)" +INCHIKEY=JVGVDSSUAVXRDY-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463540 +SCANS=18 +57.993649 2301.143799 +58.669075 2842.239258 +68.020729 2971.482422 +70.564552 2513.018311 +72.878845 2689.933105 +72.975563 3853.803467 +72.991684 718360.75 +73.995079 12563.180664 +75.007271 3571.86499 +77.404732 2682.560059 +81.033073 4734.676758 +81.774086 2626.58374 +92.918541 5914.78418 +92.992386 76993.929688 +93.033134 41289.710938 +99.857903 2497.19165 +101.239021 2791.706299 +105.356773 3432.287598 +105.568924 3190.02417 +106.041046 4729.494629 +107.048927 59435.523438 +109.02813 5910.954102 +112.984322 45298.136719 +117.033318 5821.436035 +118.203209 2873.571289 +119.015236 4521.074707 +119.048965 1540851.625 +119.084404 5451.431152 +120.052269 105341.335938 +121.028297 18331.339844 +121.108994 2778.724121 +133.028061 3424.031738 +134.031036 5080.327637 +134.036118 83965.585938 +134.04097 5610.906738 +134.998993 3203.721924 +135.00325 4971.822266 +135.043961 2215514.25 +135.087067 6798.085938 +135.733582 2684.214844 +136.047302 150738.0625 +136.982407 174019.640625 +137.985413 4226.324707 +142.010696 3147.537842 
+147.589081 4205.175781 +148.523163 5009.87207 +152.916489 5558.07959 +162.985809 8731.754883 +163.039062 3223728.5 +163.096542 9567.081055 +164.042374 236031.921875 +172.091812 3181.887695 +179.034134 8333.552734 +180.911652 4656.253906 +180.972595 24925.556641 +180.987289 4149.787598 +181.049759 1808479.75 +181.116394 6630.770508 +182.053116 139884.703125 +200.736801 3475.613037 +END IONS + + + +BEGIN IONS +ID=2 +PEPMASS=123.08 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/positive/GNPS00001_C10_p.mzXML +SEQ=*..* +IONMODE=Positive +ORGANISM=GNPS-MSMLS +NAME=1-PHENYLETHANOL +ADDUCT=M+H +FORMULA=C8H10O +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=CC(O)C1=CC=CC=C1 +INCHI="1S/C8H10O/c1-7(9)8-5-3-2-4-6-8/h2-7,9H,1H3" +INCHIKEY=WAPNOHKVXSQRPX-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463542 +SCANS=9 +50.09074 1563.576538 +51.847359 1482.95874 +52.35796 1460.236206 +55.054832 4853.48291 +56.050049 1694.615234 +57.053383 2707.153564 +57.070385 3209.65625 +64.927635 2554.697266 +67.054626 24782.671875 +70.341408 1578.040405 +78.997322 2244.730469 +79.054504 6207.743164 +80.99279 2491.513672 +81.070122 27285.521484 +85.559135 2069.354004 +91.039093 37482.609375 +93.070007 3839.950195 +95.049271 6394.27832 +95.08564 33426.289062 +97.00779 14587.337891 +99.003365 33937.394531 +105.070114 4827.084473 +105.073883 4377.217285 +106.028793 2912.144043 +106.943672 3527.098633 +108.056923 3015.60376 +108.866722 1828.277222 +110.307365 1760.50647 +111.023346 5263.312012 +113.018959 24370.855469 +113.90287 1687.043091 +122.09655 4319.949707 +123.043922 4615.996094 +123.055367 13869.549805 +123.080566 637932.5 +123.099785 25931.980469 +123.116882 17160.033203 +123.964729 4258.138184 +124.084068 3809.735352 +125.038429 2065.710205 +END IONS + + +BEGIN IONS +ID=3 +PEPMASS=341.109 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap 
+FILENAME=MSV000084072/peak/negative/GNPS00001_G7_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME=MELIBIOSE +ADDUCT=M-H +FORMULA=C12H22O11 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=OC[C@H]1O[C@H](OC[C@H]2OC(O)[C@H](O)[C@@H](O)[C@@H]2O)[C@H](O)[C@@H](O)[C@H]1O +INCHI="1S/C12H22O11/c13-1-3-5(14)8(17)10(19)12(23-3)21-2-4-6(15)7(16)9(18)11(20)22-4/h3-20H,1-2H2/t3-,4-,5+,6-,7+,8+,9-,10-,11?,12+/m1/s1" +INCHIKEY=DLRVVLDZNNYCBX-ZZFZYMBESA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463543 +SCANS=20 +50.540623 1971.973022 +53.1106 2001.842529 +59.012436 48294.171875 +66.468216 2272.426514 +67.171623 2172.375 +71.012428 33443.402344 +71.837288 2314.537598 +75.025131 2055.233398 +85.028244 3493.986084 +87.00737 2498.182617 +89.02301 46724.976562 +101.023033 29916.160156 +113.02298 15779.530273 +119.033714 7765.674316 +127.43145 2624.237549 +136.582092 2697.059326 +138.292267 2471.99292 +143.03392 2970.882812 +161.044678 5681.307617 +179.055206 12105.952148 +212.821136 2522.489258 +221.066132 20078.255859 +223.95575 2675.333252 +227.882217 2573.143311 +230.085648 2318.889404 +232.42157 2854.153076 +249.285477 2839.33252 +256.436523 2698.845215 +300.400299 2575.356934 +332.259033 2550.45874 +END IONS + + +BEGIN IONS +ID=4 +PEPMASS=179.056 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/negative/GNPS00001_H2_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME=PSICOSE +ADDUCT=M-H +FORMULA=C6H12O6 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=C1C(C(C(C(O1)(CO)O)O)O)O +INCHI="1S/C6H12O6/c7-2-6(11)5(10)4(9)3(8)1-12-6/h3-5,7-11H,1-2H2" +INCHIKEY=LKDRXBCSQODPBY-JDJSBBGDSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463544 +SCANS=23 +50.752808 1545.243286 +59.012474 19256.941406 +60.720825 2136.356934 +61.612026 1596.15271 +71.012405 15034.151367 +76.279961 1849.901611 +79.870834 1799.16272 +89.023018 
10720.604492 +90.996696 14408.519531 +91.915749 1874.651733 +92.994606 2727.983643 +93.787796 1993.916138 +102.502869 2168.821777 +110.237366 1885.488647 +113.023308 2582.52124 +122.958046 7717.575684 +134.966919 4371.566406 +134.986786 32977.765625 +139.372147 2003.396973 +140.994339 2101.980225 +143.275223 2098.721191 +149.183472 2278.868652 +150.952957 36120.242188 +151.434525 1920.847046 +158.950302 7290.147461 +178.956345 3667.172363 +178.977158 5032.843262 +197.86763 2062.096924 +198.909378 1795.668457 +199.788757 2060.966553 +END IONS + + +BEGIN IONS +ID=5 +PEPMASS=172.098 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/negative/GNPS00001_C1_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME=N-ACETYLLEUCINE +ADDUCT=M-H +FORMULA=C8H15NO3 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=CC(C)C[C@H](NC(C)=O)C(O)=O +INCHI="1S/C8H15NO3/c1-5(2)4-7(8(11)12)9-6(3)10/h5,7H,4H2,1-3H3,(H,9,10)(H,11,12)/t7-/m0/s1" +INCHIKEY=WXNXCEHXYPACJF-ZETCQYMHSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463545 +SCANS=17 +50.494568 2305.545166 +51.01384 2171.879883 +51.815498 2572.902344 +51.995987 2339.180176 +52.061226 2207.783203 +52.57711 2512.306152 +55.847023 2212.564697 +58.028439 87037.296875 +60.502853 2177.543945 +60.521988 2473.28418 +63.7416 2167.294678 +66.966057 2456.80835 +69.75647 2647.145264 +73.483582 2592.871826 +82.064796 34505.539062 +83.57148 2632.96167 +84.080338 6847.935059 +85.034073 3291.934326 +85.425201 2819.0354 +93.145638 2523.094238 +98.150711 3326.976807 +100.950996 10970.250977 +104.952477 4165.535156 +110.096161 5159.470215 +111.080231 16679.011719 +116.945961 17610.503906 +128.106979 256260.953125 +128.876358 4115.287598 +128.946121 11519.360352 +129.110062 22094.078125 +130.046829 33868.449219 +130.086136 9670261.0 +130.128036 19900.978516 +131.083527 30412.302734 +131.089355 600183.375 +144.941299 4816.934082 +154.086502 5185.08252 +154.94632 
40799.105469 +172.09697 1241461.125 +172.153442 3612.325439 +172.957062 7246.802246 +172.990631 5766.550781 +173.081223 5280.883789 +173.100342 119199.460938 +END IONS + + +BEGIN IONS +ID=6 +PEPMASS=367.185 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/negative/GNPS00001_C1_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME=N-ACETYLLEUCINE +ADDUCT=2M-2H+Na +FORMULA=C8H15NO3 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=CC(C)C[C@H](NC(C)=O)C(O)=O +INCHI="1S/C8H15NO3/c1-5(2)4-7(8(11)12)9-6(3)10/h5,7H,4H2,1-3H3,(H,9,10)(H,11,12)/t7-/m0/s1" +INCHIKEY=WXNXCEHXYPACJF-ZETCQYMHSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463546 +SCANS=29 +50.182331 1945.575806 +54.236893 2076.633789 +54.998264 2017.147461 +56.697639 2070.655029 +56.92223 2073.952148 +58.39571 2177.846924 +60.040657 2549.327148 +61.123554 2238.305908 +78.493736 2203.665283 +83.095627 2948.52832 +90.494804 2418.666016 +98.84182 2410.284912 +100.659897 2554.206543 +109.763535 2380.613525 +114.740555 2268.283447 +116.718567 2305.662109 +119.477913 2791.840332 +129.462494 2899.041748 +130.081238 3260.672363 +130.086151 61868.832031 +172.096985 143111.046875 +173.100342 4825.463379 +182.386368 2966.69873 +194.3871 2497.397705 +210.838821 3673.36377 +221.303009 2314.973633 +223.828033 2770.725098 +265.850708 2885.436035 +301.585999 3519.039551 +354.620514 2900.132812 +367.184906 7250.871582 +367.265472 2715.052979 +END IONS + + +BEGIN IONS +ID=7 +PEPMASS=150.077 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/positive/GNPS00001_E2_p.mzXML +SEQ=*..* +IONMODE=Positive +ORGANISM=GNPS-MSMLS +NAME=3-METHYLADENINE +ADDUCT=M+H +FORMULA=C6H7N5 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=CN1C=NC(N)=C2N=CN=C12 +INCHI="1S/C6H7N5/c1-11-3-10-5(7)4-6(11)9-2-8-4/h2-3H,7H2,1H3" +INCHIKEY=ZPBYVFQJHWLTFB-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi 
+LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463547 +SCANS=20 +50.786098 9135.300781 +54.021648 17471.267578 +55.029583 43251.820312 +56.674351 9691.061523 +57.045086 9206.999023 +59.332993 9622.615234 +66.418159 10749.587891 +66.546501 10004.551758 +67.029388 12454.616211 +67.425049 9276.546875 +69.045151 31944.523438 +72.456032 9194.083008 +76.090012 11107.212891 +78.258568 10212.219727 +82.040123 110676.773438 +84.959908 18314.753906 +92.024513 20427.816406 +96.055695 125424.664062 +100.490868 11362.44043 +105.962997 40719.503906 +106.040092 30840.582031 +108.042923 35402.824219 +108.055611 101371.835938 +109.050941 336142.5625 +111.252899 10821.838867 +123.066544 511276.15625 +133.050964 856015.75 +134.046265 37031.992188 +135.054138 72465.648438 +150.077545 54001820.0 +150.125961 92599.414062 +151.06189 21264.314453 +151.074524 58892.902344 +151.08078 216933.5625 +152.056854 30901.611328 +END IONS + + +BEGIN IONS +ID=8 +PEPMASS=221.057 +CHARGE=-1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/negative/GNPS00001_B7_n.mzXML +SEQ=*..* +IONMODE=Negative +ORGANISM=GNPS-MSMLS +NAME="2,6-DIHYDROXYPYRIDINE" +ADDUCT=2M-H +FORMULA=C5H5NO2 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=C1=CC(=O)NC(=C1)O +INCHI="1S/C5H5NO2/c7-4-2-1-3-5(8)6-4/h1-3H,(H2,6,7,8)" +INCHIKEY=WLFXSECCHULRRO-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463548 +SCANS=27 +53.619167 1898.965576 +54.11095 2265.26123 +61.186977 1738.268433 +66.779572 1851.167114 +70.708031 1796.109985 +72.527184 1818.754028 +79.024498 1845.408569 +80.350136 2010.399048 +83.263359 1960.025024 +103.919197 2362.208008 +105.173363 2122.984619 +109.281166 2030.764893 +136.039307 33703.570312 +137.319504 2297.305908 +148.751129 2441.170898 +161.034668 2347.14917 +177.71376 2034.473755 +179.045242 38240.597656 +179.052521 2948.430664 +180.048599 2745.167236 +193.928101 2060.133057 +204.702805 2358.849609 +205.28389 2073.789062 
+221.056381 51394.78125 +222.060211 3162.884521 +END IONS + + +BEGIN IONS +ID=9 +PEPMASS=365.105 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084072/peak/positive/GNPS00001_G2_p.mzXML +SEQ=*..* +IONMODE=Positive +ORGANISM=GNPS-MSMLS +NAME=SUCROSE +ADDUCT=M+Na +FORMULA=C12H22O11 +PI=Dorrestein +DATACOLLECTOR=Fernando Vargas +SMILES=OC[C@H]1O[C@@](CO)(O[C@H]2O[C@H](CO)[C@@H](O)[C@H](O)[C@H]2O)[C@@H](O)[C@@H]1O +INCHI="1S/C12H22O11/c13-1-4-6(16)8(18)9(19)11(21-4)23-12(3-15)10(20)7(17)5(2-14)22-12/h4-11,13-20H,1-3H2/t4-,5-,6-,7-,8+,9-,10+,11-,12+/m1/s1" +INCHIKEY=CZMRCDWAGMRECN-UGDNZRGBSA-N +INCHIAUX=N/A +PUBMED=na +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005463549 +SCANS=20 +51.110802 2751.992676 +52.042931 2427.323242 +54.204987 2326.957031 +55.620426 2482.91333 +56.299335 2671.567383 +58.450058 2513.786133 +63.139435 2968.675537 +66.149086 2727.473633 +66.624603 7029.318848 +66.626404 4924.129395 +89.847275 2804.193115 +97.383942 3441.288574 +103.737549 3060.2854 +107.869011 3100.476074 +114.239304 3123.896484 +116.606705 2991.578857 +139.445679 3365.54126 +143.453751 3923.98877 +152.061234 2890.845703 +177.644165 2889.41748 +184.255539 3648.265625 +185.042358 758432.1875 +192.030014 4091.378174 +200.209564 3764.209961 +202.973297 3485.999023 +203.052887 1701323.0 +204.05629 5566.01416 +213.650085 3444.569824 +218.029663 2887.408691 +221.062866 7352.574219 +290.206573 3159.827148 +351.157837 3244.633545 +365.106232 2048528.625 +366.109528 6856.312012 +END IONS + + +BEGIN IONS +ID=10 +PEPMASS=191.118 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-LC-ESI-QTOF +FILENAME=Massbank_ESI_positive_8_1_2014_peaks.mgf +SEQ=*..* +IONMODE=Positive +ORGANISM=MASSBANK +NAME=Cytisine +ADDUCT=M+H +FORMULA=C11H14N2O +PI="Putative Massbank Match" +DATACOLLECTOR=Massbank +SMILES=C1[C@H]2CNC[C@@H]1C3=CC=CC(=O)N3C2 +INCHI=1S/C11H14N2O/c14-11-3-1-2-10-9-4-8(5-12-6-9)7-13(10)11/h1-3,8-9,12H,4-7H2/t8-,9+/m0/s1 
+INCHIKEY=ANJTVLIZGCUXLD-DTWKUNHWSA-N +INCHIAUX=N/A +PUBMED=N/A +SUBMITUSER=mwang87 +LIBRARYQUALITY=3 +SPECTRUMID=CCMSLIB00000204751 +SCANS=11 +120.0805 215.0 +130.0652 159.0 +133.0512 196.0 +134.0586 206.0 +146.0606 182.0 +148.0755 12133.0 +148.1124 232.0 +148.1288 161.0 +148.1582 128.0 +148.1883 267.0 +160.0762 146.0 +162.0892 375.0 +174.0923 136.0 +191.1175 7529.0 +191.1614 182.0 +191.2455 133.0 +END IONS + + +BEGIN IONS +ID=11 +PEPMASS=260.092 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-LC-ESI-QTOF +FILENAME=Massbank_ESI_positive_8_1_2014_peaks.mgf +SEQ=*..* +IONMODE=Positive +ORGANISM=MASSBANK +NAME=Skimmianine +ADDUCT=M+H +FORMULA=C14H13NO4 +PI="Putative Massbank Match" +DATACOLLECTOR=Massbank +SMILES=COC1=C(C2=C(C=C1)C(=C3C=COC3=N2)OC)OC +INCHI=1S/C14H13NO4/c1-16-10-5-4-8-11(13(10)18-3)15-14-9(6-7-19-14)12(8)17-2/h4-7H,1-3H3 +INCHIKEY=SLSIBLKBHNKZTB-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=N/A +SUBMITUSER=mwang87 +LIBRARYQUALITY=3 +SPECTRUMID=CCMSLIB00000204756 +SCANS=16 +156.0445 78.0 +184.0367 144.0 +199.063 596.0 +199.1029 35.0 +212.037 49.0 +216.0646 458.0 +227.0571 2793.0 +227.1144 38.0 +227.1307 39.0 +227.1957 37.0 +230.0439 260.0 +244.0586 157.0 +245.0678 695.0 +245.1017 38.0 +260.0922 648.0 +END IONS + + +BEGIN IONS +ID=12 +PEPMASS=181.072 +CHARGE=1 +MSLEVEL=2 +SOURCE_INSTRUMENT=ESI-Orbitrap +FILENAME=MSV000084479/peak/mzXML/GNPS00005_G8_p.mzXML +SEQ=*..* +IONMODE=Positive +ORGANISM=GNPS-MSMLS +NAME=THEOBROMINE +ADDUCT=M+H +FORMULA=C7H8N4O2 +PI=Dorrestein +DATACOLLECTOR=Kelly Weldon +SMILES=CN1C=NC2=C1C(=O)NC(=O)N2C +INCHI="1S/C7H8N4O2/c1-10-3-8-5-4(10)6(12)9-7(13)11(5)2/h3H,1-2H3,(H,9,12,13)" +INCHIKEY=YAPQBXQYLJRXSA-UHFFFAOYSA-N +INCHIAUX=N/A +PUBMED=N/A +SUBMITUSER=mpanitchpakdi +LIBRARYQUALITY=1 +SPECTRUMID=CCMSLIB00005464198 +SCANS=26 +53.984585 2200.075928 +56.698093 1968.606812 +62.04755 2400.987793 +67.029778 19701.019531 +69.045395 23568.802734 +70.177971 2305.233643 +83.060989 12263.048828 +84.960213 10869.87793 +85.738388 2162.437988 +91.209183 
2625.989014 +94.040466 2627.905273 +96.056137 23821.800781 +97.717781 2385.289551 +98.896057 2513.965576 +107.010551 2494.426758 +108.056038 60805.148438 +110.071846 102124.40625 +112.087212 5632.263672 +113.035294 4863.25293 +120.055634 2745.1521 +122.059227 20169.671875 +123.041679 102874.648438 +123.080559 6526.376953 +124.050507 6017.03125 +135.066483 85918.375 +137.082413 197951.296875 +138.066544 591843.4375 +139.069885 4696.746094 +139.075333 5365.431641 +139.111893 2735.634277 +149.023682 5598.092773 +156.077072 6471.537109 +158.954269 2866.068848 +159.013214 8620.223633 +163.062057 58183.132812 +181.072327 5051340.5 +182.069122 4534.439941 +182.076019 28589.470703 +184.980209 4541.842285 +190.653259 2998.242432 +203.411301 2598.710449 +END IONS \ No newline at end of file
diff --git a/flask_app/services/plot_spectra.py b/flask_app/services/plot_spectra.py
new file mode 100644
index 0000000..617c52f
--- /dev/null
+++ b/flask_app/services/plot_spectra.py
@@ -0,0 +1,43 @@
+'''
+TODO Migrate here functionality to plot spectra
+'''
+
+from matchms.plotting.spectrum_plots import plot_spectra_mirror, plot_spectrum
+import streamlit as st
+import matplotlib.pyplot as plt
+import pandas as pd
+import numpy as np
+import pubchempy
+from rdkit import Chem
+from rdkit.Chem import Draw
+
+def plot_spectra(spectrum):
+    '''Draw a compound's structure next to its mass spectrum.
+
+    Args:
+        spectrum: spectrum object accepted by matchms' plot_spectrum; its
+            "smiles" metadata entry, when present, is drawn as a structure.
+
+    Returns:
+        The matplotlib figure: structure on the left axes, spectrum on the right.
+    '''
+    fig, axs = plt.subplots(1, 2, figsize=(12.8, 4.2), gridspec_kw={'width_ratios': [2, 5]}, sharey=False)
+
+    # The SMILES comes from the spectrum's own metadata.
+    cmp_smile = spectrum.get("smiles")
+    # MolFromSmiles returns None for an unparsable SMILES; skip the drawing then.
+    mol = Chem.MolFromSmiles(cmp_smile) if cmp_smile else None
+
+    axs[0].grid(False)
+    axs[0].tick_params(axis='both', bottom=False, labelbottom=False, left=False, labelleft=False)
+    axs[0].set_title(cmp_smile or "structure not available")
+    if mol is not None:
+        # MolToImage returns an image object; it does not draw onto a matplotlib axes.
+        axs[0].imshow(Draw.MolToImage(mol))
+    axs[0].axis("off")
+
+    # Pass the axes by keyword: plot_spectrum's second positional parameter is
+    # not the axes (NOTE(review): confirm against the installed matchms version).
+    plot_spectrum(spectrum, ax=axs[1])
+    return fig
+    #TODO pass the plot to flask
\ No newline at end of file
diff --git a/flask_app/static/loading.gif b/flask_app/static/loading.gif new file mode 100644 index 0000000..8ff5c37 Binary files /dev/null and b/flask_app/static/loading.gif differ
diff --git a/flask_app/static/styles.css b/flask_app/static/styles.css
new file mode 100644
index 0000000..a0adead
--- /dev/null
+++ b/flask_app/static/styles.css
@@ -0,0 +1,56 @@
+body {
+    margin: 0;
+    padding: 0;
+    font-family: "Helvetica Neue", Helvetica, Arial, sans-serif;
+    color: #444;
+}
+/*
+ * Formatting the header area
+ */
+header {
+    background-color: #DFB887;
+    height: 35px;
+    width: 100%;
+    opacity: .9;
+    margin-left: 10px;
+    margin-bottom: 10px;
+}
+ul {
+    list-style-type: none;
+    margin: 10px;
+    padding: 0;
+    overflow: hidden;
+    background-color: #333;
+}
+
+li {
+    float: left;
+}
+
+li a {
+    display: block;
+    color: white;
+    text-align: center;
+    padding: 10px 10px;
+    text-decoration: none;
+}
+
+/* Darken the link background to #111 on hover */
+li a:hover {
+    background-color: #111;
+}
+
+div.content {
+    padding-left: 10px;
+}
+
+.button-style {
+    border: none;
+    color: black;
+    padding: 15px 32px;
+    text-align: center;
+    text-decoration: none;
+    display: inline-block;
+    font-size: 16px;
+    cursor: pointer;
+}
\ No newline at end of file
diff --git a/flask_app/templates/about.html b/flask_app/templates/about.html new file mode 100644 index 0000000..c2837ae --- /dev/null +++ b/flask_app/templates/about.html @@ -0,0 +1,7 @@ +{% extends 'base.html' %} + +{% block title %}About{% endblock %} + +{% block content %} +

This is the creation and curation wizard for FAIR MS Libraries. You can find source code here

+{% endblock %} \ No newline at end of file diff --git a/flask_app/templates/base.html b/flask_app/templates/base.html new file mode 100644 index 0000000..9e9e11c --- /dev/null +++ b/flask_app/templates/base.html @@ -0,0 +1,24 @@ + + + + + + {% block title %}FAIR spectral db{% endblock %} + + + + + +
+ {% block content %} + {% endblock %} +
+ + diff --git a/flask_app/templates/index.html b/flask_app/templates/index.html new file mode 100644 index 0000000..dfd7105 --- /dev/null +++ b/flask_app/templates/index.html @@ -0,0 +1,8 @@ +{% extends 'base.html' %} + +{% block title %}Home{% endblock %} + +{% block content %} +

Welcome to the FAIRification of mass spectral libraries

+

This webservice facilitates easy and intuitive curation of your mass spectral data.

+{% endblock %} \ No newline at end of file diff --git a/flask_app/templates/plot_spectrum.html b/flask_app/templates/plot_spectrum.html new file mode 100644 index 0000000..50a18c5 --- /dev/null +++ b/flask_app/templates/plot_spectrum.html @@ -0,0 +1,11 @@ +
+ + + +
+ + \ No newline at end of file diff --git a/flask_app/templates/preview.html b/flask_app/templates/preview.html new file mode 100644 index 0000000..2b1480d --- /dev/null +++ b/flask_app/templates/preview.html @@ -0,0 +1,6 @@ +{% extends "base.html" %} +{% block content %} + +

{{name}}

+{{data | safe}} +{% endblock %} \ No newline at end of file diff --git a/flask_app/templates/upload_data.html b/flask_app/templates/upload_data.html new file mode 100644 index 0000000..321204a --- /dev/null +++ b/flask_app/templates/upload_data.html @@ -0,0 +1,20 @@ +{% extends 'base.html' %} + +{% block content %} + +
+
+ + +
+
+ + + +{% endblock %} \ No newline at end of file diff --git a/library_spectra_validation/.gitignore b/library_spectra_validation/.gitignore deleted file mode 100644 index cd25e0f..0000000 --- a/library_spectra_validation/.gitignore +++ /dev/null @@ -1,35 +0,0 @@ -*.py[cod] -*.egg-info -*.eggs -.ipynb_checkpoints - -build -dist -.cache -__pycache__ - -htmlcov -.coverage -coverage.xml -.pytest_cache -pylint-report.txt -xunit-result.xml -.scannerwork/ - -docs/_build -docs/apidocs - -# ide -.idea -.eclipse -.vscode - -# Mac -.DS_Store -config.py -__pycache__/ -debug.log - - -# conda build directory -/_build \ No newline at end of file diff --git a/streamlit_app/requirements.txt b/requirements.txt similarity index 100% rename from streamlit_app/requirements.txt rename to requirements.txt diff --git a/streamlit_app/FAIR_MS_Library_Editor.py b/streamlit_app/FAIR_MS_Library_Editor.py deleted file mode 100644 index 37e93eb..0000000 --- a/streamlit_app/FAIR_MS_Library_Editor.py +++ /dev/null @@ -1,72 +0,0 @@ -import json -import os -import tempfile -import streamlit as st - -st.set_page_config( - layout="wide", - page_title="FAIR MS Library Curation Editor", - #page_icon="assets/favicon.ico", - menu_items={ - 'Get Help': 'https://github.com/mzmine/biohack23_p15', - 'Report a bug': "https://github.com/mzmine/biohack23_p15/issues/new/choose", - 'About': "# This is the creation and curation wizard for FAIR MS Libraries." 
- } -) - -from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx -ctx = get_script_run_ctx() -if 'session_id' not in st.session_state: - print("Setting session ID:", ctx.session_id) - st.session_state.session_id = ctx.session_id -else: - print("Retrieving session ID:", st.session_state.session_id) - -if 'submission_id' not in st.session_state: - submission_id = "FMSL-"+st.session_state.session_id - print(f"Setting submission id {submission_id}") - st.session_state.submission_id = submission_id - -submission_id = st.session_state.submission_id - -if 'cv_config' not in st.session_state: - # reading the data from the file - with open('config.json') as f: - data = f.read() - config = json.loads(data) - st.session_state.cv_config = config - -tmp_dir = tempfile.gettempdir() -working_dir = os.path.join(tmp_dir, "fairmslib", submission_id) -os.makedirs(working_dir, exist_ok=True) -print("Working dir:", working_dir) -st.session_state['working_dir'] = working_dir - -st.title("FAIR MS Library Curation Editor") -st.markdown(f"Provisional submission ID: {submission_id}") - -# Using "with" notation -with st.sidebar: - st.markdown("## Datasets") - if 'datasets' not in st.session_state or st.session_state['datasets'] == {}: - st.warning("Please upload a file to begin!") - if 'selected_sheets' not in st.session_state or st.session_state['selected_sheets'] == {}: - st.warning("Please select a dataset to begin!") - # with st.spinner("Loading..."): - # time.sleep(5) - # st.success("Done!") - if 'datasets' in st.session_state and st.session_state['datasets'] != {}: - for key in st.session_state['selected_sheets']: - with st.expander(key): - datasets = st.session_state['datasets'] - rowsMetricColumn, columnsMetricColumn = st.columns(2) - with rowsMetricColumn: - st.metric('Rows', datasets[key].shape[0]) - with columnsMetricColumn: - st.metric('Columns', datasets[key].shape[1]) - # if st.button("Edit", key=key): - # selected_sheet = key - # if key in 
datasets_metadata: - # st.write(datasets_metadata[key].keys()) - # st.write(datasets[dataset_key]) - # st.json(datasets) diff --git a/streamlit_app/README.md b/streamlit_app/README.md deleted file mode 100644 index 3cb118f..0000000 --- a/streamlit_app/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# Streamlit Application for FAIR MassSpectral Library Curation and Editing - -[GitHub repository](https://github.com/mzmine/biohack23_p15) - folder _streamlit_app_ - -## Installation - -## Development - -Use python venv to use defined dependencies. - - python -m venv venv - source venv/bin/activate - -to activate the virtual environment. - -You can then use the provided requirements.txt to populate the required dependencies in your virtual environment. - - pip install -r requirements.txt - -## Running the application - -After you have activated the virtual environment and the packages listed in requirements.txt are installed, you can launch the streamlit application as follows: - - streamlit run FAIR_MS_Library_Editor.py diff --git a/streamlit_app/__init__.py b/streamlit_app/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/streamlit_app/assets/.empty b/streamlit_app/assets/.empty deleted file mode 100644 index e69de29..0000000 diff --git a/streamlit_app/config.json b/streamlit_app/config.json deleted file mode 100644 index 9a3693d..0000000 --- a/streamlit_app/config.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "ontologies": { - "NCIT": "ncit", - "MS": "ms", - "MSIO": "msio", - "UO": "uo", - "NCBITaxon": "ncbitaxon", - "BTO": "bto", - "PRIDE": "pride", - "EFO": "efo" - }, - "static_cv_terms": { - "MS:1003309": { - "term_obo_id": "MS:1003309", - "cv": "MS", - "name": "Goslin", - "value": null - } - } -} \ No newline at end of file diff --git a/streamlit_app/models.py b/streamlit_app/models.py deleted file mode 100644 index 8f9546d..0000000 --- a/streamlit_app/models.py +++ /dev/null @@ -1,612 +0,0 @@ -# generated by datamodel-codegen: -# filename: -# 
timestamp: 2023-05-30T11:14:55+00:00 - -from __future__ import annotations - -from enum import Enum -from typing import List, Optional - -from pydantic import AnyUrl, BaseModel, Field, conint, constr - - -class CommentPrefix(Enum): - COM = 'COM' - - -class Comment(BaseModel): - prefix: CommentPrefix - msg: str - line_number: Optional[int] = None - - -class MTDPrefix(Enum): - MTD = 'MTD' - - -class SMLPrefix(Enum): - SML = 'SML' - - -class SMHeaderPrefix(Enum): - SMH = 'SMH' - - -class SMFPrefix(Enum): - SMF = 'SMF' - - -class SFHeaderPrefix(Enum): - SFH = 'SFH' - - -class SMEPrefix(Enum): - SME = 'SME' - - -class SEHeaderPrefix(Enum): - SEH = 'SEH' - - -class Parameter(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - cv_label: Optional[str] = '' - cv_accession: Optional[str] = '' - name: str - value: str - - -class Instrument(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - name: Optional[Parameter] = None - source: Optional[Parameter] = None - analyzer: Optional[List[Parameter]] = Field( - [], description="The instrument's mass analyzer, as defined by the parameter." - ) - detector: Optional[Parameter] = None - - -class SampleProcessing(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - sampleProcessing: Optional[List[Parameter]] = Field( - [], - description='Parameters specifiying sample processing that was applied within one step.', - ) - - -class Software(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - parameter: Optional[Parameter] = None - setting: Optional[List[str]] = Field( - [], - description='A software setting used. This field MAY occur multiple times for a\nsingle software. 
The value of this field is deliberately set as a\nString, since there currently do not exist cvParams for every\npossible setting.\n', - ) - - -class PublicationType(Enum): - doi = 'doi' - pubmed = 'pubmed' - uri = 'uri' - - -class PublicationItem(BaseModel): - type: PublicationType = Field(..., description='The type qualifier of this publication item.') - accession: str = Field( - ..., description='The native accession id for this publication item.' - ) - - -class StringList(BaseModel): - __root__: List[str] = Field(..., description='A typed list of strings.') - - -class Contact(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - name: str = Field(..., description="The contact's name.", min_length=2) - affiliation: str = Field(..., description="The contact's affiliation.", min_length=2) - email: Optional[str] = Field(None, regex=r'^\w+([\.-]?\w+)*@\w+([\.-]?\w+)*(\.\w{2,3})+$', description="The contact's email address.") - orcid: Optional[str] = Field(None, regex=r'^[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]{1}$', description="The contact's ORCID identifier.") - - -class Uri(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - value: Optional[AnyUrl] = Field( - None, description='The URI pointing to the external resource.' - ) - - -class Sample(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - name: Optional[str] = Field(None, description="The sample's name.") - custom: Optional[List[Parameter]] = Field( - [], description='Additional user or cv parameters.' - ) - species: Optional[List[Parameter]] = Field( - [], description='Biological species information on the sample.' - ) - tissue: Optional[List[Parameter]] = Field( - [], description='Biological tissue information on the sample.' - ) - cell_type: Optional[List[Parameter]] = Field( - [], description='Biological cell type information on the sample.' - ) - disease: Optional[List[Parameter]] = Field( - [], description='Disease information on the sample.' 
- ) - description: Optional[str] = Field( - None, description='A free form description of the sample.' - ) - - -class MsRun(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - name: Optional[str] = Field(None, description="The msRun's name.") - location: AnyUrl = Field(..., description="The msRun's location URI.") - instrument_ref: Optional[Instrument] = None - format: Optional[Parameter] = None - id_format: Optional[Parameter] = None - fragmentation_method: Optional[List[Parameter]] = Field( - [], description='The fragmentation methods applied during this msRun.' - ) - scan_polarity: Optional[List[Parameter]] = Field( - [], description='The scan polarity/polarities used during this msRun.' - ) - hash: Optional[str] = Field( - None, description="The file hash value of this msRun's data file." - ) - hash_method: Optional[Parameter] = None - - -class Assay(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - name: str = Field(..., description='The assay name.') - custom: Optional[List[Parameter]] = Field( - [], description='Additional user or cv parameters.' - ) - external_uri: Optional[AnyUrl] = Field( - None, description='An external URI to further information about this assay.' - ) - sample_ref: Optional[Sample] = None - ms_run_ref: List[MsRun] = Field( - ..., description='The ms run(s) referenced by this assay.', min_items=1 - ) - - -class CV(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - label: str = Field(..., description='The abbreviated CV label.') - full_name: str = Field(..., description='The full name of this CV, for humans.') - version: str = Field( - ..., description='The CV version used when the file was generated.' 
- ) - uri: AnyUrl = Field(..., description='A URI to the CV definition.') - - -class Database(BaseModel): - id: conint(ge=1) = Field(..., readOnly=True) - param: Parameter - prefix: str = Field(..., description='The database prefix.') - version: str = Field(..., description='The database version.') - uri: AnyUrl = Field(..., description='The URI to the online database.') - - -class ColumnParameterMapping(BaseModel): - column_name: str = Field(..., description='The fully qualified target column name.') - param: Parameter - - -class OptColumnMapping(BaseModel): - identifier: str = Field(..., description='The fully qualified column name.') - param: Optional[Parameter] = None - value: Optional[str] = Field( - None, description='The value for this column in a particular row.' - ) - - -class Error(BaseModel): - code: int - message: str - - -class Category(Enum): - format = 'format' - logical = 'logical' - cross_check = 'cross_check' - - -class MessageType(Enum): - error = 'error' - warn = 'warn' - info = 'info' - - -class ValidationMessage(BaseModel): - code: str - category: Category - message_type: Optional[MessageType] = 'info' - message: str - line_number: Optional[int] = None - - -class SmallMoleculeSummary(BaseModel): - prefix: Optional[SMLPrefix] = Field( - 'SML', - description='The small molecule table row prefix. SML MUST be used for rows of the small molecule table.', - ) - header_prefix: Optional[SMHeaderPrefix] = Field( - 'SMH', - description='The small molecule table header prefix. SMH MUST be used for the small molecule table header line (the column labels).', - ) - sml_id: int = Field( - ..., description='A within file unique identifier for the small molecule.' - ) - smf_id_refs: Optional[List[int]] = Field( - [], - description='References to all the features on which quantitation has been based (SMF elements) via referencing SMF_ID values. Multiple values SHOULD be provided as a “|” separated list. 
This MAY be null only if this is a Summary file.', - ) - database_identifier: Optional[List[str]] = Field( - [], - description='A list of “|” separated possible identifiers for the small molecule; multiple values MUST only be provided to indicate ambiguity in the identification of the molecule and not to demonstrate different identifier types for the same molecule. Alternative identifiers for the same molecule MAY be provided as optional columns.\n\nThe database identifier must be preceded by the resource description (prefix) followed by a colon, as specified in the metadata section. \n\nA null value MAY be provided if the identification is sufficiently ambiguous as to be meaningless for reporting or the small molecule has not been identified.\n', - ) - chemical_formula: Optional[List[str]] = Field( - [], - description='A list of “|” separated potential chemical formulae of the reported compound. The number of values provided MUST match the number of entities reported under “database_identifier”, even if this leads to redundant reporting of information (i.e. if ambiguity can be resolved in the chemical formula), and the validation software will throw an error if the number of “|” symbols does not match. “null” values between bars are allowed.\n\nThis should be specified in Hill notation (EA Hill 1900), i.e. elements in the order C, H and then alphabetically all other elements. Counts of one may be omitted. Elements should be capitalized properly to avoid confusion (e.g., “CO” vs. “Co”). The chemical formula reported should refer to the neutral form.\n\nExample: N-acetylglucosamine would be encoded by the string “C8H15NO6”.\n', - ) - smiles: Optional[List[str]] = Field( - [], - description='A list of “|” separated potential molecule structures in the simplified molecular-input line-entry system (SMILES) for the small molecule. 
The number of values provided MUST match the number of entities reported under “database_identifier”, and the validation software will throw an error if the number of “|” symbols does not match. “null” values between bars are allowed.', - ) - inchi: Optional[List[str]] = Field( - [], - description='A list of “|” separated potential standard IUPAC International Chemical Identifier (InChI) of the given substance.\n\nThe number of values provided MUST match the number of entities reported under “database_identifier”, even if this leads to redundant information being reported (i.e. if ambiguity can be resolved in the InChi), and the validation software will throw an error if the number of “|” symbols does not match. “null” values between bars are allowed.\n', - ) - chemical_name: Optional[List[str]] = Field( - [], - description='A list of “|” separated possible chemical/common names for the small molecule, or general description if a chemical name is unavailable. Multiple names are only to demonstrate ambiguity in the identification. The number of values provided MUST match the number of entities reported under “database_identifier”, and the validation software will throw an error if the number of “|” symbols does not match. “null” values between bars are allowed.\n', - ) - uri: Optional[List[AnyUrl]] = Field( - [], - description='A URI pointing to the small molecule’s entry in a reference database (e.g., the small molecule’s HMDB or KEGG entry). The number of values provided MUST match the number of entities reported under “database_identifier”, and the validation software will throw an error if the number of “|” symbols does not match. 
“null” values between bars are allowed.', - ) - theoretical_neutral_mass: Optional[List[float]] = Field( - [], - description='The small molecule’s precursor’s theoretical neutral mass.\n\nThe number of values provided MUST match the number of entities reported under “database_identifier”, and the validation software will throw an error if the number of “|” symbols does not match. “null” values (in general and between bars) are allowed for molecules that have not been identified only, or for molecules where the neutral mass cannot be calculated. In these cases, the SML entry SHOULD reference features in which exp_mass_to_charge values are captured.\n', - ) - adduct_ions: Optional[List[str]] = Field( - [], - description='A “|” separated list of detected adducts for this this molecule, following the general style in the 2013 IUPAC recommendations on terms relating to MS e.g. [M+H]1+, [M+Na]1+, [M+NH4]1+, [M-H]1-, [M+Cl]1-, [M+H]1+. If the adduct classification is ambiguous with regards to identification evidence it MAY be null.\n', - regex='^\\[\\d*M([+-][\\w]*)\\]\\d*[+-]$', - ) - reliability: Optional[str] = Field( - None, - description='The reliability of the given small molecule identification. This must be supplied by the resource and MUST be reported as an integer between 1-4:\n\n identified metabolite (1)\n\n putatively annotated compound (2)\n\n putatively characterized compound class (3)\n\n unknown compound (4)\n\nThese MAY be replaced using a suitable CV term in the metadata section e.g. to use MSI recommendation levels (see Section 6.2.57 for details).\n\nThe following CV terms are already available within the PSI MS CV. 
Future schemes may be implemented by extending the PSI MS CV with new terms and associated levels.\n\nThe MSI has recently discussed an extension of the original four level scheme into a five level scheme MS:1002896 (compound identification confidence level) with levels\n\n isolated, pure compound, full stereochemistry (0)\n\n reference standard match or full 2D structure (1)\n\n unambiguous diagnostic evidence (literature, database) (2)\n\n most likely structure, including isomers, substance class or substructure match (3)\n\n unknown compound (4)\n\nFor high-resolution MS, the following term and its levels may be used: MS:1002955 (hr-ms compound identification confidence level) with levels\n\n confirmed structure (1)\n\n probable structure (2)\n\n unambiguous ms library match (2a)\n\n diagnostic evidence (2b)\n\n tentative candidates (3)\n\n unequivocal molecular formula (4)\n\n exact mass (5)\n\nA String data type is set to allow for different systems to be specified in the metadata section.\n', - ) - best_id_confidence_measure: Optional[Parameter] = None - best_id_confidence_value: Optional[float] = Field( - None, - description='The best confidence measure in identification (for this type of score) for the given small molecule across all assays. The type of score MUST be defined in the metadata section. If the small molecule was not identified by the specified search engine, “null” MUST be reported. If the confidence measure does not report a numerical confidence value, “null” SHOULD be reported.', - ) - abundance_assay: Optional[List[float]] = Field( - [], - description='The small molecule’s abundance in every assay described in the metadata section MUST be reported. Null or zero values may be reported as appropriate. "null" SHOULD be used to report missing quantities, while zero SHOULD be used to indicate a present but not reliably quantifiable value (e.g. 
below a minimum noise threshold).', - ) - abundance_study_variable: Optional[List[float]] = Field( - [], - description='The small molecule’s abundance in all the study variables described in the metadata section (study_variable[1-n]_average_function), calculated using the method as described in the Metadata section (default = arithmetic mean across assays). Null or zero values may be reported as appropriate. "null" SHOULD be used to report missing quantities, while zero SHOULD be used to indicate a present but not reliably quantifiable value (e.g. below a minimum noise threshold).', - ) - abundance_variation_study_variable: Optional[List[float]] = Field( - [], - description='A measure of the variability of the study variable abundance measurement, calculated using the method as described in the metadata section (study_variable[1-n]_average_function), with a default = arithmethic co-efficient of variation of the small molecule’s abundance in the given study variable.', - ) - opt: Optional[List[OptColumnMapping]] = Field( - [], - description='Additional columns can be added to the end of the small molecule table. These column headers MUST start with the prefix “opt_” followed by the {identifier} of the object they reference: assay, study variable, MS run or “global” (if the value relates to all replicates). Column names MUST only contain the following characters: ‘A’-‘Z’, ‘a’-‘z’, ‘0’-‘9’, ‘’, ‘-’, ‘[’, ‘]’, and ‘:’. CV parameter accessions MAY be used for optional columns following the format: opt{identifier}_cv_{accession}_\\{parameter name}. Spaces within the parameter’s name MUST be replaced by ‘_’.\n', - ) - comment: Optional[List[Comment]] = [] - - -class SmallMoleculeFeature(BaseModel): - prefix: Optional[SMFPrefix] = Field( - 'SMF', - description='The small molecule feature table row prefix. 
SMF MUST be used for rows of the small molecule feature table.', - ) - header_prefix: Optional[SFHeaderPrefix] = Field( - 'SFH', - description='The small molecule feature table header prefix. SFH MUST be used for the small molecule feature table header line (the column labels).', - ) - smf_id: int = Field( - ..., - description='A within file unique identifier for the small molecule feature.', - ) - sme_id_refs: Optional[List[int]] = Field( - [], - description='References to the identification evidence (SME elements) via referencing SME_ID values. Multiple values MAY be provided as a “|” separated list to indicate ambiguity in the identification or to indicate that different types of data supported the identifiction (see SME_ID_REF_ambiguity_code). For the case of a consensus approach where multiple adduct forms are used to infer the SML ID, different features should just reference the same SME_ID value(s).', - ) - sme_id_ref_ambiguity_code: Optional[int] = Field( - None, - description='If multiple values are given under SME_ID_REFS, one of the following codes MUST be provided. 1=Ambiguous identification; 2=Only different evidence streams for the same molecule with no ambiguity; 3=Both ambiguous identification and multiple evidence streams. If there are no or one value under SME_ID_REFs, this MUST be reported as null.', - ) - adduct_ion: Optional[constr(regex=r'^\[\d*M([+-][\w]*)\]\d*[+-]$')] = Field( - None, - description='The assumed classification of this molecule’s adduct ion after detection, following the general style in the 2013 IUPAC recommendations on terms relating to MS e.g. [M+H]1+, [M+Na]1+, [M+NH4]1+, [M-H]1-, [M+Cl]1-, [M+H]1+.', - ) - isotopomer: Optional[Parameter] = None - exp_mass_to_charge: float = Field( - ..., - description='The experimental mass/charge value for the feature, by default assumed to be the mean across assays or a representative value. 
For approaches that report isotopomers as SMF rows, then the m/z of the isotopomer MUST be reported here.', - ) - charge: int = Field( - ..., - description='The feature’s charge value using positive integers both for positive and negative polarity modes.', - ) - retention_time_in_seconds: Optional[float] = Field( - None, - description='The apex of the feature on the retention time axis, in a Master or aggregate MS run. Retention time MUST be reported in seconds. Retention time values for individual MS runs (i.e. before alignment) MAY be reported as optional columns. Retention time SHOULD only be null in the case of direct infusion MS or other techniques where a retention time value is absent or unknown. Relative retention time or retention time index values MAY be reported as optional columns, and could be considered for inclusion in future versions of mzTab as appropriate.', - ) - retention_time_in_seconds_start: Optional[float] = Field( - None, - description='The start time of the feature on the retention time axis, in a Master or aggregate MS run. Retention time MUST be reported in seconds. Retention time start and end SHOULD only be null in the case of direct infusion MS or other techniques where a retention time value is absent or unknown and MAY be reported in optional columns.', - ) - retention_time_in_seconds_end: Optional[float] = Field( - None, - description='The end time of the feature on the retention time axis, in a Master or aggregate MS run. Retention time MUST be reported in seconds. Retention time start and end SHOULD only be null in the case of direct infusion MS or other techniques where a retention time value is absent or unknown and MAY be reported in optional columns..', - ) - abundance_assay: Optional[List[float]] = Field( - [], - description='The feature’s abundance in every assay described in the metadata section MUST be reported. 
Null or zero values may be reported as appropriate.', - ) - opt: Optional[List[OptColumnMapping]] = Field( - [], - description='Additional columns can be added to the end of the small molecule feature table. These column headers MUST start with the prefix “opt_” followed by the {identifier} of the object they reference: assay, study variable, MS run or “global” (if the value relates to all replicates). Column names MUST only contain the following characters: ‘A’-‘Z’, ‘a’-‘z’, ‘0’-‘9’, ‘’, ‘-’, ‘[’, ‘]’, and ‘:’. CV parameter accessions MAY be used for optional columns following the format: opt{identifier}_cv_{accession}_\\{parameter name}. Spaces within the parameter’s name MUST be replaced by ‘_’.\n', - ) - comment: Optional[List[Comment]] = [] - - -class Publication(BaseModel): - id: Optional[conint(ge=1)] = None - publicationItems: List[PublicationItem] = Field( - ..., description='The publication item ids referenced by this publication.' - ) - - -class SpectraRef(BaseModel): - ms_run: MsRun - reference: str = Field( - ..., - description='The (vendor-dependendent) reference string to the actual mass spectrum.\n', - ) - - -class StudyVariable(BaseModel): - id: conint(ge=1) - name: str = Field(..., description='The study variable name.') - assay_refs: Optional[List[Assay]] = Field( - [], description='The assays referenced by this study variable.' - ) - average_function: Optional[Parameter] = None - variation_function: Optional[Parameter] = None - description: Optional[str] = Field( - None, description='A free-form description of this study variable.' - ) - factors: Optional[List[Parameter]] = Field( - [], - description='Parameters indicating which factors were used for the assays referenced by this study variable, and at which levels.', - ) - - -class Metadata(BaseModel): - prefix: MTDPrefix = Field( - ..., - description='The metadata section prefix. 
MUST always be MTD.\n', - example='MTD', - ) - mzTab_version: constr(regex=r'^\d{1}\.\d{1}\.\d{1}-[A-Z]{1}$') = Field( - ..., - alias='mzTab-version', - description='The version of the mzTab file. The suffix MUST be "-M" for mzTab for metabolomics (mzTab-M).\n', - example='2.0.0-M', - ) - mzTab_ID: str = Field( - ..., - alias='mzTab-ID', - description='The ID of the mzTab file, this could be supplied by the repository from which it is downloaded or a local identifier from the lab producing the file. It is not intended to be a globally unique ID but carry some locally useful meaning.\n', - example='MTBLS214', - ) - title: Optional[str] = Field( - None, - description='The file’s human readable title.\n', - example='My first test experiment', - ) - description: Optional[str] = Field( - None, - description='The file’s human readable description.\n', - example='An experiment investigating the effects of Il-6.', - ) - contact: Optional[List[Contact]] = Field( - [], - description='The contact’s name, affiliation and e-mail. Several contacts can be given by indicating the number in the square brackets after "contact". A contact has to be supplied in the format [first name] [initials] [last name].', - ) - publication: Optional[List[Publication]] = Field( - [], - description='A publication associated with this file. Several publications can be given by indicating the number in the square brackets after “publication”. PubMed ids must be prefixed by “pubmed:”, DOIs by “doi:”. 
Multiple identifiers MUST be separated by “|”.', - ) - uri: Optional[List[Uri]] = Field( - [], - description='A URI pointing to the file’s source data (e.g., a MetaboLights records).', - ) - external_study_uri: Optional[List[Uri]] = Field( - [], - description='A URI pointing to an external file with more details about the study design (e.g., an ISA-TAB file).', - ) - instrument: Optional[List[Instrument]] = Field( - [], - description='The name, source, analyzer and detector of the instruments used in the experiment. Multiple instruments are numbered [1-n].', - ) - quantification_method: Parameter - sample: Optional[List[Sample]] = Field( - [], - description='Specification of sample.\n(empty) name: A name for each sample to serve as a list of the samples that MUST be reported in the following tables. Samples MUST be reported if a statistical design is being captured (i.e. bio or tech replicates). If the type of replicates are not known, samples SHOULD NOT be reported. \nspecies: The respective species of the samples analysed. For more complex cases, such as metagenomics, optional columns and userParams should be used. \ntissue: The respective tissue(s) of the sample. \ncell_type: The respective cell type(s) of the sample. \ndisease: The respective disease(s) of the sample. \ndescription: A human readable description of the sample. \ncustom: Custom parameters describing the sample’s additional properties. Dates MUST be provided in ISO-8601 format.\n', - ) - sample_processing: Optional[List[SampleProcessing]] = Field( - [], - description="A list of parameters describing a sample processing, preparation or handling step similar to a biological or analytical methods report. The order of the sample_processing items should reflect the order these processing steps were performed in. If multiple parameters are given for a step these MUST be separated by a “|”. If derivatization was performed, it MUST be reported here as a general step, e.g. 
'silylation' and the actual derivatization agens MUST be specified in the Section 6.2.54 part.\n", - ) - software: List[Software] = Field( - ..., - description='Software used to analyze the data and obtain the reported results. The parameter’s value SHOULD contain the software’s version. The order (numbering) should reflect the order in which the tools were used. A software setting used. This field MAY occur multiple times for a single software. The value of this field is deliberately set as a String, since there currently do not exist CV terms for every possible setting.', - ) - derivatization_agent: Optional[List[Parameter]] = Field( - [], - description='A description of derivatization agents applied to small molecules, using userParams or CV terms where possible.', - ) - ms_run: List[MsRun] = Field( - ..., - description='Specification of ms_run. \nlocation: Location of the external data file e.g. raw files on which analysis has been performed. If the actual location of the MS run is unknown, a “null” MUST be used as a place holder value, since the [1-n] cardinality is referenced elsewhere. If pre-fractionation has been performed, then [1-n] ms_runs SHOULD be created per assay. \ninstrument_ref: If different instruments are used in different runs, instrument_ref can be used to link a specific instrument to a specific run. \nformat: Parameter specifying the data format of the external MS data file. If ms_run[1-n]-format is present, ms_run[1-n]-id_format SHOULD also be present, following the parameters specified in Table 1. \nid_format: Parameter specifying the id format used in the external data file. If ms_run[1-n]-id_format is present, ms_run[1-n]-format SHOULD also be present.\nfragmentation_method: The type(s) of fragmentation used in a given ms run.\nscan_polarity: The polarity mode of a given run. 
Usually only one value SHOULD be given here except for the case of mixed polarity runs.\nhash: Hash value of the corresponding external MS data file defined in ms_run[1-n]-location. If ms_run[1-n]-hash is present, ms_run[1-n]-hash_method SHOULD also be present.\nhash_method: A parameter specifying the hash methods used to generate the String in ms_run[1-n]-hash. Specifics of the hash method used MAY follow the definitions of the mzML format. If ms_run[1-n]-hash is present, ms_run[1-n]-hash_method SHOULD also be present.\n', - ) - assay: List[Assay] = Field( - ..., - description='Specification of assay.\n(empty) name: A name for each assay, to serve as a list of the assays that MUST be reported in the following tables. \ncustom: Additional custom parameters or values for a given assay. \nexternal_uri: An external reference uri to further information about the assay, for example via a reference to an object within an ISA-TAB file. \nsample_ref: An association from a given assay to the sample analysed. \nms_run_ref: An association from a given assay to the source MS run. All assays MUST reference exactly one ms_run unless a workflow with pre-fractionation is being encoded, in which case each assay MUST reference n ms_runs where n fractions have been collected. Multiple assays SHOULD reference the same ms_run to capture multiplexed experimental designs.\n', - ) - study_variable: List[StudyVariable] = Field( - ..., - description='Specification of study_variable.\n(empty) name: A name for each study variable (experimental condition or factor), to serve as a list of the study variables that MUST be reported in the following tables. For software that does not capture study variables, a single study variable MUST be reported, linking to all assays. 
This single study variable MUST have the identifier “undefined“.\nassay_refs: Bar-separated references to the IDs of assays grouped in the study variable.\naverage_function: The function used to calculate the study variable quantification value and the operation used is not arithmetic mean (default) e.g. “geometric mean”, “median”. The 1-n refers to different study variables.\nvariation_function: The function used to calculate the study variable quantification variation value if it is reported and the operation used is not coefficient of variation (default) e.g. “standard error”.\ndescription: A textual description of the study variable.\nfactors: Additional parameters or factors, separated by bars, that are known about study variables allowing the capture of more complex, such as nested designs.\n', - ) - custom: Optional[List[Parameter]] = Field( - [], description='Any additional parameters describing the analysis reported.' - ) - cv: List[CV] = Field( - ..., - description='Specification of controlled vocabularies.\nlabel: A string describing the labels of the controlled vocabularies/ontologies used in the mzTab file as a short-hand e.g. 
"MS" for PSI-MS.\nfull_name: A string describing the full names of the controlled vocabularies/ontologies used in the mzTab file.\nversion: A string describing the version of the controlled vocabularies/ontologies used in the mzTab file.\nuri: A string containing the URIs of the controlled vocabularies/ontologies used in the mzTab file.\n', - ) - small_molecule_quantification_unit: Parameter = Field( - ..., alias='small_molecule-quantification_unit' - ) - small_molecule_feature_quantification_unit: Parameter = Field( - ..., alias='small_molecule_feature-quantification_unit' - ) - small_molecule_identification_reliability: Optional[Parameter] = Field( - None, alias='small_molecule-identification_reliability' - ) - database: List[Database] = Field( - ..., - description='Specification of databases.\n(empty): The description of databases used. For cases, where a known database has not been used for identification, a userParam SHOULD be inserted to describe any identification performed e.g. de novo. If no identification has been performed at all then "no database" should be inserted followed by null.\nprefix: The prefix used in the “identifier” column of data tables. For the “no database” case "null" must be used.\nversion: The database version is mandatory where identification has been performed. This may be a formal version number e.g. “1.4.1”, a date of access “2016-10-27” (ISO-8601 format) or “Unknown” if there is no suitable version that can be annotated.\nuri: The URI to the database. For the “no database” case, "null" must be reported.\n', - ) - id_confidence_measure: List[Parameter] = Field( - ..., - description='The type of small molecule confidence measures or scores MUST be reported as a CV parameter [1-n]. The CV parameter definition should formally state whether the ordering is high to low or vice versa. 
The order of the scores SHOULD reflect their importance for the identification and be used to determine the identification’s rank.', - ) - colunit_small_molecule: Optional[List[ColumnParameterMapping]] = Field( - [], - alias='colunit-small_molecule', - description='Defines the used unit for a column in the small molecule section. The format of the value has to be \\{column name}=\\{Parameter defining the unit}. This field MUST NOT be used to define a unit for quantification columns. The unit used for small molecule quantification values MUST be set in small_molecule-quantification_unit.', - ) - colunit_small_molecule_feature: Optional[List[ColumnParameterMapping]] = Field( - [], - alias='colunit-small_molecule_feature', - description='Defines the used unit for a column in the small molecule feature section. The format of the value has to be \\{column name}=\\{Parameter defining the unit}. This field MUST NOT be used to define a unit for quantification columns. The unit used for small molecule quantification values MUST be set in small_molecule_feature-quantification_unit.', - ) - colunit_small_molecule_evidence: Optional[List[ColumnParameterMapping]] = Field( - [], - alias='colunit-small_molecule_evidence', - description='Defines the used unit for a column in the small molecule evidence section. The format of the value has to be \\{column name}=\\{Parameter defining the unit}.', - ) - - -class SmallMoleculeEvidence(BaseModel): - prefix: Optional[SMEPrefix] = Field( - 'SME', - description='The small molecule evidence table row prefix. SME MUST be used for rows of the small molecule evidence table.', - ) - header_prefix: Optional[SEHeaderPrefix] = Field( - 'SEH', - description='The small molecule evidence table header prefix. 
SEH MUST be used for the small molecule evidence table header line (the column labels).', - ) - sme_id: int = Field( - ..., - description='A within file unique identifier for the small molecule evidence result.', - ) - evidence_input_id: str = Field( - ..., - description='A within file unique identifier for the input data used to support this identification e.g. fragment spectrum, RT and m/z pair, isotope profile that was used for the identification process, to serve as a grouping mechanism, whereby multiple rows of results from the same input data share the same ID. The identifiers may be human readable but should not be assumed to be interpretable. For example, if fragmentation spectra have been searched then the ID may be the spectrum reference, or for accurate mass search, the ms_run[2]:458.75.', - ) - database_identifier: str = Field( - ..., - description='The putative identification for the small molecule sourced from an external database, using the same prefix specified in database[1-n]-prefix.\n\nThis could include additionally a chemical class or an identifier to a spectral library entity, even if its actual identity is unknown.\n\nFor the “no database” case, "null" must be used. The unprefixed use of "null" is prohibited for any other case. If no putative identification can be reported for a particular database, it MUST be reported as the database prefix followed by null.\n', - ) - chemical_formula: Optional[str] = Field( - None, - description='The chemical formula of the identified compound e.g. in a database, assumed to match the theoretical mass to charge (in some cases this will be the derivatized form, including adducts and protons).\n\nThis should be specified in Hill notation (EA Hill 1900), i.e. elements in the order C, H and then alphabetically all other elements. Counts of one may be omitted. Elements should be capitalized properly to avoid confusion (e.g., “CO” vs. “Co”). The chemical formula reported should refer to the neutral form. 
Charge state is reported by the charge field.\n\nExample N-acetylglucosamine would be encoded by the string “C8H15NO6”\n', - ) - smiles: Optional[str] = Field( - None, - description='The potential molecule’s structure in the simplified molecular-input line-entry system (SMILES) for the small molecule.', - ) - inchi: Optional[str] = Field( - None, - description='A standard IUPAC International Chemical Identifier (InChI) for the given substance.', - ) - chemical_name: Optional[str] = Field( - None, - description='The small molecule’s chemical/common name, or general description if a chemical name is unavailable.', - ) - uri: Optional[AnyUrl] = Field( - None, - description='A URI pointing to the small molecule’s entry in a database (e.g., the small molecule’s HMDB, Chebi or KEGG entry).', - ) - derivatized_form: Optional[Parameter] = None - adduct_ion: Optional[constr(regex=r'^\[\d*M([-][\w]*)\]\d*[+-]$')] = Field( - None, - description='The assumed classification of this molecule’s adduct ion after detection, following the general style in the 2013 IUPAC recommendations on terms relating to MS e.g. [M+H]+, [M+Na]1+, [M+NH4]1+, [M-H]1-, [M+Cl]1-. If the adduct classification is ambiguous with regards to identification evidence it MAY be null.', - ) - exp_mass_to_charge: float = Field( - ..., - description='The experimental mass/charge value for the precursor ion. If multiple adduct forms have been combined into a single identification event/search, then a single value e.g. 
for the protonated form SHOULD be reported here.', - ) - charge: int = Field( - ..., - description='The small molecule evidence’s charge value using positive integers both for positive and negative polarity modes.', - ) - theoretical_mass_to_charge: float = Field( - ..., - description='The theoretical mass/charge value for the small molecule or the database mass/charge value (for a spectral library match).', - ) - spectra_ref: List[SpectraRef] = Field( - ..., - description='Reference to a spectrum in a spectrum file, for example a fragmentation spectrum has been used to support the identification. If a separate spectrum file has been used for fragmentation spectrum, this MUST be reported in the metadata section as additional ms_runs. The reference must be in the format ms_run[1-n]:{SPECTRA_REF} where SPECTRA_REF MUST follow the format defined in 5.2 (including references to chromatograms where these are used to inform identification). Multiple spectra MUST be referenced using a “|” delimited list for the (rare) cases in which search engines have combined or aggregated multiple spectra in advance of the search to make identifications.\n\nIf a fragmentation spectrum has not been used, the value should indicate the ms_run to which is identification is mapped e.g. “ms_run[1]”.\n', - ) - identification_method: Parameter - ms_level: Parameter - id_confidence_measure: Optional[List[float]] = Field( - [], - description='Any statistical value or score for the identification. The metadata section reports the type of score used, as id_confidence_measure[1-n] of type Param.', - ) - rank: conint(ge=1) = Field( - ..., - description='The rank of this identification from this approach as increasing integers from 1 (best ranked identification). 
Ties (equal score) are represented by using the same rank – defaults to 1 if there is no ranking system used.', - ) - opt: Optional[List[OptColumnMapping]] = Field( - [], - description='Additional columns can be added to the end of the small molecule evidence table. These column headers MUST start with the prefix “opt_” followed by the {identifier} of the object they reference: assay, study variable, MS run or “global” (if the value relates to all replicates). Column names MUST only contain the following characters: ‘A’-‘Z’, ‘a’-‘z’, ‘0’-‘9’, ‘’, ‘-’, ‘[’, ‘]’, and ‘:’. CV parameter accessions MAY be used for optional columns following the format: opt{identifier}_cv_{accession}_\\{parameter name}. Spaces within the parameter’s name MUST be replaced by ‘_’.\n', - ) - comment: Optional[List[Comment]] = [] - - -class MzTab(BaseModel): - metadata: Metadata - smallMoleculeSummary: List[SmallMoleculeSummary] = Field( - ..., - description='The small molecule section is table-based. The small molecule section MUST always come after the metadata section. All table columns MUST be Tab separated. There MUST NOT be any empty cells; missing values MUST be reported using “null” for columns where Is Nullable = “True”.\n\nEach row of the small molecule section is intended to report one final result to be communicated in terms of a molecule that has been quantified. In many cases, this may be the molecule of biological interest, although in some cases, the final result could be a derivatized form as appropriate – although it is desirable for the database identifier(s) to reference to the biological (non-derivatized) form. 
In general, different adduct forms would generally be reported in the Small Molecule Feature section.\n\nThe order of columns MUST follow the order specified below.\n\nAll columns are MANDATORY except for “opt_” columns.\n', - min_items=1, - ) - smallMoleculeFeature: Optional[List[SmallMoleculeFeature]] = Field( - ..., - description='The small molecule feature section is table-based, representing individual MS regions (generally considered to be the elution profile for all isotopomers formed from a single charge state of a molecule), that have been measured/quantified. However, for approaches that quantify individual isotopomers e.g. stable isotope labelling/flux studies, then each SMF row SHOULD represent a single isotopomer.\n\nDifferent adducts or derivatives and different charge states of individual molecules should be reported as separate SMF rows.\n\nThe small molecule feature section MUST always come after the Small Molecule Table. All table columns MUST be Tab separated. There MUST NOT be any empty cells. Missing values MUST be reported using “null”.\n\nThe order of columns MUST follow the order specified below.\n\nAll columns are MANDATORY except for “opt_” columns.\n', - ) - smallMoleculeEvidence: Optional[List[SmallMoleculeEvidence]] = Field( - ..., - description='The small molecule evidence section is table-based, representing evidence for identifications of small molecules/features, from database search or any other process used to give putative identifications to molecules. In a typical case, each row represents one result from a single search or intepretation of a piece of evidence e.g. a database search with a fragmentation spectrum. Multiple results from a given input data item (e.g. one fragment spectrum) SHOULD share the same value under evidence_input_id.\n\nThe small molecule evidence section MUST always come after the Small Molecule Feature Table. All table columns MUST be Tab separated. There MUST NOT be any empty cells. 
Missing values MUST be reported using “null”.\n\nThe order of columns MUST follow the order specified below.\n\nAll columns are MANDATORY except for “opt_” columns.\n', - ) - comment: Optional[List[Comment]] = Field( - [], - description='Comment lines can be placed anywhere in an mzTab file. These lines must start with the three-letter code COM and are ignored by most parsers. Empty lines can also occur anywhere in an mzTab file and are ignored.\n', - ) diff --git a/streamlit_app/pages/1_File_Import.py b/streamlit_app/pages/1_File_Import.py deleted file mode 100644 index 4f6229e..0000000 --- a/streamlit_app/pages/1_File_Import.py +++ /dev/null @@ -1,63 +0,0 @@ -import streamlit as st -import pandas as pd - -st.set_page_config( - layout="wide", - page_title="File Import - FAIR MS Library Curation Editor", - #page_icon="assets/favicon.ico", - menu_items={ - 'Get Help': 'https://github.com/mzmine/biohack23_p15', - 'Report a bug': "https://github.com/mzmine/biohack23_p15/issues/new/choose", - 'About': "# This is the creation and curation wizard for FAIR MS Libraries." - } -) - -st.markdown("## File Import") -st.markdown("Please select an Excel file to upload. The file should contain one or more sheets. Each sheet should contain sample columns, detailing factors of each individual sample (rows). 
Lipid identities are the column headers of the non-sample columns, quantities should be reported in the cells.") - -uploaded_file = st.file_uploader("Choose a file", ) -if uploaded_file is not None: - print(uploaded_file) - st.session_state['uploaded_file'] = uploaded_file - -if 'uploaded_file' in st.session_state and st.session_state['uploaded_file'] is not None: - print("Uploaded file:", st.session_state['uploaded_file']) - uploaded_file = st.session_state['uploaded_file'] - with st.spinner('Loading data...'): - datasets = {} - if 'datasets' in st.session_state: - datasets = st.session_state['datasets'] - else: - st.session_state['datasets'] = datasets - - xl = pd.ExcelFile(uploaded_file) - sheets = xl.sheet_names - for sheet in sheets: - if sheet not in datasets: - df = pd.read_excel(uploaded_file, sheet_name=sheet) - datasets[sheet] = df - - st.markdown("## Preview Sheets") - sheet_selector = st.selectbox( - "Select a sheet", - sheets - ) - if sheet_selector is not None and sheet_selector in datasets: - rowsMetricColumn, columnsMetricColumn = st.columns(2) - with rowsMetricColumn: - st.metric('Rows', datasets[sheet_selector].shape[0]) - with columnsMetricColumn: - st.metric('Columns', datasets[sheet_selector].shape[1]) - st.write(datasets[sheet_selector]) - - st.markdown("## Select Sheets as Datasets") - selected_sheets = st.multiselect( - 'Each selected sheet will be converted to a dataset', - sheets, - sheets - ) - st.session_state['datasets'] = datasets - st.session_state['selected_sheets'] = selected_sheets - - if 'datasets' not in st.session_state: - st.session_state['datasets'] = [] diff --git a/streamlit_app/pages/4_Library_Export.py b/streamlit_app/pages/4_Library_Export.py deleted file mode 100644 index f1e08c5..0000000 --- a/streamlit_app/pages/4_Library_Export.py +++ /dev/null @@ -1,34 +0,0 @@ -import streamlit as st - -st.set_page_config( - layout="wide", - page_title="Library Export - FAIR MS Library Curation Editor", - 
#page_icon="assets/favicon.ico", - menu_items={ - 'Get Help': 'https://github.com/mzmine/biohack23_p15', - 'Report a bug': "https://github.com/mzmine/biohack23_p15/issues/new/choose", - 'About': "# This is the creation and curation wizard for FAIR MS Libraries." - } -) - -st.markdown("## Conversion to MS Library Export Format") - -datasets = {} -if 'datasets' in st.session_state: - datasets = st.session_state['datasets'] - -metadata_parts = {} -if 'metadata_parts' in st.session_state: - metadata_parts = st.session_state['metadata_parts'] - -with st.form("conversion-settings", clear_on_submit=False): - if datasets == {}: - st.warning("Please upload a file to begin!") - - if metadata_parts == {}: - st.warning("Please enter metadata to begin!") - - submit_disabled = (metadata_parts == {} or datasets == {}) - convert = st.form_submit_button("Create MS Library XYZ file", disabled=submit_disabled) - if convert: - st.info("Exporting to XYZ format...")