diff --git a/.env_template b/.env_template new file mode 100644 index 00000000..23398c80 --- /dev/null +++ b/.env_template @@ -0,0 +1 @@ +HOSTNAME=de.metabolomics-usi.gnps2.org \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 00000000..c7945e35 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,29 @@ +name: Docker Build Test + +on: + push: + branches: + master + pull_request: + branches: + master + schedule: + - cron: '0 0 * * 1' + +jobs: + build-test: + runs-on: ubuntu-latest + strategy: + max-parallel: 4 + matrix: + python-version: [3.8] +# TODO: We probably should switch to using the Docker version. + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Build Docker + run: | + docker build . diff --git a/.github/workflows/loadtest.yml b/.github/workflows/loadtest.yml index 921ec010..74c04fb5 100644 --- a/.github/workflows/loadtest.yml +++ b/.github/workflows/loadtest.yml @@ -38,7 +38,7 @@ jobs: run: | export PATH="$HOME/miniconda/bin:$PATH" source ~/.bashrc - locust -f ./test/locustfile.py --headless -u 4 -r 10 \ - -H https://metabolomics-usi.ucsd.edu/ -t 120s + # locust -f ./test/locustfile.py --headless -u 4 -r 10 \ + # -H https://metabolomics-usi.ucsd.edu/ -t 120s diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..e8a8b6e4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "metabolomics_spectrum_resolver/mass-spec-package"] + path = metabolomics_spectrum_resolver/mass-spec-package + url = https://github.com/AkJay1722/mass-spec-package.git diff --git a/Dockerfile b/Dockerfile index fa855a97..f1eed3f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,31 @@ -FROM continuumio/miniconda3:4.8.2 +FROM ubuntu:22.04 MAINTAINER Mingxun Wang "mwang87@gmail.com" WORKDIR /app RUN apt-get update -y && \ apt-get install -y libxrender-dev && \ - apt-get install -y git-core -RUN conda create -y -n usi -c conda-forge -c bioconda -c defaults celery \ + apt-get install -y git-core libarchive-dev build-essential wget vim curl + +# Install Mamba +ENV CONDA_DIR /opt/conda +RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && /bin/bash ~/miniforge.sh -b -p /opt/conda +ENV PATH=$CONDA_DIR/bin:$PATH +RUN echo "export PATH=$CONDA_DIR:$PATH" >> ~/.bashrc + +RUN mamba create -y -n usi -c conda-forge -c bioconda -c defaults celery==5.3.6 \ dash=1.20.0 dash-bootstrap-components=0.9.2 flask gunicorn \ - joblib matplotlib numba numpy openssl qrcode rdkit requests \ - requests-cache scipy spectrum_utils werkzeug + joblib matplotlib==3.6.3 numba numpy openssl qrcode rdkit requests \ + requests-cache scipy spectrum_utils==0.3.5 werkzeug==2.0.0 + +# install redis with pypi +RUN /bin/bash -c 'source activate usi && pip install redis' + +# installing hash RUN /bin/bash -c 'source activate usi && pip install "git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python" && pip install celery-once' +# installing analytics +RUN /bin/bash -c 'source activate usi && pip install umami-analytics' + RUN echo "source activate usi" > ~/.bashrc COPY . /app diff --git a/Makefile b/Makefile index 5d5f5bc4..586efe7b 100644 --- a/Makefile +++ b/Makefile @@ -23,19 +23,19 @@ clear-cache: #Docker Compose server-compose-interactive: - docker-compose build - docker-compose up + docker-compose --compatibility build + docker-compose --compatibility up server-compose: - docker-compose build - docker-compose up -d + docker-compose --compatibility build + docker-compose --compatibility up -d server-compose-production-interactive: - docker-compose build + docker-compose --compatibility build docker-compose -f docker-compose.yml -f docker-compose-production.yml --compatibility up server-compose-production: - docker-compose build + docker-compose --compatibility build docker-compose -f docker-compose.yml -f docker-compose-production.yml --compatibility up -d attach: diff --git a/docker-compose-production.yml b/docker-compose-production.yml index 54234676..8e7399b6 100644 --- a/docker-compose-production.yml +++ b/docker-compose-production.yml @@ -5,29 +5,64 @@ services: - default - nginx-net environment: - VIRTUAL_HOST: metabolomics-usi.ucsd.edu,metabolomics-usi.gnps2.org - VIRTUAL_PORT: 5087 - LETSENCRYPT_HOST: metabolomics-usi.ucsd.edu,metabolomics-usi.gnps2.org + VIRTUAL_HOST: ${HOSTNAME:-metabolomics-usi.gnps2.org} + VIRTUAL_PORT: 5000 + LETSENCRYPT_HOST: ${HOSTNAME:-metabolomics-usi.gnps2.org} LETSENCRYPT_EMAIL: mwang87@gmail.com command: /app/run_server.sh deploy: resources: limits: memory: 16000M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" + + metabolomicsusi-api1: + networks: + - default + - nginx-net + environment: + VIRTUAL_HOST: ${HOSTNAME:-api.metabolomics-usi.gnps2.org} + VIRTUAL_PORT: 5000 + LETSENCRYPT_HOST: ${HOSTNAME:-api.metabolomics-usi.gnps2.org} + LETSENCRYPT_EMAIL: mwang87@gmail.com + command: /app/run_server.sh + deploy: + resources: + limits: + memory: 16000M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" metabolomicsusi-worker: deploy: resources: limits: memory: 16000M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" metabolomicsusi-redis: deploy: resources: limits: - memory: 4000M + memory: 8000M + logging: + driver: "json-file" + options: + max-size: "10m" + max-file: "3" networks: nginx-net: external: - name: nginx-net \ No newline at end of file + name: nginx-net diff --git a/docker-compose.yml b/docker-compose.yml index a0432e7a..38b67f4d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -12,8 +12,39 @@ services: - ./logs/:/app/logs:rw networks: - default - restart: on-failure + restart: always command: /app/run_dev_server.sh + labels: + autoheal: true + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/heartbeat"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + + metabolomicsusi-api1: + build: + context: . + dockerfile: Dockerfile + container_name: metabolomicsusi-api1 + ports: + - "5088:5000" + volumes: + - ./tmp:/app/tmp:rw + - ./logsapi/:/app/logs:rw + networks: + - default + restart: always + command: /app/run_dev_server.sh + labels: + autoheal: true + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:5000/heartbeat"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s metabolomicsusi-worker: build: @@ -24,7 +55,7 @@ services: - ./tmp:/app/tmp:rw - ./logs:/app/logs:rw command: /app/run_worker.sh - restart: on-failure + restart: always depends_on: - metabolomicsusi-redis networks: @@ -33,10 +64,11 @@ services: metabolomicsusi-redis: container_name: metabolomicsusi-redis - image: redis + #image: valkey/valkey:alpine3.20 + image: redis:alpine networks: - default - restart: on-failure + restart: always networks: nginx-net: diff --git a/logsapi/.gitkeep b/logsapi/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/metabolomics_spectrum_resolver/dashinterface.py b/metabolomics_spectrum_resolver/dashinterface.py index 5069ef29..9e5c8695 100644 --- a/metabolomics_spectrum_resolver/dashinterface.py +++ b/metabolomics_spectrum_resolver/dashinterface.py @@ -39,6 +39,8 @@ gtag('config', 'UA-8412213-8'); + + {%metas%} {%title%} @@ -59,10 +61,10 @@ children=[ dbc.NavbarBrand( html.Img( - src="https://gnps-cytoscape.ucsd.edu/static/img/GNPS_logo.png", + src="https://gnps2.org/static/img/logo.png", width="120px", ), - href="https://gnps.ucsd.edu", + href="https://gnps2.org", ), dbc.Nav( [ @@ -426,7 +428,7 @@ dbc.CardHeader(html.H5("Contributors")), dbc.CardBody( [ - "Mingxun Wang, PhD – UC San Diego", + "Mingxun Wang, PhD – UC Riverside", html.Br(), "Wout Bittremieux, PhD – UC San Diego", html.Br(), diff --git a/metabolomics_spectrum_resolver/mass-spec-package b/metabolomics_spectrum_resolver/mass-spec-package new file mode 160000 index 00000000..c1cc52a7 --- /dev/null +++ b/metabolomics_spectrum_resolver/mass-spec-package @@ -0,0 +1 @@ +Subproject commit c1cc52a73645122d2eaac3ba52b1eeb346c36dc6 diff --git a/metabolomics_spectrum_resolver/parsing.py b/metabolomics_spectrum_resolver/parsing.py index f488e4c8..5c748f33 100644 --- a/metabolomics_spectrum_resolver/parsing.py +++ b/metabolomics_spectrum_resolver/parsing.py @@ -4,17 +4,24 @@ from typing import Tuple import requests +import pandas as pd +from io import StringIO import urllib.parse import spectrum_utils.spectrum as sus import splash from metabolomics_spectrum_resolver.error import UsiError +from metabolomics_spectrum_resolver.zenodo_mzml_repo import mzml_repo + + timeout = 45 # seconds MS2LDA_SERVER = "http://ms2lda.org/basicviz/" MOTIFDB_SERVER = "http://ms2lda.org/motifdb/" -MASSBANK_SERVER = "https://massbank.us/rest/spectra/" +MONA_SERVER = "https://massbank.us/rest/spectra/" +MASSBANKEUROPE_SERVER = "https://msbi.ipb-halle.de/MassBank-api/records/" +NORMAN_SERVER = "http://server.norman-data.eu:8770/getScan" # USI specification: http://www.psidev.info/usi usi_pattern = re.compile( @@ -42,8 +49,8 @@ r"^mzspec" # collection identifier # Unofficial proteomics spectral library identifier: MASSIVEKB - # Metabolomics collection identifiers: GNPS, MASSBANK, MS2LDA, MOTIFDB - r":(MASSIVEKB|GNPS|MASSBANK|MS2LDA|MOTIFDB)" + # Metabolomics collection identifiers: GNPS, MASSBANK, MS2LDA, MOTIFDB, MTBLS, ST + r":(MASSIVEKB|GNPS|GNPS2|MASSBANK|MS2LDA|MOTIFDB|TINYMASS|MTBLS\d+|ST\d{6}|ZENODO-\d+|NORMAN-[0-9a-fA-F-]+|)" # msRun identifier r":(.*)" # index flag @@ -90,6 +97,9 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: Tuple[sus.MsmsSpectrum, str, str] A tuple of the `MsmsSpectrum`, its source link, and its SPLASH. """ + # Very basic cleanup + usi = str(usi).strip() + match = _match_usi(usi) try: collection = match.group(1).lower() @@ -100,7 +110,6 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: # changes, be sure to change this logic. if ( annotation is not None - or collection.startswith("msv") or collection.startswith("pxd") or collection.startswith("pxl") or collection.startswith("rpxd") @@ -108,14 +117,33 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: or collection == "massive" ): spectrum, source_link = _parse_msv_pxd(usi) + elif collection.startswith("msv"): + # Lets try to use GNPS2 for this first + try: + spectrum, source_link = _parse_gnps2(usi) + except: + spectrum, source_link = _parse_msv_pxd(usi) elif collection == "gnps": spectrum, source_link = _parse_gnps(usi) + elif collection == "gnps2": + spectrum, source_link = _parse_gnps2(usi) + elif collection.startswith("mtbls"): + # Since they don't have their own resolver, we'll go here to GNPS2 for now + spectrum, source_link = _parse_gnps2(usi) elif collection == "massbank": spectrum, source_link = _parse_massbank(usi) elif collection == "ms2lda": spectrum, source_link = _parse_ms2lda(usi) elif collection == "motifdb": spectrum, source_link = _parse_motifdb(usi) + elif collection.startswith("st"): + spectrum, source_link = _parse_metabolomics_workbench(usi) + elif collection.startswith("tinymass"): + spectrum, source_link = _parse_tinymass(usi) + elif collection.startswith("norman"): + spectrum, source_link = _parse_norman(usi) + elif collection.startswith("zenodo"): + spectrum, source_link = _parse_zenodo(usi) else: raise UsiError(f"Unknown USI collection: {match.group(1)}", 400) splash_key = splash_builder.splash( @@ -318,6 +346,14 @@ def _parse_gnps(usi: str) -> Tuple[sus.MsmsSpectrum, str]: else: return _parse_gnps_library(usi) +def _parse_gnps2(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + ms_run = match.group(2) + if ms_run.lower().startswith("task"): + return _parse_gnps2_task(usi) + else: + # We are likely dealing with a dataset on the GNPS2 side + return _parse_gnps2_dataset(usi) # Parse GNPS clustered spectra in Molecular Networking. def _parse_gnps_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]: @@ -358,6 +394,157 @@ def _parse_gnps_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]: raise UsiError("Unknown GNPS task USI", 404) +# Parse GNPS2 task spectra +def _parse_gnps2_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + gnps_task_match = gnps_task_pattern.match(match.group(2)) + if gnps_task_match is None: + raise UsiError("Incorrectly formatted GNPS2 task", 400) + task = gnps_task_match.group(1) + filename = gnps_task_match.group(2) + index_flag = match.group(3) + + if not (index_flag.lower() == "scan" or index_flag.lower() == "nativeid"): + raise UsiError("Currently supported GNPS2 TASK index flags: scan and nativeId", 400) + + scan = match.group(4) + + # We will try in order these GNPS2 URLs to see if the task is actually there + gnps2_server_url_list = [ + "https://gnps2.org", + "https://beta.gnps2.org", + "http://dev2.gnps2.org", + "https://de.gnps2.org", + "https://br.gnps2.org", + "https://kr.gnps2.org", + "https://gnps2.jgi.doe.gov", + ] + + for gnps2server_url in gnps2_server_url_list: + try: + request_url = ( + f"{gnps2server_url}/spectrumpeaks?format=json&usi={usi}" + ) + lookup_request = requests.get(request_url, timeout=timeout) + lookup_request.raise_for_status() + spectrum_dict = lookup_request.json() + mz, intensity = zip(*spectrum_dict["peaks"]) + source_link = ( + f"{gnps2server_url}/status?task={task}" + ) + if "precursor_mz" in spectrum_dict: + precursor_mz = float(spectrum_dict["precursor_mz"]) + charge = 0 + else: + precursor_mz, charge = 0, 0 + + spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity) + return spectrum, source_link + except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError): + pass + + raise UsiError("Unknown GNPS2 task USI", 404) + +def _parse_gnps2_dataset(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + dataset_identifier = match.group(1) + index_flag = match.group(3) + scan = match.group(4) + + if not (index_flag.lower() == "scan" or index_flag.lower() == "nativeid"): + raise UsiError("Currently supported GNPS2 Dataset index flags: scan and nativeId", 400) + + try: + request_url = ( + f"https://gnps2.org/spectrumpeaks?format=json&usi={usi}" + ) + lookup_request = requests.get(request_url, timeout=timeout) + lookup_request.raise_for_status() + spectrum_dict = lookup_request.json() + mz, intensity = zip(*spectrum_dict["peaks"]) + + if "MTBLS" in dataset_identifier: + source_link = ( + f"https://www.ebi.ac.uk/metabolights/editor/{dataset_identifier}/descriptors" + ) + elif "MSV" in dataset_identifier: + source_link = ( + f"https://massive.ucsd.edu/ProteoSAFe/" + f"QueryMSV?id={dataset_identifier}" + ) + + if "precursor_mz" in spectrum_dict: + precursor_mz = float(spectrum_dict["precursor_mz"]) + charge = 0 + else: + precursor_mz, charge = 0, 0 + + spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity) + return spectrum, source_link + except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError): + raise UsiError("Unknown GNPS2 Dataset USI", 404) + +# parsing from Zenodo +def _parse_zenodo(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + zenodo_id = match.group(1).split("-")[-1] + filename = match.group(2) + index_flag = match.group(3) + if index_flag.lower() == "scan": + scan = match.group(4) + + zenodo_obj = mzml_repo(zenodo_id) + zenodo_obj.partial_indexing = False + scan_obj = zenodo_obj.get_scan(filename, int(scan)) + + # get peaks + intensity_list = scan_obj["intensities"] + mz_list = scan_obj["mz"] + charge = scan_obj["charge"] + precursor_mz = scan_obj["precursor_mz"] + + try: + charge = int(charge) + except: + charge = 0 + + try: + precursor_mz = float(precursor_mz) + except: + precursor_mz = 0 + + source_link = f"https://zenodo.org/record/{zenodo_id}" + + spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz_list, intensity_list) + + return spectrum, source_link + +# Parse TINYMASS task spectra +def _parse_tinymass(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + + try: + request_url = ( + f"https://tinymass.gnps2.org/resolve?usi={usi}" + ) + lookup_request = requests.get(request_url, timeout=timeout) + lookup_request.raise_for_status() + spectrum_dict = lookup_request.json() + mz, intensity = zip(*spectrum_dict["peaks"]) + source_link = ( + f"https://tinymass.gnps2.org/resolve?usi={usi}" + ) + if "precursor" in spectrum_dict: + precursor_mz = float(spectrum_dict["precursor"]) + charge = 0 + else: + precursor_mz, charge = 0, 0 + + spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity) + return spectrum, source_link + except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError): + raise UsiError("Unknown Tiny Mass task USI", 404) + # Parse GNPS library. def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]: match = _match_usi(usi) @@ -369,8 +556,8 @@ def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]: index = match.group(4) try: request_url = ( - f"https://gnps.ucsd.edu/ProteoSAFe/" - f"SpectrumCommentServlet?SpectrumID={index}" + f"https://external.gnps2.org/" + f"gnpsspectrum?SpectrumID={index}" ) lookup_request = requests.get(request_url, timeout=timeout) lookup_request.raise_for_status() @@ -407,6 +594,23 @@ def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]: # Parse MassBank entry. def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + """ Parse a MassBank or MoNA USI and return the corresponding spectrum/source url. + + MassBank USIs are of the form: MSBNK-[A-Za-z0-9_]{1,32}-[A-Z0-9_]{1,64} + + Fall back to MoNA if MassBank EU fails to respond. Note that partial MassBank ids + (e.g., SM858102) will only resolve to MoNA. + + Parameters + ---------- + usi : str + The USI to be parsed. + + Returns + ------- + Tuple[sus.MsmsSpectrum, str] + The parsed spectrum and the source link. + """ match = _match_usi(usi) index_flag = match.group(3) if index_flag.lower() != "accession": @@ -416,16 +620,63 @@ def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]: index = match.group(4) # Clean up the new MassBank accessions if necessary. massbank_accession = re.match( - r"MSBNK-[A-Z0-9_]{1,32}-([A-Z0-9_]{1,64})", index + # See https://github.com/MassBank/MassBank-web/blob/main/Documentation/MassBankRecordFormat.md#211-accession + r"(MSBNK-[A-Za-z0-9_]{1,32}-[A-Z0-9_]{1,64})", index ) if massbank_accession is not None: - index = massbank_accession.group(1) + # It's certiainly MassBank EU/JP + try: + return _parse_massbankEurope(usi) + + except UsiError: + pass + + # Either MassBank EU Failed or it's a MoNA entry, fallback to MoNA. + # Let the exception propagate if it fails + return _parse_mona(usi) + + +# Parse MONA entry. +def _parse_mona(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + """ Parse a MONA USI and return the corresponding spectrum. Performs a web request to + MONA_SERVER. + + Parameters + ---------- + usi : str + The USI to be parsed. + + Globals + ------- + MONA_SERVER : str + The base URL for the MONA server. + + Returns + ------- + Tuple[sus.MsmsSpectrum, str] + The parsed spectrum and the source link. + + Raises + ------ + UsiError + If the USI could not be parsed because it is incorrectly formatted. + """ + match = _match_usi(usi) + index_flag = match.group(3) + if index_flag.lower() != "accession": + raise UsiError( + "Currently supported MassBank index flags: accession", 400 + ) + + index = match.group(4) + try: lookup_request = requests.get( - f"{MASSBANK_SERVER}{index}", timeout=timeout + f"{MONA_SERVER}{index}", timeout=timeout ) lookup_request.raise_for_status() spectrum_dict = lookup_request.json() + mz, intensity = [], [] for peak in spectrum_dict["spectrum"].split(): peak_mz, peak_intensity = peak.split(":") @@ -437,14 +688,82 @@ def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]: precursor_mz = float(metadata["value"]) break source_link = ( - f"https://massbank.eu/MassBank/" f"RecordDisplay.jsp?id={index}" + f"https://massbank.us/spectra/display/{index}" ) spectrum = sus.MsmsSpectrum(usi, precursor_mz, 0, mz, intensity) + return spectrum, source_link + except requests.exceptions.HTTPError: raise UsiError("Unknown MassBank USI", 404) +# Parse MassBank entry. +def _parse_massbankEurope(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + """ Parse a MassBank[EU|JP] USI and return the corresponding spectrum. Performs a web request to + MassBank Server. + + Parameters + ---------- + usi : str + The USI to be parsed. + + Globals + ------- + MassBank Server : str + The base URL for the MONA server. + + Returns + ------- + Tuple[sus.MsmsSpectrum, str] + The parsed spectrum and the source link. + + Raises + ------ + UsiError + If the USI could not be parsed because it is incorrectly formatted. + """ + match = _match_usi(usi) + index_flag = match.group(3) + if index_flag.lower() != "accession": + raise UsiError( + "Currently supported MassBank index flags: accession", 400 + ) + + index = match.group(4) + + try: + # Try requesting from massbankeurope first + lookup_request = requests.get( + f"{MASSBANKEUROPE_SERVER}{index}", timeout=timeout + ) + + lookup_request.raise_for_status() + spectrum_dict = lookup_request.json() + + # If request is successful we know it was massbankeurope and parse accordingly + peaks = spectrum_dict["peak"]["peak"]["values"] + + mz = [peak["mz"] for peak in peaks] + intensity = [peak["intensity"] for peak in peaks] + + precursor_mz = next( + (float(item["value"]) for item in spectrum_dict['mass_spectrometry']['focused_ion'] if item["subtag"] == "PRECURSOR_M/Z"), + 0 + ) + + source_link = ( + f"https://massbank.eu/MassBank/" f"RecordDisplay?id={index}" + ) + + spectrum = sus.MsmsSpectrum(usi, precursor_mz, 0, mz, intensity) + return spectrum, source_link + + + #show what error + except requests.exceptions.HTTPError: + raise UsiError("Unknown MassBank USI", 404) + # Parse MS2LDA from ms2lda.org. def _parse_ms2lda(usi: str) -> Tuple[sus.MsmsSpectrum, str]: @@ -490,29 +809,44 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]: scan = match.group(4) try: lookup_url = ( - f"https://massive.ucsd.edu/ProteoSAFe/" + f"https://proteomics3.ucsd.edu/ProteoSAFe/" f"QuerySpectrum?id={urllib.parse.quote_plus(usi)}" ) lookup_request = requests.get(lookup_url, timeout=timeout) - lookup_request.raise_for_status() + try: + lookup_request.raise_for_status() + except: + lookup_url = ( + f"https://proteomics3.ucsd.edu/ProteoSAFe/" + f"QuerySpectrum?id={urllib.parse.quote_plus(usi)}" + ) + lookup_request = requests.get(lookup_url, timeout=timeout) + lookup_request.raise_for_status() + lookup_json = lookup_request.json() for spectrum_file in lookup_json["row_data"]: + # Checking if its an actual file we can resolve or if MSV will go to PX directly if any( spectrum_file["file_descriptor"].lower().endswith(extension) for extension in ["mzml", "mzxml", "mgf"] - ): - request_url = ( + ) or spectrum_file["file_descriptor"].startswith("f.ProteomeCentral"): + file_descriptor = spectrum_file['file_descriptor'] + if file_descriptor.startswith("f."): + file_descriptor = file_descriptor[2:] + + peaks_request_url = ( f"https://massive.ucsd.edu/ProteoSAFe/" f"DownloadResultFile?" f"task=4f2ac74ea114401787a7e96e143bb4a1&" f"invoke=annotatedSpectrumImageText&block=0&file=FILE->" - f"{urllib.parse.quote(spectrum_file['file_descriptor'])}" + f"{urllib.parse.quote(file_descriptor)}" f"&scan={scan}&peptide=*..*&force=false&" f"format=JSON&uploadfile=True" ) + try: spectrum_request = requests.get( - request_url, timeout=timeout + peaks_request_url, timeout=timeout ) spectrum_request.raise_for_status() spectrum_dict = spectrum_request.json() @@ -569,6 +903,7 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]: return spectrum, source_link except requests.exceptions.HTTPError: + raise pass raise UsiError("Unsupported/unknown USI", 404) @@ -596,6 +931,57 @@ def _parse_motifdb(usi: str) -> Tuple[sus.MsmsSpectrum, str]: raise UsiError("Unknown MOTIFDB USI", 404) +# Parse GNPS library. +def _parse_metabolomics_workbench(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + match = _match_usi(usi) + accession = match.group(1) + filename = match.group(2) + index_flag = match.group(3) + index = match.group(4) + + if index_flag.lower() != "scan": + raise UsiError( + "Currently supported MW index flags: scan", 400 + ) + try: + request_url = ( + f"https://www.metabolomicsworkbench.org/" + f"data/ms2.php?A={accession}.zip" + f"&F={urllib.parse.quote_plus(filename)}&S={index}" + ) + + # TODO: Do some extra exception handling if we don't find the filename directly. We might need to his another API to get the full filename + # Given the just the basename + + lookup_request = requests.get(request_url, timeout=timeout) + lookup_request.raise_for_status() + + response_text = lookup_request.text + response_text = (response_text.replace("
", "").replace("

", "").lstrip().rstrip()) + + # Parsing the MW Response + precursor_mz = float(response_text.split("\n")[0].split(":")[-1].replace("\"", "")) + charge = int(response_text.split("\n")[2].split(":")[-1].replace("\"", "")) + peaks_df = pd.read_csv(StringIO(response_text), sep=r" +", skiprows=4) + mz = list(peaks_df["m/z"]) + intensity = list(peaks_df["intensity"]) + + source_link = ( + f"https://www.metabolomicsworkbench.org/" + f"data/DRCCMetadata.php?Mode=Study&StudyID={accession}&StudyType=MS&ResultType=1" + ) + + spectrum = sus.MsmsSpectrum( + usi, + float(precursor_mz), + int(charge), + mz, + intensity, + ) + return spectrum, source_link + except requests.exceptions.HTTPError: + raise UsiError("Unknown MW USI", 404) + def _parse_sequence(peptide: str, peptide_clean: str) -> Tuple[str, str, list]: # Parse out gapped sequence (e.g. X+129.04259), faking it # with Glycine as the base residue and adding more mods to @@ -621,3 +1007,74 @@ def _parse_sequence(peptide: str, peptide_clean: str) -> Tuple[str, str, list]: modifications[i] = float(match.group()) previous_mod_len += found_len return peptide, peptide_clean, modifications + +def _parse_norman(usi: str) -> Tuple[sus.MsmsSpectrum, str]: + NORMAN_FILES_BASE = "https://files.dsfp.norman-data.eu/" + + match = _match_usi(usi) + _accession = match.group(1) # not used + file_path = match.group(2) # relative path, e.g. "webform/sample/.../file.mzML" + index_flag = match.group(3) + scan_no = match.group(4) + + if index_flag.lower() != "scan": + raise UsiError("Currently supported index flag: scan", 400) + + # Construct full URL from path + file_url = f"{NORMAN_FILES_BASE}{file_path.lstrip('/')}" + print(f"[DEBUG] Constructed NORMAN file URL: {file_url}") + + if not file_url.lower().endswith(".mzml"): + raise UsiError("NORMAN file URL must point to an .mzML file.", 400) + + # The service expects an URL-encoded file_path (not the full URL!) in the query parameters + encoded_path = urllib.parse.quote_plus(file_url, safe=":/") + params = { + "file_path": encoded_path, + "scan_number": str(scan_no), + } + print(f"[DEBUG] Request params: {params}") + + try: + r = requests.post(NORMAN_SERVER, params=params, headers={"accept": "*/*"}, data="") + print(f"[DEBUG] Requesting: {r.url}") # Shows full request URL with params + + r.raise_for_status() + + payload = r.json() + if not isinstance(payload, dict): + raise UsiError("Unexpected response format (not a JSON object).", 502) + + precursor_list = payload.get("precursormz", []) + try: + precursor_mz = float(precursor_list[0]) if precursor_list else 0.0 + except Exception: + precursor_mz = 0.0 + + charge = 0 # not provided by API + spec = payload.get("spectrum", []) + if not isinstance(spec, list) or not spec: + raise UsiError("No peaks in NORMAN scan response.", 502) + + try: + mz = [float(p["mz"]) for p in spec] + intensity = [float(p["intensity"]) for p in spec] + except Exception as e: + raise UsiError(f"Malformed peaks in NORMAN scan response: {e}", 502) + + spectrum = sus.MsmsSpectrum( + usi, + precursor_mz, + charge, + mz, + intensity, + ) + return spectrum, file_url # return the constructed full URL + + except requests.exceptions.HTTPError as e: + status = getattr(e.response, "status_code", 502) + raise UsiError(f"NORMAN scan lookup failed (HTTP {status}).", status) + except ValueError as e: + raise UsiError(f"NORMAN scan parsing error (invalid JSON): {e}", 502) + except Exception as e: + raise UsiError(f"NORMAN scan parsing error: {e}", 502) diff --git a/metabolomics_spectrum_resolver/tasks.py b/metabolomics_spectrum_resolver/tasks.py index 24817d3e..77418c0f 100644 --- a/metabolomics_spectrum_resolver/tasks.py +++ b/metabolomics_spectrum_resolver/tasks.py @@ -9,7 +9,7 @@ import spectrum_utils.spectrum as sus from metabolomics_spectrum_resolver import drawing, parsing - +from metabolomics_spectrum_resolver import tasks_analytics memory = joblib.Memory("tmp/joblibcache", verbose=0) cached_parse_usi = memory.cache(parsing.parse_usi) @@ -110,6 +110,16 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: A tuple of (i) the `MsmsSpectrum`, (ii) its source link, and (iii) its SPLASH. """ + + # We are going to do the analytics now + tasks_analytics.task_analytics_event( + "parse_usi_or_spectrum" + ) + + # Debugging logging + import sys + print("Parsing {}\n".format(usi), file=sys.stderr, flush=True) + # First attempt to schedule with Celery. try: return _task_parse_usi.apply_async(args=(usi,)).get() @@ -120,7 +130,7 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: return parsing.parse_usi(usi) -@celery_instance.task(time_limit=30, base=celery_once.QueueOnce) +@celery_instance.task(time_limit=45, base=celery_once.QueueOnce) def _task_parse_usi_or_spectrum( usi: str, spectrum: dict ) -> Tuple[sus.MsmsSpectrum, str, str]: @@ -142,6 +152,7 @@ def _task_parse_usi_or_spectrum( A tuple of (i) the `MsmsSpectrum`, (ii) its source link, and (iii) its SPLASH. """ + # noinspection PyTypeChecker return cached_parse_usi_or_spectrum(usi, spectrum) diff --git a/metabolomics_spectrum_resolver/tasks_analytics.py b/metabolomics_spectrum_resolver/tasks_analytics.py new file mode 100644 index 00000000..a4b3d277 --- /dev/null +++ b/metabolomics_spectrum_resolver/tasks_analytics.py @@ -0,0 +1,44 @@ +import io +import sys +from typing import Any, Tuple + +import celery +import umami + +celery_instance = celery.Celery( + "tasks_analytics", + backend="redis://metabolomicsusi-redis", + broker="redis://metabolomicsusi-redis", +) + +umami.set_url_base("https://analytics-api.gnps2.org/") +umami.set_website_id('2e8b3719-51ec-4786-9b29-3e9198c31ea5') +umami.set_hostname('analytics-api.gnps2.org') + +celery_instance.conf.task_routes = { + "metabolomics_spectrum_resolver.tasks_analytics._task_analytics_event": { + "queue": "worker-analytics" + }, +} + +def task_analytics_event(event_type: str) -> str: + """ + Task to log an analytics event using umami. + + Args: + event_type (str): The type of event to log. + + Returns: + str: Confirmation message indicating the event was sent. + """ + + _task_analytics_event.apply_async( + args=([event_type]) + ) + + return f"Event '{event_type}' logged." + +@celery_instance.task(time_limit=10) +def _task_analytics_event(event_type) -> str: + umami.new_event(event_name=event_type) + diff --git a/metabolomics_spectrum_resolver/templates/homepage.html b/metabolomics_spectrum_resolver/templates/homepage.html index 2e827bd8..3adeaf95 100644 --- a/metabolomics_spectrum_resolver/templates/homepage.html +++ b/metabolomics_spectrum_resolver/templates/homepage.html @@ -80,24 +80,8 @@

diff --git a/metabolomics_spectrum_resolver/templates/minimal.html b/metabolomics_spectrum_resolver/templates/minimal.html index 0e1fd4e2..a6be389d 100644 --- a/metabolomics_spectrum_resolver/templates/minimal.html +++ b/metabolomics_spectrum_resolver/templates/minimal.html @@ -50,7 +50,7 @@
diff --git a/metabolomics_spectrum_resolver/views.py b/metabolomics_spectrum_resolver/views.py index 17becf03..7be99508 100644 --- a/metabolomics_spectrum_resolver/views.py +++ b/metabolomics_spectrum_resolver/views.py @@ -39,7 +39,6 @@ def render_homepage(): return flask.render_template("homepage.html") - @blueprint.route("/contributors", methods=["GET"]) def render_contributors(): return flask.render_template("contributors.html") @@ -625,6 +624,10 @@ def generate_qr(): qr_bytes.seek(0) return flask.send_file(qr_bytes, "image/png") +@blueprint.route("/robot.txt") +def robot(): + # Disallow all + return "User-agent: *\nDisallow: /", 200 @blueprint.errorhandler(Exception) def render_error(error): diff --git a/metabolomics_spectrum_resolver/zenodo_mzml_repo.py b/metabolomics_spectrum_resolver/zenodo_mzml_repo.py new file mode 120000 index 00000000..e34961a1 --- /dev/null +++ b/metabolomics_spectrum_resolver/zenodo_mzml_repo.py @@ -0,0 +1 @@ +mass-spec-package/zenodo_mzml_repo.py \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fdddf450..62ea4144 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,5 @@ requests requests_cache scipy spectrum_utils -werkzeug -git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python +werkzeug==2.0.0 +git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python \ No newline at end of file diff --git a/run_worker.sh b/run_worker.sh index 2073a4fb..23601fbe 100755 --- a/run_worker.sh +++ b/run_worker.sh @@ -2,5 +2,9 @@ source activate usi export C_FORCE_ROOT="true" + +# Running an analytics worker +celery -A metabolomics_spectrum_resolver.tasks_analytics worker --concurrency=1 -Q worker-analytics --loglevel INFO --detach + #TODO: Make sure we don't run this worker as root -celery -A metabolomics_spectrum_resolver.tasks worker -l info --autoscale=12,1 -Q worker --max-tasks-per-child 10 --loglevel INFO +celery -A metabolomics_spectrum_resolver.tasks worker -l info --autoscale=32,1 -Q worker --max-tasks-per-child 10 --loglevel INFO diff --git a/test/test_unit.py b/test/test_unit.py index 98782bf9..91b402bb 100644 --- a/test/test_unit.py +++ b/test/test_unit.py @@ -218,7 +218,6 @@ def test_parse_motifdb(): parsing.parse_usi(usi.replace(":171163", ":this_index_does_not_exist")) assert exc_info.value.error_code == 404 - def test_parse_timeout(): with unittest.mock.patch( "metabolomics_spectrum_resolver.parsing.requests.get", diff --git a/test/usi_test_data.py b/test/usi_test_data.py index b80eee07..08357aac 100644 --- a/test/usi_test_data.py +++ b/test/usi_test_data.py @@ -10,6 +10,8 @@ "mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00005436077", "mzspec:MASSBANK::accession:SM858102", "mzspec:MASSBANK::accession:MSBNK-AAFC-AC000646", + # New Massbank identifier with lowercase + "mzspec:MASSBANK::accession:MSBNK-Athens_Univ-AU259904", "mzspec:MS2LDA:TASK-190:accession:270684", "mzspec:MOTIFDB::accession:171163", "mzspec:MSV000082791:(-)-epigallocatechin:scan:2", @@ -29,6 +31,8 @@ "mzspec:MassIVE:TASK-f4b86b150a164ee4a440b661e97a7193-spectra/specs_ms.mgf:scan:287215:HPYFYAPELLF[-10.059]FAKR/3", # MassIVE Task USIs disguised as GNPS Task USIs "mzspec:GNPS:TASK-f4b86b150a164ee4a440b661e97a7193-spectra/specs_ms.mgf:scan:287215:HPYFYAPELLF[-10.059]FAKR/3", + # Metabolomics Workbench USIs + "mzspec:ST000003:StemCell+Data+and+Raw+Files/iPSC-T1R1:scan:3", # Legacy cases. "mzspec:GNPSTASK-c95481f0c53d42e78a61bf899e9f9adb:spectra/specs_ms.mgf:scan:1943", "mzspec:GNPSTASK-64b22841ab3548f987b3cfc18696a581:spectra/specs_ms.mgf:scan:1469",