diff --git a/.env_template b/.env_template
new file mode 100644
index 00000000..23398c80
--- /dev/null
+++ b/.env_template
@@ -0,0 +1 @@
+HOSTNAME=de.metabolomics-usi.gnps2.org
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..c7945e35
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,29 @@
+name: Docker Build Test
+
+on:
+ push:
+ branches:
+ master
+ pull_request:
+ branches:
+ master
+ schedule:
+ - cron: '0 0 * * 1'
+
+jobs:
+ build-test:
+ runs-on: ubuntu-latest
+ strategy:
+ max-parallel: 4
+ matrix:
+ python-version: [3.8]
+# TODO: We probably should switch to using the Docker version.
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Build Docker
+ run: |
+ docker build .
diff --git a/.github/workflows/loadtest.yml b/.github/workflows/loadtest.yml
index 921ec010..74c04fb5 100644
--- a/.github/workflows/loadtest.yml
+++ b/.github/workflows/loadtest.yml
@@ -38,7 +38,7 @@ jobs:
run: |
export PATH="$HOME/miniconda/bin:$PATH"
source ~/.bashrc
- locust -f ./test/locustfile.py --headless -u 4 -r 10 \
- -H https://metabolomics-usi.ucsd.edu/ -t 120s
+ # locust -f ./test/locustfile.py --headless -u 4 -r 10 \
+ # -H https://metabolomics-usi.ucsd.edu/ -t 120s
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 00000000..e8a8b6e4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "metabolomics_spectrum_resolver/mass-spec-package"]
+ path = metabolomics_spectrum_resolver/mass-spec-package
+ url = https://github.com/AkJay1722/mass-spec-package.git
diff --git a/Dockerfile b/Dockerfile
index fa855a97..f1eed3f5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,31 @@
-FROM continuumio/miniconda3:4.8.2
+FROM ubuntu:22.04
MAINTAINER Mingxun Wang "mwang87@gmail.com"
WORKDIR /app
RUN apt-get update -y && \
apt-get install -y libxrender-dev && \
- apt-get install -y git-core
-RUN conda create -y -n usi -c conda-forge -c bioconda -c defaults celery \
+ apt-get install -y git-core libarchive-dev build-essential wget vim curl
+
+# Install Mamba
+ENV CONDA_DIR /opt/conda
+RUN wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -O ~/miniforge.sh && /bin/bash ~/miniforge.sh -b -p /opt/conda
+ENV PATH=$CONDA_DIR/bin:$PATH
+RUN echo "export PATH=$CONDA_DIR:$PATH" >> ~/.bashrc
+
+RUN mamba create -y -n usi -c conda-forge -c bioconda -c defaults celery==5.3.6 \
dash=1.20.0 dash-bootstrap-components=0.9.2 flask gunicorn \
- joblib matplotlib numba numpy openssl qrcode rdkit requests \
- requests-cache scipy spectrum_utils werkzeug
+ joblib matplotlib==3.6.3 numba numpy openssl qrcode rdkit requests \
+ requests-cache scipy spectrum_utils==0.3.5 werkzeug==2.0.0
+
+# install redis with pypi
+RUN /bin/bash -c 'source activate usi && pip install redis'
+
+# installing hash
RUN /bin/bash -c 'source activate usi && pip install "git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python" && pip install celery-once'
+# installing analytics
+RUN /bin/bash -c 'source activate usi && pip install umami-analytics'
+
RUN echo "source activate usi" > ~/.bashrc
COPY . /app
diff --git a/Makefile b/Makefile
index 5d5f5bc4..586efe7b 100644
--- a/Makefile
+++ b/Makefile
@@ -23,19 +23,19 @@ clear-cache:
#Docker Compose
server-compose-interactive:
- docker-compose build
- docker-compose up
+ docker-compose --compatibility build
+ docker-compose --compatibility up
server-compose:
- docker-compose build
- docker-compose up -d
+ docker-compose --compatibility build
+ docker-compose --compatibility up -d
server-compose-production-interactive:
- docker-compose build
+ docker-compose --compatibility build
docker-compose -f docker-compose.yml -f docker-compose-production.yml --compatibility up
server-compose-production:
- docker-compose build
+ docker-compose --compatibility build
docker-compose -f docker-compose.yml -f docker-compose-production.yml --compatibility up -d
attach:
diff --git a/docker-compose-production.yml b/docker-compose-production.yml
index 54234676..8e7399b6 100644
--- a/docker-compose-production.yml
+++ b/docker-compose-production.yml
@@ -5,29 +5,64 @@ services:
- default
- nginx-net
environment:
- VIRTUAL_HOST: metabolomics-usi.ucsd.edu,metabolomics-usi.gnps2.org
- VIRTUAL_PORT: 5087
- LETSENCRYPT_HOST: metabolomics-usi.ucsd.edu,metabolomics-usi.gnps2.org
+ VIRTUAL_HOST: ${HOSTNAME:-metabolomics-usi.gnps2.org}
+ VIRTUAL_PORT: 5000
+ LETSENCRYPT_HOST: ${HOSTNAME:-metabolomics-usi.gnps2.org}
LETSENCRYPT_EMAIL: mwang87@gmail.com
command: /app/run_server.sh
deploy:
resources:
limits:
memory: 16000M
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "10m"
+ max-file: "3"
+
+ metabolomicsusi-api1:
+ networks:
+ - default
+ - nginx-net
+ environment:
+ VIRTUAL_HOST: ${HOSTNAME:-api.metabolomics-usi.gnps2.org}
+ VIRTUAL_PORT: 5000
+ LETSENCRYPT_HOST: ${HOSTNAME:-api.metabolomics-usi.gnps2.org}
+ LETSENCRYPT_EMAIL: mwang87@gmail.com
+ command: /app/run_server.sh
+ deploy:
+ resources:
+ limits:
+ memory: 16000M
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "10m"
+ max-file: "3"
metabolomicsusi-worker:
deploy:
resources:
limits:
memory: 16000M
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "10m"
+ max-file: "3"
metabolomicsusi-redis:
deploy:
resources:
limits:
- memory: 4000M
+ memory: 8000M
+ logging:
+ driver: "json-file"
+ options:
+ max-size: "10m"
+ max-file: "3"
networks:
nginx-net:
external:
- name: nginx-net
\ No newline at end of file
+ name: nginx-net
diff --git a/docker-compose.yml b/docker-compose.yml
index a0432e7a..38b67f4d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,8 +12,39 @@ services:
- ./logs/:/app/logs:rw
networks:
- default
- restart: on-failure
+ restart: always
command: /app/run_dev_server.sh
+ labels:
+ autoheal: true
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:5000/heartbeat"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 60s
+
+ metabolomicsusi-api1:
+ build:
+ context: .
+ dockerfile: Dockerfile
+ container_name: metabolomicsusi-api1
+ ports:
+ - "5088:5000"
+ volumes:
+ - ./tmp:/app/tmp:rw
+ - ./logsapi/:/app/logs:rw
+ networks:
+ - default
+ restart: always
+ command: /app/run_dev_server.sh
+ labels:
+ autoheal: true
+ healthcheck:
+ test: ["CMD", "curl", "-f", "http://localhost:5000/heartbeat"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 60s
metabolomicsusi-worker:
build:
@@ -24,7 +55,7 @@ services:
- ./tmp:/app/tmp:rw
- ./logs:/app/logs:rw
command: /app/run_worker.sh
- restart: on-failure
+ restart: always
depends_on:
- metabolomicsusi-redis
networks:
@@ -33,10 +64,11 @@ services:
metabolomicsusi-redis:
container_name: metabolomicsusi-redis
- image: redis
+ #image: valkey/valkey:alpine3.20
+ image: redis:alpine
networks:
- default
- restart: on-failure
+ restart: always
networks:
nginx-net:
diff --git a/logsapi/.gitkeep b/logsapi/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/metabolomics_spectrum_resolver/dashinterface.py b/metabolomics_spectrum_resolver/dashinterface.py
index 5069ef29..9e5c8695 100644
--- a/metabolomics_spectrum_resolver/dashinterface.py
+++ b/metabolomics_spectrum_resolver/dashinterface.py
@@ -39,6 +39,8 @@
gtag('config', 'UA-8412213-8');
+
+
{%metas%}
{%title%}
@@ -59,10 +61,10 @@
children=[
dbc.NavbarBrand(
html.Img(
- src="https://gnps-cytoscape.ucsd.edu/static/img/GNPS_logo.png",
+ src="https://gnps2.org/static/img/logo.png",
width="120px",
),
- href="https://gnps.ucsd.edu",
+ href="https://gnps2.org",
),
dbc.Nav(
[
@@ -426,7 +428,7 @@
dbc.CardHeader(html.H5("Contributors")),
dbc.CardBody(
[
- "Mingxun Wang, PhD – UC San Diego",
+ "Mingxun Wang, PhD – UC Riverside",
html.Br(),
"Wout Bittremieux, PhD – UC San Diego",
html.Br(),
diff --git a/metabolomics_spectrum_resolver/mass-spec-package b/metabolomics_spectrum_resolver/mass-spec-package
new file mode 160000
index 00000000..c1cc52a7
--- /dev/null
+++ b/metabolomics_spectrum_resolver/mass-spec-package
@@ -0,0 +1 @@
+Subproject commit c1cc52a73645122d2eaac3ba52b1eeb346c36dc6
diff --git a/metabolomics_spectrum_resolver/parsing.py b/metabolomics_spectrum_resolver/parsing.py
index f488e4c8..5c748f33 100644
--- a/metabolomics_spectrum_resolver/parsing.py
+++ b/metabolomics_spectrum_resolver/parsing.py
@@ -4,17 +4,24 @@
from typing import Tuple
import requests
+import pandas as pd
+from io import StringIO
import urllib.parse
import spectrum_utils.spectrum as sus
import splash
from metabolomics_spectrum_resolver.error import UsiError
+from metabolomics_spectrum_resolver.zenodo_mzml_repo import mzml_repo
+
+
timeout = 45 # seconds
MS2LDA_SERVER = "http://ms2lda.org/basicviz/"
MOTIFDB_SERVER = "http://ms2lda.org/motifdb/"
-MASSBANK_SERVER = "https://massbank.us/rest/spectra/"
+MONA_SERVER = "https://massbank.us/rest/spectra/"
+MASSBANKEUROPE_SERVER = "https://msbi.ipb-halle.de/MassBank-api/records/"
+NORMAN_SERVER = "http://server.norman-data.eu:8770/getScan"
# USI specification: http://www.psidev.info/usi
usi_pattern = re.compile(
@@ -42,8 +49,8 @@
r"^mzspec"
# collection identifier
# Unofficial proteomics spectral library identifier: MASSIVEKB
- # Metabolomics collection identifiers: GNPS, MASSBANK, MS2LDA, MOTIFDB
- r":(MASSIVEKB|GNPS|MASSBANK|MS2LDA|MOTIFDB)"
+ # Metabolomics collection identifiers: GNPS, MASSBANK, MS2LDA, MOTIFDB, MTBLS, ST
+ r":(MASSIVEKB|GNPS|GNPS2|MASSBANK|MS2LDA|MOTIFDB|TINYMASS|MTBLS\d+|ST\d{6}|ZENODO-\d+|NORMAN-[0-9a-fA-F-]+|)"
# msRun identifier
r":(.*)"
# index flag
@@ -90,6 +97,9 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
Tuple[sus.MsmsSpectrum, str, str]
A tuple of the `MsmsSpectrum`, its source link, and its SPLASH.
"""
+ # Very basic cleanup
+ usi = str(usi).strip()
+
match = _match_usi(usi)
try:
collection = match.group(1).lower()
@@ -100,7 +110,6 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
# changes, be sure to change this logic.
if (
annotation is not None
- or collection.startswith("msv")
or collection.startswith("pxd")
or collection.startswith("pxl")
or collection.startswith("rpxd")
@@ -108,14 +117,33 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
or collection == "massive"
):
spectrum, source_link = _parse_msv_pxd(usi)
+ elif collection.startswith("msv"):
+ # Lets try to use GNPS2 for this first
+ try:
+ spectrum, source_link = _parse_gnps2(usi)
+ except:
+ spectrum, source_link = _parse_msv_pxd(usi)
elif collection == "gnps":
spectrum, source_link = _parse_gnps(usi)
+ elif collection == "gnps2":
+ spectrum, source_link = _parse_gnps2(usi)
+ elif collection.startswith("mtbls"):
+ # Since they don't have their own resolver, we'll go here to GNPS2 for now
+ spectrum, source_link = _parse_gnps2(usi)
elif collection == "massbank":
spectrum, source_link = _parse_massbank(usi)
elif collection == "ms2lda":
spectrum, source_link = _parse_ms2lda(usi)
elif collection == "motifdb":
spectrum, source_link = _parse_motifdb(usi)
+ elif collection.startswith("st"):
+ spectrum, source_link = _parse_metabolomics_workbench(usi)
+ elif collection.startswith("tinymass"):
+ spectrum, source_link = _parse_tinymass(usi)
+ elif collection.startswith("norman"):
+ spectrum, source_link = _parse_norman(usi)
+ elif collection.startswith("zenodo"):
+ spectrum, source_link = _parse_zenodo(usi)
else:
raise UsiError(f"Unknown USI collection: {match.group(1)}", 400)
splash_key = splash_builder.splash(
@@ -318,6 +346,14 @@ def _parse_gnps(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
else:
return _parse_gnps_library(usi)
+def _parse_gnps2(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+ ms_run = match.group(2)
+ if ms_run.lower().startswith("task"):
+ return _parse_gnps2_task(usi)
+ else:
+ # We are likely dealing with a dataset on the GNPS2 side
+ return _parse_gnps2_dataset(usi)
# Parse GNPS clustered spectra in Molecular Networking.
def _parse_gnps_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
@@ -358,6 +394,157 @@ def _parse_gnps_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
raise UsiError("Unknown GNPS task USI", 404)
+# Parse GNPS2 task spectra
+def _parse_gnps2_task(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+ gnps_task_match = gnps_task_pattern.match(match.group(2))
+ if gnps_task_match is None:
+ raise UsiError("Incorrectly formatted GNPS2 task", 400)
+ task = gnps_task_match.group(1)
+ filename = gnps_task_match.group(2)
+ index_flag = match.group(3)
+
+ if not (index_flag.lower() == "scan" or index_flag.lower() == "nativeid"):
+ raise UsiError("Currently supported GNPS2 TASK index flags: scan and nativeId", 400)
+
+ scan = match.group(4)
+
+ # We will try in order these GNPS2 URLs to see if the task is actually there
+ gnps2_server_url_list = [
+ "https://gnps2.org",
+ "https://beta.gnps2.org",
+ "http://dev2.gnps2.org",
+ "https://de.gnps2.org",
+ "https://br.gnps2.org",
+ "https://kr.gnps2.org",
+ "https://gnps2.jgi.doe.gov",
+ ]
+
+ for gnps2server_url in gnps2_server_url_list:
+ try:
+ request_url = (
+ f"{gnps2server_url}/spectrumpeaks?format=json&usi={usi}"
+ )
+ lookup_request = requests.get(request_url, timeout=timeout)
+ lookup_request.raise_for_status()
+ spectrum_dict = lookup_request.json()
+ mz, intensity = zip(*spectrum_dict["peaks"])
+ source_link = (
+ f"{gnps2server_url}/status?task={task}"
+ )
+ if "precursor_mz" in spectrum_dict:
+ precursor_mz = float(spectrum_dict["precursor_mz"])
+ charge = 0
+ else:
+ precursor_mz, charge = 0, 0
+
+ spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity)
+ return spectrum, source_link
+ except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError):
+ pass
+
+ raise UsiError("Unknown GNPS2 task USI", 404)
+
+def _parse_gnps2_dataset(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+ dataset_identifier = match.group(1)
+ index_flag = match.group(3)
+ scan = match.group(4)
+
+ if not (index_flag.lower() == "scan" or index_flag.lower() == "nativeid"):
+ raise UsiError("Currently supported GNPS2 Dataset index flags: scan and nativeId", 400)
+
+ try:
+ request_url = (
+ f"https://gnps2.org/spectrumpeaks?format=json&usi={usi}"
+ )
+ lookup_request = requests.get(request_url, timeout=timeout)
+ lookup_request.raise_for_status()
+ spectrum_dict = lookup_request.json()
+ mz, intensity = zip(*spectrum_dict["peaks"])
+
+ if "MTBLS" in dataset_identifier:
+ source_link = (
+ f"https://www.ebi.ac.uk/metabolights/editor/{dataset_identifier}/descriptors"
+ )
+ elif "MSV" in dataset_identifier:
+ source_link = (
+ f"https://massive.ucsd.edu/ProteoSAFe/"
+ f"QueryMSV?id={dataset_identifier}"
+ )
+
+ if "precursor_mz" in spectrum_dict:
+ precursor_mz = float(spectrum_dict["precursor_mz"])
+ charge = 0
+ else:
+ precursor_mz, charge = 0, 0
+
+ spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity)
+ return spectrum, source_link
+ except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError):
+ raise UsiError("Unknown GNPS2 Dataset USI", 404)
+
+# parsing from Zenodo
+def _parse_zenodo(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+ zenodo_id = match.group(1).split("-")[-1]
+ filename = match.group(2)
+ index_flag = match.group(3)
+ if index_flag.lower() == "scan":
+ scan = match.group(4)
+
+ zenodo_obj = mzml_repo(zenodo_id)
+ zenodo_obj.partial_indexing = False
+ scan_obj = zenodo_obj.get_scan(filename, int(scan))
+
+ # get peaks
+ intensity_list = scan_obj["intensities"]
+ mz_list = scan_obj["mz"]
+ charge = scan_obj["charge"]
+ precursor_mz = scan_obj["precursor_mz"]
+
+ try:
+ charge = int(charge)
+ except:
+ charge = 0
+
+ try:
+ precursor_mz = float(precursor_mz)
+ except:
+ precursor_mz = 0
+
+ source_link = f"https://zenodo.org/record/{zenodo_id}"
+
+ spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz_list, intensity_list)
+
+ return spectrum, source_link
+
+# Parse TINYMASS task spectra
+def _parse_tinymass(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+
+ try:
+ request_url = (
+ f"https://tinymass.gnps2.org/resolve?usi={usi}"
+ )
+ lookup_request = requests.get(request_url, timeout=timeout)
+ lookup_request.raise_for_status()
+ spectrum_dict = lookup_request.json()
+ mz, intensity = zip(*spectrum_dict["peaks"])
+ source_link = (
+ f"https://tinymass.gnps2.org/resolve?usi={usi}"
+ )
+ if "precursor" in spectrum_dict:
+ precursor_mz = float(spectrum_dict["precursor"])
+ charge = 0
+ else:
+ precursor_mz, charge = 0, 0
+
+ spectrum = sus.MsmsSpectrum(usi, precursor_mz, charge, mz, intensity)
+ return spectrum, source_link
+ except (requests.exceptions.HTTPError, json.decoder.JSONDecodeError):
+ raise UsiError("Unknown Tiny Mass task USI", 404)
+
# Parse GNPS library.
def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
match = _match_usi(usi)
@@ -369,8 +556,8 @@ def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
index = match.group(4)
try:
request_url = (
- f"https://gnps.ucsd.edu/ProteoSAFe/"
- f"SpectrumCommentServlet?SpectrumID={index}"
+ f"https://external.gnps2.org/"
+ f"gnpsspectrum?SpectrumID={index}"
)
lookup_request = requests.get(request_url, timeout=timeout)
lookup_request.raise_for_status()
@@ -407,6 +594,23 @@ def _parse_gnps_library(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
# Parse MassBank entry.
def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ """ Parse a MassBank or MoNA USI and return the corresponding spectrum/source url.
+
+ MassBank USIs are of the form: MSBNK-[A-Za-z0-9_]{1,32}-[A-Z0-9_]{1,64}
+
+ Fall back to MoNA if MassBank EU fails to respond. Note that partial MassBank ids
+ (e.g., SM858102) will only resolve to MoNA.
+
+ Parameters
+ ----------
+ usi : str
+ The USI to be parsed.
+
+ Returns
+ -------
+ Tuple[sus.MsmsSpectrum, str]
+ The parsed spectrum and the source link.
+ """
match = _match_usi(usi)
index_flag = match.group(3)
if index_flag.lower() != "accession":
@@ -416,16 +620,63 @@ def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
index = match.group(4)
# Clean up the new MassBank accessions if necessary.
massbank_accession = re.match(
- r"MSBNK-[A-Z0-9_]{1,32}-([A-Z0-9_]{1,64})", index
+ # See https://github.com/MassBank/MassBank-web/blob/main/Documentation/MassBankRecordFormat.md#211-accession
+ r"(MSBNK-[A-Za-z0-9_]{1,32}-[A-Z0-9_]{1,64})", index
)
if massbank_accession is not None:
- index = massbank_accession.group(1)
+ # It's certainly MassBank EU/JP
+ try:
+ return _parse_massbankEurope(usi)
+
+ except UsiError:
+ pass
+
+ # Either MassBank EU Failed or it's a MoNA entry, fallback to MoNA.
+ # Let the exception propagate if it fails
+ return _parse_mona(usi)
+
+
+# Parse MONA entry.
+def _parse_mona(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ """ Parse a MONA USI and return the corresponding spectrum. Performs a web request to
+ MONA_SERVER.
+
+ Parameters
+ ----------
+ usi : str
+ The USI to be parsed.
+
+ Globals
+ -------
+ MONA_SERVER : str
+ The base URL for the MONA server.
+
+ Returns
+ -------
+ Tuple[sus.MsmsSpectrum, str]
+ The parsed spectrum and the source link.
+
+ Raises
+ ------
+ UsiError
+ If the USI could not be parsed because it is incorrectly formatted.
+ """
+ match = _match_usi(usi)
+ index_flag = match.group(3)
+ if index_flag.lower() != "accession":
+ raise UsiError(
+ "Currently supported MassBank index flags: accession", 400
+ )
+
+ index = match.group(4)
+
try:
lookup_request = requests.get(
- f"{MASSBANK_SERVER}{index}", timeout=timeout
+ f"{MONA_SERVER}{index}", timeout=timeout
)
lookup_request.raise_for_status()
spectrum_dict = lookup_request.json()
+
mz, intensity = [], []
for peak in spectrum_dict["spectrum"].split():
peak_mz, peak_intensity = peak.split(":")
@@ -437,14 +688,82 @@ def _parse_massbank(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
precursor_mz = float(metadata["value"])
break
source_link = (
- f"https://massbank.eu/MassBank/" f"RecordDisplay.jsp?id={index}"
+ f"https://massbank.us/spectra/display/{index}"
)
spectrum = sus.MsmsSpectrum(usi, precursor_mz, 0, mz, intensity)
+
return spectrum, source_link
+
except requests.exceptions.HTTPError:
raise UsiError("Unknown MassBank USI", 404)
+# Parse MassBank Europe/Japan entry.
+def _parse_massbankEurope(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ """ Parse a MassBank[EU|JP] USI and return the corresponding spectrum. Performs a web request to
+ MassBank Server.
+
+ Parameters
+ ----------
+ usi : str
+ The USI to be parsed.
+
+ Globals
+ -------
+ MASSBANKEUROPE_SERVER : str
+ The base URL for the MassBank Europe server.
+
+ Returns
+ -------
+ Tuple[sus.MsmsSpectrum, str]
+ The parsed spectrum and the source link.
+
+ Raises
+ ------
+ UsiError
+ If the USI could not be parsed because it is incorrectly formatted.
+ """
+ match = _match_usi(usi)
+ index_flag = match.group(3)
+ if index_flag.lower() != "accession":
+ raise UsiError(
+ "Currently supported MassBank index flags: accession", 400
+ )
+
+ index = match.group(4)
+
+ try:
+ # Try requesting from massbankeurope first
+ lookup_request = requests.get(
+ f"{MASSBANKEUROPE_SERVER}{index}", timeout=timeout
+ )
+
+ lookup_request.raise_for_status()
+ spectrum_dict = lookup_request.json()
+
+ # If request is successful we know it was massbankeurope and parse accordingly
+ peaks = spectrum_dict["peak"]["peak"]["values"]
+
+ mz = [peak["mz"] for peak in peaks]
+ intensity = [peak["intensity"] for peak in peaks]
+
+ precursor_mz = next(
+ (float(item["value"]) for item in spectrum_dict['mass_spectrometry']['focused_ion'] if item["subtag"] == "PRECURSOR_M/Z"),
+ 0
+ )
+
+ source_link = (
+ f"https://massbank.eu/MassBank/" f"RecordDisplay?id={index}"
+ )
+
+ spectrum = sus.MsmsSpectrum(usi, precursor_mz, 0, mz, intensity)
+ return spectrum, source_link
+
+
+ #show what error
+ except requests.exceptions.HTTPError:
+ raise UsiError("Unknown MassBank USI", 404)
+
# Parse MS2LDA from ms2lda.org.
def _parse_ms2lda(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
@@ -490,29 +809,44 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
scan = match.group(4)
try:
lookup_url = (
- f"https://massive.ucsd.edu/ProteoSAFe/"
+ f"https://proteomics3.ucsd.edu/ProteoSAFe/"
f"QuerySpectrum?id={urllib.parse.quote_plus(usi)}"
)
lookup_request = requests.get(lookup_url, timeout=timeout)
- lookup_request.raise_for_status()
+ try:
+ lookup_request.raise_for_status()
+ except:
+ lookup_url = (
+ f"https://proteomics3.ucsd.edu/ProteoSAFe/"
+ f"QuerySpectrum?id={urllib.parse.quote_plus(usi)}"
+ )
+ lookup_request = requests.get(lookup_url, timeout=timeout)
+ lookup_request.raise_for_status()
+
lookup_json = lookup_request.json()
for spectrum_file in lookup_json["row_data"]:
+ # Checking if its an actual file we can resolve or if MSV will go to PX directly
if any(
spectrum_file["file_descriptor"].lower().endswith(extension)
for extension in ["mzml", "mzxml", "mgf"]
- ):
- request_url = (
+ ) or spectrum_file["file_descriptor"].startswith("f.ProteomeCentral"):
+ file_descriptor = spectrum_file['file_descriptor']
+ if file_descriptor.startswith("f."):
+ file_descriptor = file_descriptor[2:]
+
+ peaks_request_url = (
f"https://massive.ucsd.edu/ProteoSAFe/"
f"DownloadResultFile?"
f"task=4f2ac74ea114401787a7e96e143bb4a1&"
f"invoke=annotatedSpectrumImageText&block=0&file=FILE->"
- f"{urllib.parse.quote(spectrum_file['file_descriptor'])}"
+ f"{urllib.parse.quote(file_descriptor)}"
f"&scan={scan}&peptide=*..*&force=false&"
f"format=JSON&uploadfile=True"
)
+
try:
spectrum_request = requests.get(
- request_url, timeout=timeout
+ peaks_request_url, timeout=timeout
)
spectrum_request.raise_for_status()
spectrum_dict = spectrum_request.json()
@@ -569,6 +903,7 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
return spectrum, source_link
except requests.exceptions.HTTPError:
+ raise
pass
raise UsiError("Unsupported/unknown USI", 404)
@@ -596,6 +931,57 @@ def _parse_motifdb(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
raise UsiError("Unknown MOTIFDB USI", 404)
+# Parse Metabolomics Workbench spectra.
+def _parse_metabolomics_workbench(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ match = _match_usi(usi)
+ accession = match.group(1)
+ filename = match.group(2)
+ index_flag = match.group(3)
+ index = match.group(4)
+
+ if index_flag.lower() != "scan":
+ raise UsiError(
+ "Currently supported MW index flags: scan", 400
+ )
+ try:
+ request_url = (
+ f"https://www.metabolomicsworkbench.org/"
+ f"data/ms2.php?A={accession}.zip"
+ f"&F={urllib.parse.quote_plus(filename)}&S={index}"
+ )
+
+ # TODO: Add extra exception handling for when the filename is not found directly. We might need to hit another API to get the full filename
+ # given just the basename.
+
+ lookup_request = requests.get(request_url, timeout=timeout)
+ lookup_request.raise_for_status()
+
+ response_text = lookup_request.text
+ response_text = (response_text.replace("", "").replace("", "").lstrip().rstrip())
+
+ # Parsing the MW Response
+ precursor_mz = float(response_text.split("\n")[0].split(":")[-1].replace("\"", ""))
+ charge = int(response_text.split("\n")[2].split(":")[-1].replace("\"", ""))
+ peaks_df = pd.read_csv(StringIO(response_text), sep=r" +", skiprows=4)
+ mz = list(peaks_df["m/z"])
+ intensity = list(peaks_df["intensity"])
+
+ source_link = (
+ f"https://www.metabolomicsworkbench.org/"
+ f"data/DRCCMetadata.php?Mode=Study&StudyID={accession}&StudyType=MS&ResultType=1"
+ )
+
+ spectrum = sus.MsmsSpectrum(
+ usi,
+ float(precursor_mz),
+ int(charge),
+ mz,
+ intensity,
+ )
+ return spectrum, source_link
+ except requests.exceptions.HTTPError:
+ raise UsiError("Unknown MW USI", 404)
+
def _parse_sequence(peptide: str, peptide_clean: str) -> Tuple[str, str, list]:
# Parse out gapped sequence (e.g. X+129.04259), faking it
# with Glycine as the base residue and adding more mods to
@@ -621,3 +1007,74 @@ def _parse_sequence(peptide: str, peptide_clean: str) -> Tuple[str, str, list]:
modifications[i] = float(match.group())
previous_mod_len += found_len
return peptide, peptide_clean, modifications
+
+def _parse_norman(usi: str) -> Tuple[sus.MsmsSpectrum, str]:
+ NORMAN_FILES_BASE = "https://files.dsfp.norman-data.eu/"
+
+ match = _match_usi(usi)
+ _accession = match.group(1) # not used
+ file_path = match.group(2) # relative path, e.g. "webform/sample/.../file.mzML"
+ index_flag = match.group(3)
+ scan_no = match.group(4)
+
+ if index_flag.lower() != "scan":
+ raise UsiError("Currently supported index flag: scan", 400)
+
+ # Construct full URL from path
+ file_url = f"{NORMAN_FILES_BASE}{file_path.lstrip('/')}"
+ print(f"[DEBUG] Constructed NORMAN file URL: {file_url}")
+
+ if not file_url.lower().endswith(".mzml"):
+ raise UsiError("NORMAN file URL must point to an .mzML file.", 400)
+
+ # NOTE(review): the original comment said the service expects a URL-encoded file_path (not the full URL), but the full file_url is what gets encoded below — confirm which is intended
+ encoded_path = urllib.parse.quote_plus(file_url, safe=":/")
+ params = {
+ "file_path": encoded_path,
+ "scan_number": str(scan_no),
+ }
+ print(f"[DEBUG] Request params: {params}")
+
+ try:
+ r = requests.post(NORMAN_SERVER, params=params, headers={"accept": "*/*"}, data="")
+ print(f"[DEBUG] Requesting: {r.url}") # Shows full request URL with params
+
+ r.raise_for_status()
+
+ payload = r.json()
+ if not isinstance(payload, dict):
+ raise UsiError("Unexpected response format (not a JSON object).", 502)
+
+ precursor_list = payload.get("precursormz", [])
+ try:
+ precursor_mz = float(precursor_list[0]) if precursor_list else 0.0
+ except Exception:
+ precursor_mz = 0.0
+
+ charge = 0 # not provided by API
+ spec = payload.get("spectrum", [])
+ if not isinstance(spec, list) or not spec:
+ raise UsiError("No peaks in NORMAN scan response.", 502)
+
+ try:
+ mz = [float(p["mz"]) for p in spec]
+ intensity = [float(p["intensity"]) for p in spec]
+ except Exception as e:
+ raise UsiError(f"Malformed peaks in NORMAN scan response: {e}", 502)
+
+ spectrum = sus.MsmsSpectrum(
+ usi,
+ precursor_mz,
+ charge,
+ mz,
+ intensity,
+ )
+ return spectrum, file_url # return the constructed full URL
+
+ except requests.exceptions.HTTPError as e:
+ status = getattr(e.response, "status_code", 502)
+ raise UsiError(f"NORMAN scan lookup failed (HTTP {status}).", status)
+ except ValueError as e:
+ raise UsiError(f"NORMAN scan parsing error (invalid JSON): {e}", 502)
+ except Exception as e:
+ raise UsiError(f"NORMAN scan parsing error: {e}", 502)
diff --git a/metabolomics_spectrum_resolver/tasks.py b/metabolomics_spectrum_resolver/tasks.py
index 24817d3e..77418c0f 100644
--- a/metabolomics_spectrum_resolver/tasks.py
+++ b/metabolomics_spectrum_resolver/tasks.py
@@ -9,7 +9,7 @@
import spectrum_utils.spectrum as sus
from metabolomics_spectrum_resolver import drawing, parsing
-
+from metabolomics_spectrum_resolver import tasks_analytics
memory = joblib.Memory("tmp/joblibcache", verbose=0)
cached_parse_usi = memory.cache(parsing.parse_usi)
@@ -110,6 +110,16 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
A tuple of (i) the `MsmsSpectrum`, (ii) its source link, and (iii) its
SPLASH.
"""
+
+ # We are going to do the analytics now
+ tasks_analytics.task_analytics_event(
+ "parse_usi_or_spectrum"
+ )
+
+ # Debugging logging
+ import sys
+ print("Parsing {}\n".format(usi), file=sys.stderr, flush=True)
+
# First attempt to schedule with Celery.
try:
return _task_parse_usi.apply_async(args=(usi,)).get()
@@ -120,7 +130,7 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]:
return parsing.parse_usi(usi)
-@celery_instance.task(time_limit=30, base=celery_once.QueueOnce)
+@celery_instance.task(time_limit=45, base=celery_once.QueueOnce)
def _task_parse_usi_or_spectrum(
usi: str, spectrum: dict
) -> Tuple[sus.MsmsSpectrum, str, str]:
@@ -142,6 +152,7 @@ def _task_parse_usi_or_spectrum(
A tuple of (i) the `MsmsSpectrum`, (ii) its source link, and (iii) its
SPLASH.
"""
+
# noinspection PyTypeChecker
return cached_parse_usi_or_spectrum(usi, spectrum)
diff --git a/metabolomics_spectrum_resolver/tasks_analytics.py b/metabolomics_spectrum_resolver/tasks_analytics.py
new file mode 100644
index 00000000..a4b3d277
--- /dev/null
+++ b/metabolomics_spectrum_resolver/tasks_analytics.py
@@ -0,0 +1,44 @@
+import io
+import sys
+from typing import Any, Tuple
+
+import celery
+import umami
+
+celery_instance = celery.Celery(
+ "tasks_analytics",
+ backend="redis://metabolomicsusi-redis",
+ broker="redis://metabolomicsusi-redis",
+)
+
+umami.set_url_base("https://analytics-api.gnps2.org/")
+umami.set_website_id('2e8b3719-51ec-4786-9b29-3e9198c31ea5')
+umami.set_hostname('analytics-api.gnps2.org')
+
+celery_instance.conf.task_routes = {
+ "metabolomics_spectrum_resolver.tasks_analytics._task_analytics_event": {
+ "queue": "worker-analytics"
+ },
+}
+
+def task_analytics_event(event_type: str) -> str:
+ """
+ Task to log an analytics event using umami.
+
+ Args:
+ event_type (str): The type of event to log.
+
+ Returns:
+ str: Confirmation message indicating the event was sent.
+ """
+
+ _task_analytics_event.apply_async(
+ args=([event_type])
+ )
+
+ return f"Event '{event_type}' logged."
+
+@celery_instance.task(time_limit=10)
+def _task_analytics_event(event_type) -> str:
+ umami.new_event(event_name=event_type)
+
diff --git a/metabolomics_spectrum_resolver/templates/homepage.html b/metabolomics_spectrum_resolver/templates/homepage.html
index 2e827bd8..3adeaf95 100644
--- a/metabolomics_spectrum_resolver/templates/homepage.html
+++ b/metabolomics_spectrum_resolver/templates/homepage.html
@@ -80,24 +80,8 @@
-
-
-
-
-
-
-
-
@@ -136,7 +120,7 @@
diff --git a/metabolomics_spectrum_resolver/templates/minimal.html b/metabolomics_spectrum_resolver/templates/minimal.html
index 0e1fd4e2..a6be389d 100644
--- a/metabolomics_spectrum_resolver/templates/minimal.html
+++ b/metabolomics_spectrum_resolver/templates/minimal.html
@@ -50,7 +50,7 @@
diff --git a/metabolomics_spectrum_resolver/views.py b/metabolomics_spectrum_resolver/views.py
index 17becf03..7be99508 100644
--- a/metabolomics_spectrum_resolver/views.py
+++ b/metabolomics_spectrum_resolver/views.py
@@ -39,7 +39,6 @@
def render_homepage():
return flask.render_template("homepage.html")
-
@blueprint.route("/contributors", methods=["GET"])
def render_contributors():
return flask.render_template("contributors.html")
@@ -625,6 +624,10 @@ def generate_qr():
qr_bytes.seek(0)
return flask.send_file(qr_bytes, "image/png")
+@blueprint.route("/robot.txt")
+def robot():
+ # Disallow all
+ return "User-agent: *\nDisallow: /", 200
@blueprint.errorhandler(Exception)
def render_error(error):
diff --git a/metabolomics_spectrum_resolver/zenodo_mzml_repo.py b/metabolomics_spectrum_resolver/zenodo_mzml_repo.py
new file mode 120000
index 00000000..e34961a1
--- /dev/null
+++ b/metabolomics_spectrum_resolver/zenodo_mzml_repo.py
@@ -0,0 +1 @@
+mass-spec-package/zenodo_mzml_repo.py
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fdddf450..62ea4144 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,5 +19,5 @@ requests
requests_cache
scipy
spectrum_utils
-werkzeug
-git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python
+werkzeug==2.0.0
+git+https://github.com/berlinguyinca/spectra-hash.git#subdirectory=python
\ No newline at end of file
diff --git a/run_worker.sh b/run_worker.sh
index 2073a4fb..23601fbe 100755
--- a/run_worker.sh
+++ b/run_worker.sh
@@ -2,5 +2,9 @@
source activate usi
export C_FORCE_ROOT="true"
+
+# Running an analytics worker
+celery -A metabolomics_spectrum_resolver.tasks_analytics worker --concurrency=1 -Q worker-analytics --loglevel INFO --detach
+
#TODO: Make sure we don't run this worker as root
-celery -A metabolomics_spectrum_resolver.tasks worker -l info --autoscale=12,1 -Q worker --max-tasks-per-child 10 --loglevel INFO
+celery -A metabolomics_spectrum_resolver.tasks worker -l info --autoscale=32,1 -Q worker --max-tasks-per-child 10 --loglevel INFO
diff --git a/test/test_unit.py b/test/test_unit.py
index 98782bf9..91b402bb 100644
--- a/test/test_unit.py
+++ b/test/test_unit.py
@@ -218,7 +218,6 @@ def test_parse_motifdb():
parsing.parse_usi(usi.replace(":171163", ":this_index_does_not_exist"))
assert exc_info.value.error_code == 404
-
def test_parse_timeout():
with unittest.mock.patch(
"metabolomics_spectrum_resolver.parsing.requests.get",
diff --git a/test/usi_test_data.py b/test/usi_test_data.py
index b80eee07..08357aac 100644
--- a/test/usi_test_data.py
+++ b/test/usi_test_data.py
@@ -10,6 +10,8 @@
"mzspec:GNPS:GNPS-LIBRARY:accession:CCMSLIB00005436077",
"mzspec:MASSBANK::accession:SM858102",
"mzspec:MASSBANK::accession:MSBNK-AAFC-AC000646",
+ # New Massbank identifier with lowercase
+ "mzspec:MASSBANK::accession:MSBNK-Athens_Univ-AU259904",
"mzspec:MS2LDA:TASK-190:accession:270684",
"mzspec:MOTIFDB::accession:171163",
"mzspec:MSV000082791:(-)-epigallocatechin:scan:2",
@@ -29,6 +31,8 @@
"mzspec:MassIVE:TASK-f4b86b150a164ee4a440b661e97a7193-spectra/specs_ms.mgf:scan:287215:HPYFYAPELLF[-10.059]FAKR/3",
# MassIVE Task USIs disguised as GNPS Task USIs
"mzspec:GNPS:TASK-f4b86b150a164ee4a440b661e97a7193-spectra/specs_ms.mgf:scan:287215:HPYFYAPELLF[-10.059]FAKR/3",
+ # Metabolomics Workbench USIs
+ "mzspec:ST000003:StemCell+Data+and+Raw+Files/iPSC-T1R1:scan:3",
# Legacy cases.
"mzspec:GNPSTASK-c95481f0c53d42e78a61bf899e9f9adb:spectra/specs_ms.mgf:scan:1943",
"mzspec:GNPSTASK-64b22841ab3548f987b3cfc18696a581:spectra/specs_ms.mgf:scan:1469",