diff --git a/metabolomics_spectrum_resolver/parsing.py b/metabolomics_spectrum_resolver/parsing.py index 9e1e30f..b6f715c 100644 --- a/metabolomics_spectrum_resolver/parsing.py +++ b/metabolomics_spectrum_resolver/parsing.py @@ -102,32 +102,37 @@ def parse_usi(usi: str) -> Tuple[sus.MsmsSpectrum, str, str]: except ValueError: raise e try: + # Split off potential ProForma annotations because not all backends + # can handle those. + usi_base = usi[:match.start(5)] + usi[match.end(5):] + proforma = match.group(5)[1:] if match.group(5) is not None else None + # Retrieve the spectrum from its resource. collection = match.group(1).lower() - annotation = match.group(5) - # Send all proteomics USIs (by definition all annotated USIs) to - # MassIVE. - # mzdraft USIs are assumed to also use ProForma notation. If this - # changes, be sure to change this logic. + # Send all proteomics USIs to MassIVE. if ( - annotation is not None - or collection.startswith("msv") + collection.startswith("msv") or collection.startswith("pxd") or collection.startswith("pxl") or collection.startswith("rpxd") or collection == "massivekb" or collection == "massive" ): - spectrum, source_link = _parse_msv_pxd(usi) + spectrum, source_link = _parse_msv_pxd(usi_base) elif collection == "gnps": - spectrum, source_link = _parse_gnps(usi) + spectrum, source_link = _parse_gnps(usi_base) elif collection == "massbank": - spectrum, source_link = _parse_massbank(usi) + spectrum, source_link = _parse_massbank(usi_base) elif collection == "ms2lda": - spectrum, source_link = _parse_ms2lda(usi) + spectrum, source_link = _parse_ms2lda(usi_base) elif collection == "motifdb": - spectrum, source_link = _parse_motifdb(usi) + spectrum, source_link = _parse_motifdb(usi_base) else: raise UsiError(f"Unknown USI collection: {match.group(1)}", 400) + # Assign ProForma annotation. + if proforma is not None: + # TODO: spectrum_utils native ProForma resolution. + # spectrum.annotate(proforma) + pass splash_key = splash_builder.splash( splash.Spectrum( list(zip(spectrum.mz, spectrum.intensity)), @@ -368,52 +373,9 @@ def _parse_msv_pxd(usi: str) -> Tuple[sus.MsmsSpectrum, str]: f"https://massive.ucsd.edu/ProteoSAFe/" f"QueryMSV?id={dataset_identifier}" ) - - # Parse the peptide if available. - try: - # Get the peptide information from resolution, - # this dereferences proforma. - peptide_clean = lookup_json["usi_components"]["peptide"] - peptide = lookup_json["usi_components"]["variant"] - charge = int(lookup_json["usi_components"]["charge"]) - - # Parse out gapped sequence (e.g. X+129.04259), faking it - # with Glycine as the base residue and adding more mods to - # it. - gapmod_pattern = re.compile("X[+][0-9.]*") - transformed_peptide = peptide - for match in gapmod_pattern.finditer(peptide): - gap_mass = float(match.group().replace("X", "")) - # Fake the gap with glycine. - transformed_peptide = transformed_peptide.replace( - match.group(), f"G{gap_mass - 57.021463735:+}" - ) - peptide_clean = peptide_clean.replace("X", "G") - peptide = transformed_peptide - - # Parse out modifications. - mod_pattern = re.compile("[-+][0-9.]*") - modifications, previous_mod_len = {}, 0 - for match in mod_pattern.finditer(peptide): - found_pos = match.start() - found_len = len(match.group()) - i = max(0, found_pos - previous_mod_len - 1) - modifications[i] = float(match.group()) - previous_mod_len += found_len - - spectrum = sus.MsmsSpectrum( - usi, - precursor_mz, - charge, - mz, - intensity, - peptide=peptide_clean, - modifications=modifications, - ) - except (TypeError, KeyError): - spectrum = sus.MsmsSpectrum( - usi, precursor_mz, charge, mz, intensity - ) + spectrum = sus.MsmsSpectrum( + usi, precursor_mz, charge, mz, intensity + ) return spectrum, source_link except requests.exceptions.HTTPError: