Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,8 @@ deploy_clean:

deploy_pypi:
python -m build --sdist --wheel .
twine upload dist/*
twine upload dist/*

specific_pytest:
#pytest -vv --cov=massql ./tests/test_extraction.py::test_extract_MGF
python ./tests/test_extraction.py
58 changes: 33 additions & 25 deletions massql/msql_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pandas as pd
import numpy as np
import pymzml
from pyteomics import mzxml
from matchms.importing import load_from_mgf
from pyteomics import mzxml, mgf
from psims.mzml.writer import MzMLWriter

def main():
Expand Down Expand Up @@ -139,31 +138,38 @@ def _extract_mzXML_scan(input_filename, spectrum_identifier_list):

def _extract_mgf_scan(input_filename, spectrum_identifier_list):
output_list = []
spectrum_identifier_set = set([str(spectrum_scan) for spectrum_scan in spectrum_identifier_list])

file = load_from_mgf(input_filename)

spec = None
for spectrum in file:
scan_number = spectrum.metadata["scans"]
if str(scan_number) in spectrum_identifier_set:
spec = spectrum

mz_list = list(spec.peaks.mz)
i_list = list(spec.peaks.intensities)

peaks_list = []
for i in range(len(mz_list)):
peaks_list.append([mz_list[i], i_list[i]])

# Convert identifiers to a set
spectrum_identifier_set = set(str(scan) for scan in spectrum_identifier_list)

# Loading Data
spectrum_obj = {}
spectrum_obj["peaks"] = peaks_list
spectrum_obj["mslevel"] = 2
spectrum_obj["scan"] = scan_number
spectrum_obj["precursor_mz"] = float(spec.metadata["pepmass"][0])
# Open the MGF file using pyteomics
with mgf.read(input_filename) as reader:
for spectrum in reader:
# Pyteomics stores metadata in the 'params' dictionary
# We convert to string to ensure matching works against the set
scan_number = str(spectrum['params'].get('scans'))

if scan_number in spectrum_identifier_set:
# pyteomics returns numpy arrays for 'm/z array' and 'intensity array'
# We zip them to create the [[mz, int], [mz, int]] structure
mz_array = spectrum['m/z array']
int_array = spectrum['intensity array']

# Create the list of lists
peaks_list = [[float(mz), float(i)] for mz, i in zip(mz_array, int_array)]

# Extract precursor m/z (pepmass is usually a tuple: (mz, intensity))
precursor_mz = float(spectrum['params']['pepmass'][0])

# Construct the object
spectrum_obj = {
"peaks": peaks_list,
"mslevel": 2,
"scan": scan_number,
"precursor_mz": precursor_mz
}

output_list.append(spectrum_obj)
output_list.append(spectrum_obj)

return output_list

Expand Down Expand Up @@ -227,6 +233,8 @@ def _extract_spectra(results_df, input_spectra_folder,
elif input_spectra_filename[-5:] == ".json":
spectrum_obj_list = _extract_json_scan(input_spectra_filename, list(set(results_by_file_df["scan"])))

print(spectrum_obj_list)

for spectrum_obj in spectrum_obj_list:
# These are a new scan number in the file, not sure if we need this
spectrum_obj["new_scan"] = current_scan
Expand Down
107 changes: 69 additions & 38 deletions massql/msql_fileloading.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
import numpy as np

from tqdm import tqdm
from matchms.importing import load_from_mgf
import pymzml
from pyteomics import mzxml, mzml
from pyteomics import mzxml, mzml, mgf

import logging
logger = logging.getLogger('msql_fileloading')
Expand Down Expand Up @@ -143,51 +142,83 @@ def load_data(input_filename, cache=None, cache_dir=None, cache_file=None):
return ms1_df, ms2_df

def _load_data_mgf(input_filename):
file = load_from_mgf(input_filename)
ms2_data_list = []

ms2mz_list = []
for i, spectrum in enumerate(file):
if len(spectrum.peaks.mz) == 0:
continue

mz_list = list(spectrum.peaks.mz)
i_list = list(spectrum.peaks.intensities)
i_max = max(i_list)
i_sum = sum(i_list)
# Use 'with' context manager for safe file handling
with mgf.read(input_filename) as reader:
for index, spectrum in enumerate(reader):

# Pyteomics returns numpy arrays
mz_array = spectrum['m/z array']
int_array = spectrum['intensity array']

for i in range(len(mz_list)):
if i_list[i] == 0:
# Skip empty spectra
if len(mz_array) == 0:
continue

peak_dict = {}
peak_dict["i"] = i_list[i]
peak_dict["i_norm"] = i_list[i] / i_max
peak_dict["i_tic_norm"] = i_list[i] / i_sum
peak_dict["mz"] = mz_list[i]
# Calculate spectrum-wide statistics
i_max = int_array.max()
i_sum = int_array.sum()

# --- Metadata Extraction ---
params = spectrum.get('params', {})

# Handling malformed mgf files
# Scan: Use 'scans' or fallback to index
scan = params.get('scans', index + 1)

# RT: Parse 'rtinseconds', default 0, convert to minutes
try:
peak_dict["scan"] = spectrum.metadata["scans"]
except:
peak_dict["scan"] = i + 1
rt = float(params.get('rtinseconds', 0)) / 60.0
except (ValueError, TypeError):
rt = 0.0

# Precursor m/z: 'pepmass' is usually a tuple (mz, intensity)
try:
peak_dict["rt"] = float(spectrum.metadata["rtinseconds"]) / 60
except:
peak_dict["rt"] = 0
precmz = float(params.get('pepmass', [0])[0])
except (IndexError, ValueError, TypeError):
precmz = 0.0

# Charge: Parse 'CHARGE=2+' format
# Pyteomics often returns charge as a list or integer depending on config
try:
peak_dict["precmz"] = float(spectrum.metadata["pepmass"][0])
charge_val = params.get('charge', [1])
# Handle cases where it is a list e.g., [2+] or [2]
if isinstance(charge_val, list):
charge_str = str(charge_val[0])
else:
charge_str = str(charge_val)
# Strip '+' and convert to int
charge = int(charge_str.strip('+'))
except:
peak_dict["precmz"] = 0

peak_dict["ms1scan"] = 0
peak_dict["charge"] = 1 # TODO: Add Charge Correctly here
peak_dict["polarity"] = 1 # TODO: Add Polarity Correctly here

ms2mz_list.append(peak_dict)

# Turning into pandas data frames
ms1_df = pd.DataFrame([peak_dict])
ms2_df = pd.DataFrame(ms2mz_list)
charge = 1

# --- Peak Extraction ---
# Zip arrays to iterate pairs
for mz, intensity in zip(mz_array, int_array):
if intensity == 0:
continue

peak_dict = {
"i": intensity,
"i_norm": intensity / i_max,
"i_tic_norm": intensity / i_sum,
"mz": mz,
"scan": scan,
"rt": rt,
"precmz": precmz,
"ms1scan": 0,
"charge": charge, # Implemented
"polarity": 1 # Default
}

ms2_data_list.append(peak_dict)

# Convert to DataFrames
ms2_df = pd.DataFrame(ms2_data_list)

# Original code assigned the last single peak to ms1_df.
# Initializing empty to prevent bugs.
ms1_df = pd.DataFrame()

return ms1_df, ms2_df

Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ pandas
pyarrow
tqdm
py_expression_eval
matchms
pyteomics
psims
plotly
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
"pyarrow",
"tqdm",
"py_expression_eval",
"matchms",
"pyteomics",
"psims",
"plotly",
Expand Down
7 changes: 5 additions & 2 deletions tests/test_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ def test_extract_MGF():
results_df["filename"] = "specs_ms.mgf"

print("Extracting", len(results_df))
print(results_df)

# Extracting now
merged_summary_df = msql_extract._extract_spectra(results_df, "tests/data/", output_json_filename="test.json")
assert(len(merged_summary_df) == 5)

Expand Down Expand Up @@ -90,9 +93,9 @@ def test_waters_uv_extract():
def main():
#test_extract_mzML()
#test_extract_mzXML()
#test_extract_MGF()
test_extract_MGF()
#test_gnps_library_extract()
test_waters_uv_extract()
#test_waters_uv_extract()


if __name__ == "__main__":
Expand Down
Loading