diff --git a/Makefile b/Makefile index cab00af..1f027a2 100644 --- a/Makefile +++ b/Makefile @@ -33,4 +33,8 @@ deploy_clean: deploy_pypi: python -m build --sdist --wheel . - twine upload dist/* \ No newline at end of file + twine upload dist/* + +specific_pytest: + #pytest -vv --cov=massql ./tests/test_extraction.py::test_extract_MGF + python ./tests/test_extraction.py \ No newline at end of file diff --git a/massql/msql_extract.py b/massql/msql_extract.py index 8385df6..137f11c 100644 --- a/massql/msql_extract.py +++ b/massql/msql_extract.py @@ -6,8 +6,7 @@ import pandas as pd import numpy as np import pymzml -from pyteomics import mzxml -from matchms.importing import load_from_mgf +from pyteomics import mzxml, mgf from psims.mzml.writer import MzMLWriter def main(): @@ -139,31 +138,38 @@ def _extract_mzXML_scan(input_filename, spectrum_identifier_list): def _extract_mgf_scan(input_filename, spectrum_identifier_list): output_list = [] - spectrum_identifier_set = set([str(spectrum_scan) for spectrum_scan in spectrum_identifier_list]) - - file = load_from_mgf(input_filename) - - spec = None - for spectrum in file: - scan_number = spectrum.metadata["scans"] - if str(scan_number) in spectrum_identifier_set: - spec = spectrum - - mz_list = list(spec.peaks.mz) - i_list = list(spec.peaks.intensities) - - peaks_list = [] - for i in range(len(mz_list)): - peaks_list.append([mz_list[i], i_list[i]]) + + # Convert identifiers to a set + spectrum_identifier_set = set(str(scan) for scan in spectrum_identifier_list) - # Loading Data - spectrum_obj = {} - spectrum_obj["peaks"] = peaks_list - spectrum_obj["mslevel"] = 2 - spectrum_obj["scan"] = scan_number - spectrum_obj["precursor_mz"] = float(spec.metadata["pepmass"][0]) + # Open the MGF file using pyteomics + with mgf.read(input_filename) as reader: + for spectrum in reader: + # Pyteomics stores metadata in the 'params' dictionary + # We convert to string to ensure matching works against the set + scan_number = str(spectrum['params'].get('scans')) + + if scan_number in spectrum_identifier_set: + # pyteomics returns numpy arrays for 'm/z array' and 'intensity array' + # We zip them to create the [[mz, int], [mz, int]] structure + mz_array = spectrum['m/z array'] + int_array = spectrum['intensity array'] + + # Create the list of lists + peaks_list = [[float(mz), float(i)] for mz, i in zip(mz_array, int_array)] + + # Extract precursor m/z (pepmass is usually a tuple: (mz, intensity)) + precursor_mz = float(spectrum['params']['pepmass'][0]) + + # Construct the object + spectrum_obj = { + "peaks": peaks_list, + "mslevel": 2, + "scan": scan_number, + "precursor_mz": precursor_mz + } - output_list.append(spectrum_obj) + output_list.append(spectrum_obj) return output_list @@ -227,6 +233,8 @@ def _extract_spectra(results_df, input_spectra_folder, elif input_spectra_filename[-5:] == ".json": spectrum_obj_list = _extract_json_scan(input_spectra_filename, list(set(results_by_file_df["scan"]))) + print(spectrum_obj_list) + for spectrum_obj in spectrum_obj_list: # These are a new scan number in the file, not sure if we need this spectrum_obj["new_scan"] = current_scan diff --git a/massql/msql_fileloading.py b/massql/msql_fileloading.py index 66ed447..fc8fe83 100644 --- a/massql/msql_fileloading.py +++ b/massql/msql_fileloading.py @@ -5,9 +5,8 @@ import numpy as np from tqdm import tqdm -from matchms.importing import load_from_mgf import pymzml -from pyteomics import mzxml, mzml +from pyteomics import mzxml, mzml, mgf import logging logger = logging.getLogger('msql_fileloading') @@ -143,51 +142,83 @@ def load_data(input_filename, cache=None, cache_dir=None, cache_file=None): return ms1_df, ms2_df def _load_data_mgf(input_filename): - file = load_from_mgf(input_filename) + ms2_data_list = [] - ms2mz_list = [] - for i, spectrum in enumerate(file): - if len(spectrum.peaks.mz) == 0: - continue - - mz_list = list(spectrum.peaks.mz) - i_list = list(spectrum.peaks.intensities) - i_max = max(i_list) - i_sum = sum(i_list) + # Use 'with' context manager for safe file handling + with mgf.read(input_filename) as reader: + for index, spectrum in enumerate(reader): + + # Pyteomics returns numpy arrays + mz_array = spectrum['m/z array'] + int_array = spectrum['intensity array'] - for i in range(len(mz_list)): - if i_list[i] == 0: + # Skip empty spectra + if len(mz_array) == 0: continue - peak_dict = {} - peak_dict["i"] = i_list[i] - peak_dict["i_norm"] = i_list[i] / i_max - peak_dict["i_tic_norm"] = i_list[i] / i_sum - peak_dict["mz"] = mz_list[i] + # Calculate spectrum-wide statistics + i_max = int_array.max() + i_sum = int_array.sum() + + # --- Metadata Extraction --- + params = spectrum.get('params', {}) - # Handling malformed mgf files + # Scan: Use 'scans' or fallback to index + scan = params.get('scans', index + 1) + + # RT: Parse 'rtinseconds', default 0, convert to minutes try: - peak_dict["scan"] = spectrum.metadata["scans"] - except: - peak_dict["scan"] = i + 1 + rt = float(params.get('rtinseconds', 0)) / 60.0 + except (ValueError, TypeError): + rt = 0.0 + + # Precursor m/z: 'pepmass' is usually a tuple (mz, intensity) try: - peak_dict["rt"] = float(spectrum.metadata["rtinseconds"]) / 60 - except: - peak_dict["rt"] = 0 + precmz = float(params.get('pepmass', [0])[0]) + except (IndexError, ValueError, TypeError): + precmz = 0.0 + + # Charge: Parse 'CHARGE=2+' format + # Pyteomics often returns charge as a list or integer depending on config try: - peak_dict["precmz"] = float(spectrum.metadata["pepmass"][0]) + charge_val = params.get('charge', [1]) + # Handle cases where it is a list e.g., [2+] or [2] + if isinstance(charge_val, list): + charge_str = str(charge_val[0]) + else: + charge_str = str(charge_val) + # Strip '+' and convert to int + charge = int(charge_str.strip('+')) except: - peak_dict["precmz"] = 0 - - peak_dict["ms1scan"] = 0 - peak_dict["charge"] = 1 # TODO: Add Charge Correctly here - peak_dict["polarity"] = 1 # TODO: Add Polarity Correctly here - - ms2mz_list.append(peak_dict) - - # Turning into pandas data frames - ms1_df = pd.DataFrame([peak_dict]) - ms2_df = pd.DataFrame(ms2mz_list) + charge = 1 + + # --- Peak Extraction --- + # Zip arrays to iterate pairs + for mz, intensity in zip(mz_array, int_array): + if intensity == 0: + continue + + peak_dict = { + "i": intensity, + "i_norm": intensity / i_max, + "i_tic_norm": intensity / i_sum, + "mz": mz, + "scan": scan, + "rt": rt, + "precmz": precmz, + "ms1scan": 0, + "charge": charge, # Implemented + "polarity": 1 # Default + } + + ms2_data_list.append(peak_dict) + + # Convert to DataFrames + ms2_df = pd.DataFrame(ms2_data_list) + + # Original code assigned the last single peak to ms1_df. + # Initializing empty to prevent bugs. + ms1_df = pd.DataFrame() return ms1_df, ms2_df diff --git a/requirements.txt b/requirements.txt index 5f42f88..3b2f274 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ pandas pyarrow tqdm py_expression_eval -matchms pyteomics psims plotly diff --git a/setup.py b/setup.py index f826d66..3b30804 100644 --- a/setup.py +++ b/setup.py @@ -34,7 +34,6 @@ "pyarrow", "tqdm", "py_expression_eval", - "matchms", "pyteomics", "psims", "plotly", diff --git a/tests/test_extraction.py b/tests/test_extraction.py index fb3541c..ec55ec1 100644 --- a/tests/test_extraction.py +++ b/tests/test_extraction.py @@ -61,6 +61,9 @@ def test_extract_MGF(): results_df["filename"] = "specs_ms.mgf" print("Extracting", len(results_df)) + print(results_df) + + # Extracting now merged_summary_df = msql_extract._extract_spectra(results_df, "tests/data/", output_json_filename="test.json") assert(len(merged_summary_df) == 5) @@ -90,9 +93,9 @@ def test_waters_uv_extract(): def main(): #test_extract_mzML() #test_extract_mzXML() - #test_extract_MGF() + test_extract_MGF() #test_gnps_library_extract() - test_waters_uv_extract() + #test_waters_uv_extract() if __name__ == "__main__":