diff --git a/msql.ebnf b/msql.ebnf
index bb67fa9..f893efb 100644
--- a/msql.ebnf
+++ b/msql.ebnf
@@ -20,10 +20,13 @@ filterfullcondition: filterfullcondition booleanconjunction filterfullcondition
     | condition ":" qualifier
     | condition
 
-condition: conditionfields "=" floating
-    | conditionfields equal variable
+// Conditions
+condition: conditionfields equal conditionvalue
     | conditionfields equal "(" statement ")"
 
+conditionvalue: conditionvalue "," conditionvalue
+    | variable
+    | floating
 
 qualifier: qualifier ":" qualifier
     | qualifierfields equal floating
diff --git a/msql_engine.py b/msql_engine.py
index 31197b5..d04e021 100644
--- a/msql_engine.py
+++ b/msql_engine.py
@@ -461,23 +461,32 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
 
         # Filtering MS2 Product Ions
         if condition["type"] == "ms2productcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
 
-            ms2_filtered_df = ms2_df[(ms2_df["mz"] > mz_min) & (ms2_df["mz"] < mz_max) & (ms2_df["i"] > min_int) & (ms2_df["i_norm"] > min_intpercent)]
+                ms2_filtered_df = ms2_df[
+                    (ms2_df["mz"] > mz_min) &
+                    (ms2_df["mz"] < mz_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]
 
-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
 
             # Filtering the MS1 data now
@@ -506,28 +515,32 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
 
         # Filtering MS2 Neutral Loss
         if condition["type"] == "ms2neutrallosscondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            nl_min = mz - mz_tol
-            nl_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                nl_min = mz - mz_tol
+                nl_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
 
-            ms2_filtered_df = ms2_df[
-                ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
-                ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
-                (ms2_df["i"] > min_int) &
-                (ms2_df["i_norm"] > min_intpercent)
-            ]
+                ms2_filtered_df = ms2_df[
+                    ((ms2_df["precmz"] - ms2_df["mz"]) > nl_min) &
+                    ((ms2_df["precmz"] - ms2_df["mz"]) < nl_max) &
+                    (ms2_df["i"] > min_int) &
+                    (ms2_df["i_norm"] > min_intpercent)
+                ]
 
-            # Setting the intensity match register
-            _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms2_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms2_filtered_df = _filter_intensitymatch(ms2_filtered_df, reference_conditions_register, condition)
+
+                if len(ms2_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms2_filtered_df["scan"]))
 
             # Filtering the actual data structures
-            filtered_scans = set(ms2_filtered_df["scan"])
             ms2_df = ms2_df[ms2_df["scan"].isin(filtered_scans)]
 
             # Filtering the MS1 data now
@@ -538,33 +551,35 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
 
         # finding MS1 peaks
         if condition["type"] == "ms1mzcondition":
-            mz = condition["value"][0]
-            mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
-            mz_min = mz - mz_tol
-            mz_max = mz + mz_tol
+            filtered_scans = set()
+            for mz in condition["value"]:
+                mz_tol = _get_mz_tolerance(condition.get("qualifiers", None), mz)
+                mz_min = mz - mz_tol
+                mz_max = mz + mz_tol
 
-            min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
-            ms1_filtered_df = ms1_df[
-                (ms1_df["mz"] > mz_min) &
-                (ms1_df["mz"] < mz_max) &
-                (ms1_df["i"] > min_int) &
-                (ms1_df["i_norm"] > min_intpercent)]
-            
-            #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))
+                min_int, min_intpercent = _get_minintensity(condition.get("qualifiers", None))
+                ms1_filtered_df = ms1_df[
+                    (ms1_df["mz"] > mz_min) &
+                    (ms1_df["mz"] < mz_max) &
+                    (ms1_df["i"] > min_int) &
+                    (ms1_df["i_norm"] > min_intpercent)]
+                
+                #print("YYY", mz_min, mz_max, min_int, min_intpercent, len(ms1_filtered_df))
 
-            # Setting the intensity match register
-            _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)
+                # Setting the intensity match register
+                _set_intensity_register(ms1_filtered_df, reference_conditions_register, condition)
 
-            # Applying the intensity match
-            ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
+                # Applying the intensity match
+                ms1_filtered_df = _filter_intensitymatch(ms1_filtered_df, reference_conditions_register, condition)
 
-            #print(ms1_filtered_df)
+                if len(ms1_filtered_df) > 0:
+                    # Getting union of all scans
+                    filtered_scans = filtered_scans.union(set(ms1_filtered_df["scan"]))
 
-            if len(ms1_filtered_df) == 0:
+            if len(filtered_scans) == 0:  # empty set: no scan matched any listed value
                 return pd.DataFrame(), pd.DataFrame()
 
             # Filtering the actual data structures
-            filtered_scans = set(ms1_filtered_df["scan"])
             ms1_df = ms1_df[ms1_df["scan"].isin(filtered_scans)]
             ms2_df = ms2_df[ms2_df["ms1scan"].isin(filtered_scans)]
 
@@ -583,7 +598,7 @@ def _executeconditions_query(parsed_dict, input_filename, ms1_input_df=None, ms2
         if not condition["conditiontype"] == "filter":
             continue
 
-        logging.error("FILTER CONDITION", condition)
+        #logging.error("FILTER CONDITION", condition)
 
         # filtering MS1 peaks
         if condition["type"] == "ms1mzcondition":
diff --git a/msql_parser.py b/msql_parser.py
index 27eac6a..5ff1ba6 100644
--- a/msql_parser.py
+++ b/msql_parser.py
@@ -87,7 +87,10 @@ def qualifier(self, items):
     def condition(self, items):
         condition_dict = {}
         condition_dict["type"] = items[0].children[0]
-        condition_dict["value"] = [items[-1]]
+        if type(items[-1]) is dict:
+            condition_dict["value"] = [items[-1]]
+        else:
+            condition_dict["value"] = items[-1]
         return condition_dict
 
     def wherefullcondition(self, items):
@@ -159,7 +162,16 @@ def filterfullcondition(self, items):
             merged_list += items[-1]
 
             return merged_list
+    
+    def conditionvalue(self, items):
+        if len(items) == 1:
+            return items
+        if len(items) == 2:
+            merged_list = []
+            merged_list += items[0]
+            merged_list += items[-1]
 
+            return merged_list
 
     def querytype(self, items):
         query_dict = {}
diff --git a/test.py b/test.py
index 225aa11..15d4f5f 100644
--- a/test.py
+++ b/test.py
@@ -48,6 +48,8 @@ def test_qc_ms1_ms2peak():
     print(set(results_df["scan"]))
     assert(len(results_df) > 1000)
 
+
+
 def test_diphen():
     query = "QUERY scannum(MS2DATA) WHERE MS2PROD=167.0857:TOLERANCEPPM=5"
     print(msql_parser.parse_msql(query))
@@ -105,8 +107,8 @@ def test_variable_ms1():
 def test_subquery():
     #query = "QUERY scanrangesum(MS1DATA, TOLERANCE=0.1) WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
     query = "QUERY MS1DATA WHERE MS1MZ=(QUERY scanmz(MS2DATA) WHERE MS2NL=176.0321 AND MS2PROD=85.02915)"
-    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(json.dumps(msql_parser.parse_msql(query), indent=4))
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
     print(results_df)
 
 def test_filter():
@@ -280,6 +282,14 @@ def test_gnps_full_library():
     results_df = msql_engine.process_query(query, "test/gnps.json")
     print(results_df)
 
+def test_multiple_mz():
+    query = "QUERY scaninfo(MS2DATA) WHERE \
+            MS2PROD=271.06,217.1"
+    parse_obj = msql_parser.parse_msql(query)
+    print(parse_obj)
+
+    results_df = msql_engine.process_query(query, "test/GNPS00002_A3_p.mzML")
+    print(results_df)
 
 def test_networking_mgf_library():
     query = "QUERY scaninfo(MS2DATA) WHERE \
@@ -348,7 +358,8 @@ def test_parse():
     for line in open("test_queries.txt"):
         test_query = line.rstrip()
         print(test_query)
-        msql_parser.parse_msql(test_query)
+        parsed_result = msql_parser.parse_msql(test_query)
+        assert(parsed_result is not None)
 
 def test_query():
     for line in open("test_queries.txt"):
@@ -372,7 +383,7 @@ def main():
     #test_parse()
     #test_query()
     #test_xic()
-    #test_subquery()
+    test_subquery()
     #test_variable_parse()
     #test_variable()
     #test_variable_ms1()
@@ -395,12 +406,13 @@ def main():
     #test_ms1_cu()
     #test_neutral_loss_intensity()
     #test_gnps_library()
-    test_gnps_full_library()
+    #test_gnps_full_library()
     #test_networking_mgf_library()
     #test_swath()
     #test_albicidin_tag()
     #test_double_brominated()
     #test_agilent()
+    #test_multiple_mz()
 
 if __name__ == "__main__":
     main()
diff --git a/test_queries.txt b/test_queries.txt
index 0813b5e..a10885f 100644
--- a/test_queries.txt
+++ b/test_queries.txt
@@ -10,4 +10,5 @@ QUERY scannum(MS2DATA) WHERE MS2PROD=88:TOLERANCEMZ=0.1:INTENSITYPERCENT>10 AND
 QUERY scannum(MS2DATA) WHERE MS2NL=163
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>0.1
 QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898:TOLERANCEMZ=0.1:INTENSITYPERCENT>1
-QUERY scaninfo(MS1DATA) WHERE RTMIN=50
\ No newline at end of file
+QUERY scaninfo(MS1DATA) WHERE RTMIN=50
+QUERY scaninfo(MS1DATA) WHERE MS1MZ=425.2898,426.289
\ No newline at end of file