diff --git a/bin/conda_env_falcon.yml b/bin/conda_env_falcon.yml new file mode 100644 index 0000000..1bb31cf --- /dev/null +++ b/bin/conda_env_falcon.yml @@ -0,0 +1,21 @@ +channels: + - conda-forge + - bioconda + - defaults +channel_priority: strict +dependencies: + - python=3.10 + - pandas=2.2.0 + - pyteomics=4.7.1 + - pip: + - numpy==1.23.4 + - xmltodict + - requests + - tqdm + - psutil + - pyopenms==3.0.0 + - spectrum-utils==0.3.5 + - falcon-ms + - pymzml + - numcodecs + - pyarrow diff --git a/bin/scripts/convert_falcon_to_mscluster_format.py b/bin/scripts/convert_falcon_to_mscluster_format.py new file mode 100755 index 0000000..aa2ccdd --- /dev/null +++ b/bin/scripts/convert_falcon_to_mscluster_format.py @@ -0,0 +1,212 @@ +#!/usr/bin/python + +import sys +import os +import argparse +import pandas as pd + + + +def convert_falcon_to_mscluster_format(falcon_csv, input_spectra_folder, output_clusterinfo, output_clustersummary, min_cluster_size=2): + """ + Convert falcon output to mscluster format. + + Falcon format: cluster, filename, scan, precursor_mz, retention_time, new_batch + MSCluster format: #ClusterIdx, #Filename, #SpecIdx, #Scan, #ParentMass, #Charge, #RetTime, #PrecIntensity + + Note: If falcon doesn't have certain fields (like charge, intensity), we use default values (0) + instead of fetching from MGF files. + """ + # Load falcon CSV + clusterinfo_df = pd.read_csv(falcon_csv, sep=',', comment='#') + + print(f"Loaded {len(clusterinfo_df)} rows from falcon CSV") + print(f"Columns: {clusterinfo_df.columns.tolist()}") + + + if 'identifier' in clusterinfo_df.columns: + + clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply( + lambda x: x.split(":")[2] + ".mzML" if len(x.split(":")) > 2 else (x.split(":")[-2] + ".mzML" if len(x.split(":")) > 1 else "unknown.mzML") + ) + clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply( + lambda x: int(x.split(":")[-1]) if x.split(":")[-1].isdigit() else 0 + ) + + # Ensure we have the required columns + required_cols = ['cluster', 'filename', 'scan', 'precursor_mz', 'retention_time'] + missing_cols = [col for col in required_cols if col not in clusterinfo_df.columns] + + if missing_cols: + print(f"ERROR: Required columns not found in falcon CSV: {missing_cols}") + print(f"Available columns: {clusterinfo_df.columns.tolist()}") + print(f"First few rows:") + print(clusterinfo_df.head()) + sys.exit(1) + + if min_cluster_size > 1: + clusterinfo_df = clusterinfo_df[clusterinfo_df['cluster'] != -1] + + # Convert to mscluster format + mscluster_rows = [] + spec_idx_counter = 0 + + for idx, row in clusterinfo_df.iterrows(): + cluster_idx = int(row['cluster']) + + # Skip singletons if min_cluster_size > 1 + if cluster_idx == -1 and min_cluster_size > 1: + continue + + # Handle cluster indexing: falcon uses 0-based, mscluster uses 1-based + # But we also need to handle -1 (singletons) + if cluster_idx == -1: + # Singletons: use a large number or handle separately + cluster_idx = 999999 # Use a large number for singletons + else: + cluster_idx = cluster_idx + 1 # Convert to 1-based + + filename = str(row['filename']) + scan = int(row['scan']) + precursor_mz = float(row['precursor_mz']) + retention_time = float(row['retention_time']) + + # Use default values for fields that falcon doesn't provide + # Don't fetch from MGF files - just use defaults + charge = 0 + precursor_intensity = 0.0 + + + if 'precursor_charge' in row and pd.notna(row['precursor_charge']): + try: + charge = int(row['precursor_charge']) + except (ValueError, TypeError): + charge = 0 + elif 'charge' in row and pd.notna(row['charge']): + try: + charge = int(row['charge']) + except (ValueError, TypeError): + charge = 0 + + if 'precursor_intensity' in row and pd.notna(row['precursor_intensity']): + try: + precursor_intensity = float(row['precursor_intensity']) + except (ValueError, TypeError): + precursor_intensity = 0.0 + + # Convert retention time to seconds if it's in minutes + # Falcon typically outputs RT in minutes, mscluster uses seconds + if retention_time > 0 and retention_time < 1000: # Likely in minutes + retention_time = retention_time * 60.0 + + if not filename.startswith('input_spectra/'): + filename = f"input_spectra/{filename}" + + spec_idx = spec_idx_counter + spec_idx_counter += 1 + + mscluster_row = { + '#ClusterIdx': cluster_idx, + '#Filename': filename, + '#SpecIdx': spec_idx, + '#Scan': scan, + '#ParentMass': precursor_mz, + '#Charge': charge, + '#RetTime': retention_time, + '#PrecIntensity': precursor_intensity + } + mscluster_rows.append(mscluster_row) + + # Create DataFrame + mscluster_df = pd.DataFrame(mscluster_rows) + + # Filter by min_cluster_size + if min_cluster_size > 1: + # Count spectra per cluster + cluster_counts = mscluster_df['#ClusterIdx'].value_counts() + valid_clusters = cluster_counts[cluster_counts >= min_cluster_size].index + mscluster_df = mscluster_df[mscluster_df['#ClusterIdx'].isin(valid_clusters)] + + # Create cluster summary + cluster_summary_rows = [] + for cluster_idx in mscluster_df['#ClusterIdx'].unique(): + cluster_data = mscluster_df[mscluster_df['#ClusterIdx'] == cluster_idx] + num_spectra = len(cluster_data) + + # Calculate mean RT (in minutes) + mean_rt = cluster_data['#RetTime'].mean() / 60.0 + + # Calculate parent mass (mean of #ParentMass) + parent_mass = cluster_data['#ParentMass'].mean() + + # Calculate precursor mass (parent mass / charge, then take mean) + # If charge is 0, use parent mass directly + precursor_masses = [] + for idx, row in cluster_data.iterrows(): + if row['#Charge'] > 0: + precursor_masses.append(row['#ParentMass'] / row['#Charge']) + else: + precursor_masses.append(row['#ParentMass']) + precursor_mass = sum(precursor_masses) / len(precursor_masses) if precursor_masses else parent_mass + + # Calculate precursor charge (most common charge, or mean if no clear mode) + charges = cluster_data['#Charge'].values + charges_nonzero = charges[charges > 0] + if len(charges_nonzero) > 0: + # Use mode (most common charge) using pandas + charge_series = pd.Series(charges_nonzero) + mode_values = charge_series.mode() + if len(mode_values) > 0: + precursor_charge = int(mode_values.iloc[0]) + else: + precursor_charge = int(charges_nonzero.mean()) + else: + precursor_charge = 0 + + # Calculate sum of precursor intensity + sum_precursor_intensity = cluster_data['#PrecIntensity'].sum() + + cluster_summary_row = { + 'cluster index': cluster_idx, + 'number of spectra': num_spectra, + 'parent mass': parent_mass, + 'precursor charge': precursor_charge, + 'precursor mass': precursor_mass, + 'sum(precursor intensity)': sum_precursor_intensity, + 'RTMean': mean_rt + } + cluster_summary_rows.append(cluster_summary_row) + + cluster_summary_df = pd.DataFrame(cluster_summary_rows) + cluster_summary_df = cluster_summary_df.sort_values('cluster index') + + # Save outputs + mscluster_df.to_csv(output_clusterinfo, sep='\t', index=False) + cluster_summary_df.to_csv(output_clustersummary, sep='\t', index=False) + + print(f"Converted {len(mscluster_df)} spectra in {len(cluster_summary_df)} clusters") + print(f"Saved clusterinfo to {output_clusterinfo}") + print(f"Saved clustersummary to {output_clustersummary}") + + +def main(): + parser = argparse.ArgumentParser(description='Convert Falcon output to MSCluster format') + parser.add_argument('falcon_csv', help='Falcon CSV output file') + parser.add_argument('input_spectra_folder', help='Input spectra folder (for reference, not used for fetching data)') + parser.add_argument('output_clusterinfo', help='Output clusterinfo.tsv file') + parser.add_argument('output_clustersummary', help='Output clustersummary.tsv file') + parser.add_argument('--min_cluster_size', type=int, default=2, help='Minimum cluster size') + + args = parser.parse_args() + + convert_falcon_to_mscluster_format( + args.falcon_csv, + args.input_spectra_folder, + args.output_clusterinfo, + args.output_clustersummary, + args.min_cluster_size + ) + + +if __name__ == "__main__": + main() diff --git a/bin/scripts/falcon_wrapper.py b/bin/scripts/falcon_wrapper.py new file mode 100755 index 0000000..792fc76 --- /dev/null +++ b/bin/scripts/falcon_wrapper.py @@ -0,0 +1,273 @@ +#!/usr/bin/python + +import sys +import os +import glob +import argparse +import subprocess +import pandas as pd +import ming_spectrum_library + +def run_falcon(input_spectra_path, output_prefix="falcon", + precursor_tol="20 ppm", fragment_tol=0.05, + min_mz_range=0, min_mz=0, max_mz=30000, eps=0.1): + """ + Runs Falcon with specified parameters. + input_spectra_path can be a file or a folder. + """ + # Check if input is a file or folder + if os.path.isfile(input_spectra_path): + # Single file input + all_spectrum_files = [input_spectra_path] + elif os.path.isdir(input_spectra_path): + # Folder input - list all spectrum files + all_mgf_files = glob.glob(os.path.join(input_spectra_path, "*.mgf")) + all_mzxml_files = glob.glob(os.path.join(input_spectra_path, "*.mzXML")) + all_mzml_files = glob.glob(os.path.join(input_spectra_path, "*.mzML")) + + all_spectrum_files = all_mgf_files + all_mzxml_files + all_mzml_files + + if len(all_spectrum_files) == 0: + print(f"ERROR: No spectrum files found in {input_spectra_path}") + sys.exit(1) + + # Sort these filenames + all_spectrum_files.sort() + else: + print(f"ERROR: Input path does not exist: {input_spectra_path}") + sys.exit(1) + + # Create pattern for falcon (it accepts wildcards or file list) + if len(all_spectrum_files) == 1: + mzml_pattern = all_spectrum_files[0] + else: + # Falcon can accept multiple files, we'll pass them as space-separated + mzml_pattern = " ".join(all_spectrum_files) + + # Parse precursor_tol - falcon expects two arguments + # Format can be "2.0" (just number) or "20 ppm" (value and unit) + # Falcon needs two arguments, so we need to handle both cases + precursor_tol_str = str(precursor_tol).strip() + if " " in precursor_tol_str: + # Already has format like "20 ppm" or "2.0 Da" - split into two args + parts = precursor_tol_str.split(None, 1) # Split on first space + if len(parts) == 2: + precursor_tol_args = f"{parts[0]} {parts[1]}" + else: + # Fallback: use value twice + precursor_tol_args = f"{parts[0]} {parts[0]}" + else: + # Just a number, assume Da and use twice (min and max tolerance) + try: + tol_value = float(precursor_tol_str) + precursor_tol_args = f"{tol_value} {tol_value}" + except ValueError: + # If can't parse, use default + precursor_tol_args = "2.0 2.0" + + command = ( + f"falcon {mzml_pattern} {output_prefix} " + f"--export_representatives " + f"--precursor_tol {precursor_tol_args} " + f"--fragment_tol {fragment_tol} " + f"--min_mz_range {min_mz_range} " + f"--min_mz {min_mz} --max_mz {max_mz} " + f"--eps {eps} " + f"--hash_len 400 " + f"--n_neighbors_ann 64 " + f"--n_probe 16 " + f"--batch_size 32768 " + ) + print(f"[run_falcon] Running: {command}") + process = subprocess.Popen(command, shell=True) + retcode = process.wait() + if retcode != 0: + print(f"ERROR: Falcon failed with exit code {retcode}") + sys.exit(1) + + +def main(): + # Parse arguments + parser = argparse.ArgumentParser(description='Falcon Clustering Wrapper') + parser.add_argument('input_spectra_folder', help='Input Spectra Folder') + parser.add_argument('output_spectra_folder', help='Output Spectra Folder') + parser.add_argument('final_output_folder', help='final_output_folder') + + parser.add_argument('--min_cluster_size', default="2", help='min_cluster_size (not used by falcon directly)') + + parser.add_argument('--pm_tolerance', default="20 ppm", help='pm_tolerance (precursor tolerance)') + parser.add_argument('--fragment_tolerance', default="0.05", help='fragment_tolerance') + + # Filters (not all are used by falcon) + parser.add_argument('--min_peak_intensity', default="0.0", help='min_peak_intensity (not used by falcon)') + parser.add_argument('--window_filter', default="1", help='window_filter (not used by falcon)') + parser.add_argument('--precursor_filter', default="1", help='precursor_filter (not used by falcon)') + + # Falcon-specific parameters + parser.add_argument('--eps', default="0.1", help='Falcon eps parameter') + parser.add_argument('--min_mz', default="0", help='Falcon min_mz parameter') + parser.add_argument('--max_mz', default="30000", help='Falcon max_mz parameter') + + args = parser.parse_args() + + # Check if input is a file or folder (Nextflow may pass a file) + input_spectra_path = args.input_spectra_folder + if os.path.isfile(args.input_spectra_folder): + # If it's a single file, we need to use its directory + input_spectra_path = os.path.dirname(args.input_spectra_folder) + if not input_spectra_path: + input_spectra_path = "." + + # Running falcon + output_prefix = "falcon" + run_falcon(input_spectra_path, output_prefix, + precursor_tol=args.pm_tolerance, + fragment_tol=float(args.fragment_tolerance), + min_mz_range=0, + min_mz=int(args.min_mz), + max_mz=int(args.max_mz), + eps=float(args.eps)) + + # Falcon outputs: + # - falcon.csv (cluster assignments) + # - falcon.mgf (representative spectra) + + falcon_csv = f"{output_prefix}.csv" + falcon_mgf = f"{output_prefix}.mgf" + + if not os.path.exists(falcon_csv): + print(f"ERROR: Falcon output {falcon_csv} not found") + sys.exit(1) + + if not os.path.exists(falcon_mgf): + print(f"ERROR: Falcon output {falcon_mgf} not found") + sys.exit(1) + + # Convert falcon output to mscluster format first + script_dir = os.path.dirname(os.path.abspath(__file__)) + convert_script = os.path.join(script_dir, "convert_falcon_to_mscluster_format.py") + + clusterinfo_file = os.path.join(args.final_output_folder, "clusterinfo.tsv") + clustersummary_file = os.path.join(args.final_output_folder, "clustersummary.tsv") + + # For conversion, we need the original input path to get spectrum info + convert_input_path = input_spectra_path + + convert_cmd = ( + f"python {convert_script} " + f"{falcon_csv} " + f"{convert_input_path} " + f"{clusterinfo_file} " + f"{clustersummary_file} " + f"--min_cluster_size {args.min_cluster_size}" + ) + + print(f"Converting falcon output to mscluster format...") + print(f"Running: {convert_cmd}") + ret_code = os.system(convert_cmd) + + if ret_code != 0: + print(f"ERROR: Conversion failed with exit code {ret_code}") + sys.exit(1) + + # Now update MGF file with correct scan numbers based on clusterinfo.tsv + # The scan number in MGF should match the #ClusterIdx in clusterinfo.tsv + # Read clusterinfo to get cluster indices + clusterinfo_df = pd.read_csv(clusterinfo_file, sep='\t') + + # Get unique cluster indices (these will be the scan numbers in MGF) + unique_clusters = sorted(clusterinfo_df['#ClusterIdx'].unique()) + + # Read falcon MGF file - parse manually to access cluster field + output_mgf_filename = os.path.join(args.final_output_folder, "specs_ms.mgf") + os.makedirs(args.final_output_folder, exist_ok=True) + + # Parse falcon MGF to get cluster information and update scan numbers + mgf_spectra = [] + with open(falcon_mgf, 'r') as f: + spec = None + for line in f: + line = line.strip() + if line == 'BEGIN IONS': + spec = {'peaks': [], 'headers': {}} + elif line == 'END IONS': + if spec is not None: + mgf_spectra.append(spec) + spec = None + elif spec is not None: + if '=' in line: + key, val = line.split('=', 1) + spec['headers'][key.upper()] = val + else: + # Peak data + parts = line.split() + if len(parts) == 2: + try: + mz, intensity = float(parts[0]), float(parts[1]) + spec['peaks'].append((mz, intensity)) + except: + pass + + # Create mapping: falcon cluster (0-based) -> mscluster cluster index (1-based, which is #ClusterIdx) + # falcon cluster IDs in MGF are 0-based, but we converted to 1-based in clusterinfo + cluster_to_scan = {} + for cluster_idx in unique_clusters: + # falcon uses 0-based, mscluster uses 1-based + falcon_cluster = cluster_idx - 1 + cluster_to_scan[falcon_cluster] = cluster_idx + + # Write MGF with correct scan numbers + # Only include spectra that correspond to clusters in clusterinfo.tsv + # This ensures MGF and clusterinfo.tsv are consistent + skipped_count = 0 + with open(output_mgf_filename, 'w') as out_mgf: + for spec in mgf_spectra: + # Get cluster ID from falcon MGF + falcon_cluster = -1 + if 'CLUSTER' in spec['headers']: + try: + falcon_cluster = int(spec['headers']['CLUSTER']) + except (ValueError, TypeError): + pass + + # Only include spectra that are in clusterinfo.tsv + # Skip clusters that were filtered out by min_cluster_size + if falcon_cluster in cluster_to_scan: + scan_number = cluster_to_scan[falcon_cluster] + + # Write MGF entry with correct scan number + out_mgf.write("BEGIN IONS\n") + + # Write headers, updating SCANS= line + for key, val in spec['headers'].items(): + if key == 'SCANS': + out_mgf.write(f"SCANS={scan_number}\n") + else: + out_mgf.write(f"{key}={val}\n") + + # Add SCANS if it wasn't in the original + if 'SCANS' not in spec['headers']: + out_mgf.write(f"SCANS={scan_number}\n") + + # Write peaks + for mz, intensity in spec['peaks']: + out_mgf.write(f"{mz} {intensity}\n") + + out_mgf.write("END IONS\n\n") + else: + # Skip clusters that are not in clusterinfo.tsv (filtered out by min_cluster_size) + skipped_count += 1 + if falcon_cluster != -1: # Only warn for non-singleton clusters + print(f"INFO: Skipping falcon cluster {falcon_cluster} (filtered out by min_cluster_size)") + + if skipped_count > 0: + print(f"INFO: Skipped {skipped_count} spectra that were filtered out by min_cluster_size") + + print(f"Falcon clustering completed. Output files:") + print(f" - MGF: {output_mgf_filename}") + print(f" - ClusterInfo: {clusterinfo_file}") + print(f" - ClusterSummary: {clustersummary_file}") + + +if __name__ == "__main__": + main() diff --git a/bin/scripts/summarize_results_falcon.py b/bin/scripts/summarize_results_falcon.py new file mode 100644 index 0000000..d6ec09e --- /dev/null +++ b/bin/scripts/summarize_results_falcon.py @@ -0,0 +1,88 @@ +import os +import pandas as pd +import argparse +import numpy as np + +def rewrite_falcon_mgf(mgf_input, mgf_output): + with open(mgf_input, 'r') as infile, open(mgf_output, 'w') as outfile: + for line in infile: + if line.startswith("CLUSTER="): + try: + cluster_num = int(line.strip().split("=")[1]) + outfile.write(f"SCANS={cluster_num + 1}\n") + except ValueError: + outfile.write(line) + else: + outfile.write(line) + +def main(): + parser = argparse.ArgumentParser(description='Summarizing Falcon Results') + parser.add_argument('falcon_clusters', help='falcon_clusters') + parser.add_argument('falcon_mgf', help='falcon_mgf') + #parser.add_argument('output_summary_folder', help='output_summary_folder') + args = parser.parse_args() + + clusterinfo_df = pd.read_csv(args.falcon_clusters, sep=',', comment='#') + + print(args) + print(clusterinfo_df) + + clusterinfo_df = clusterinfo_df.sort_values(by='cluster', key=lambda x: x.replace(-1, np.inf)) + + # Filtering out not in clusters data + clusterinfo_df = clusterinfo_df[clusterinfo_df["cluster"] != -1] + + + # Grouping by cluster + grouped_cluster_df = clusterinfo_df.groupby(["cluster"]) + cluster_summary_list = [] + for cluster, cluster_group_df in grouped_cluster_df: + #TODO :Read these from mgf, as the representative is a medoid + + cluster_count = len(cluster_group_df) + cluster_mz = cluster_group_df["precursor_mz"].mean() + cluster_rt = cluster_group_df["retention_time"].mean() + cluster_charge = cluster_group_df["precursor_charge"].mean() + # adjust the col name to map the classical MN wokflow + output_dict = {} + output_dict["number of spectra"] = cluster_count + output_dict["parent mass"] = cluster_mz + output_dict["RTMean"] = cluster_rt + output_dict["precursor charge"] = cluster_charge + output_dict["cluster index"] = cluster[0] + 1 + + cluster_summary_list.append(output_dict) + + # Creating a cluster summary + cluster_summary_df = pd.DataFrame(cluster_summary_list) + cluster_summary_df.to_csv("clustersummary.tsv", sep='\t', index=False) + + # Creating cluster info + clusterinfo_df["filename"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[2] + ".mzML") + clusterinfo_df["scan"] = clusterinfo_df["identifier"].apply(lambda x: x.split(":")[-1]) + + # Rename relevant columns to MS-Cluster format + clusterinfo_df = clusterinfo_df.rename(columns={ + "filename": "#Filename", + "cluster": "#ClusterIdx", + "scan": "#Scan", + "precursor_mz": "#ParentMass", + "precursor_charge": "#Charge", + "retention_time": "#RetTime" + }) + + # Just to prevent other processes in the workflow raise error + clusterinfo_df["#PrecIntensity"] = 0 + + # Select required columns + clusterinfo_df = clusterinfo_df[[ + "#ClusterIdx", "#Filename", "#Scan", "#ParentMass", "#Charge", "#RetTime", "#PrecIntensity" + ]] + clusterinfo_df.to_csv("clusterinfo.tsv", sep='\t', index=False) + + #TODO: Rewriting MGF files + rewrite_falcon_mgf(args.falcon_mgf, "specs_ms.mgf") + # TODO: Maybe make this compatible with FBMN, since its already clustered. + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/nf_workflow.nf b/nf_workflow.nf index 865f302..fd74c3e 100644 --- a/nf_workflow.nf +++ b/nf_workflow.nf @@ -13,12 +13,20 @@ params.metadata_per_file_grouping = "No" // Yes means that each file can be its params.metadata_filename = "data/metadata.tsv" // Clustering Parameters +params.clustering_tool = "mscluster" // or "falcon" params.min_cluster_size = "2" // Tolerance Parameters params.pm_tolerance = "2.0" params.fragment_tolerance = "0.5" +// Falcon-specific parameters +params.falcon_pm_tolerance = "20 ppm" +params.falcon_fragment_tolerance = "0.05" +params.falcon_eps = "0.1" +params.falcon_min_mz = "0" +params.falcon_max_mz = "30000" + // Filtering params.min_peak_intensity = "0.0" params.window_filter = "1" @@ -113,6 +121,41 @@ process mscluster { """ } +process falcon { + publishDir "$params.publishdir/nf_output", mode: 'copy' + + conda "$TOOL_FOLDER/conda_env_falcon.yml" + + // This is necessary because the glibc libraries are not always used in the conda environment, and defaults to the system which could be old + beforeScript 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib' + + input: + file inputSpectra + val ready + + output: + file 'clustering/specs_ms.mgf' + file 'clustering/clusterinfo.tsv' + file 'clustering/clustersummary.tsv' + + """ + mkdir clustering + python $TOOL_FOLDER/scripts/falcon_wrapper.py \ + $inputSpectra \ + spectra \ + clustering \ + --min_cluster_size $params.min_cluster_size \ + --pm_tolerance "$params.falcon_pm_tolerance" \ + --fragment_tolerance $params.falcon_fragment_tolerance \ + --min_peak_intensity $params.min_peak_intensity \ + --window_filter $params.window_filter \ + --precursor_filter $params.precursor_filter \ + --eps $params.falcon_eps \ + --min_mz $params.falcon_min_mz \ + --max_mz $params.falcon_max_mz + """ +} + // TODO: Finish Implementing this, as this is currently an no-op process massqlFilterSpectra { publishDir "$params.publishdir/nf_output", mode: 'copy' @@ -569,8 +612,19 @@ workflow { // File summaries filesummary(input_spectra_ch, _download_ready) + // Note: For subsequent processes (library search, networking), they use pm_tolerance and fragment_tolerance + // These are set to mscluster defaults. If using falcon, users should be aware that downstream processes + // will use the mscluster tolerance values unless they also update those parameters. + // This is acceptable because downstream processes work on clustered spectra, and the tolerance + // for library search and networking can be different from clustering tolerance. + // Clustering - (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) + if(params.clustering_tool == "falcon"){ + (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = falcon(input_spectra_ch, _download_ready) + } + else{ + (clustered_spectra_intermediate_ch, clusterinfo_ch, clustersummary_ch) = mscluster(input_spectra_ch, _download_ready) + } if(params.massql_filter != "None"){ clustered_spectra_ch = massqlFilterSpectra(clustered_spectra_intermediate_ch) diff --git a/workflowinput.yaml b/workflowinput.yaml index 2435e31..fb869a5 100644 --- a/workflowinput.yaml +++ b/workflowinput.yaml @@ -1,7 +1,7 @@ workflowname: classical_networking_workflow workflowdescription: This is Classical Molecular Networking for GNPS2 workflowlongdescription: This is Classical Molecular Networking for GNPS2 -workflowversion: "2026.01.12" +workflowversion: "2026.01.13" workflowfile: nf_workflow.nf workflowautohide: false adminonly: false @@ -88,6 +88,17 @@ parameterlist: - displayname: Clustering Parameters paramtype: section + - displayname: Clustering Tool + paramtype: select + nf_paramname: clustering_tool + formvalue: "mscluster" + options: + - value: "mscluster" + display: "MSCluster" + - value: "falcon" + display: "Falcon" + tooltip: "Select the clustering tool to use for spectral clustering" + - displayname: Minimum Cluster Size (Or Disable Clustering) paramtype: select nf_paramname: min_cluster_size @@ -105,6 +116,68 @@ parameterlist: display: "4" - value: "5" display: "5" + + - displayname: Falcon Parameters + paramtype: section + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Precursor Ion Tolerance + paramtype: text + nf_paramname: falcon_pm_tolerance + formplaceholder: Enter the pm_tolerance (e.g., 20 ppm or 2.0 Da) + formvalue: "20 ppm" + tooltip: "Precursor tolerance with unit (e.g., 20 ppm or 2.0 Da)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Fragment Ion Tolerance + paramtype: text + nf_paramname: falcon_fragment_tolerance + formplaceholder: Enter the fragment_tolerance + formvalue: "0.05" + tooltip: "Fragment tolerance (Da)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Epsilon (eps) + paramtype: text + nf_paramname: falcon_eps + formplaceholder: Enter the falcon_eps + formvalue: "0.1" + tooltip: "Falcon clustering epsilon parameter (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Min MZ + paramtype: text + nf_paramname: falcon_min_mz + formplaceholder: Enter the falcon_min_mz + formvalue: "0" + tooltip: "Falcon minimum m/z value (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" + + - displayname: Falcon Max MZ + paramtype: text + nf_paramname: falcon_max_mz + formplaceholder: Enter the falcon_max_mz + formvalue: "30000" + tooltip: "Falcon maximum m/z value (only used when clustering_tool is falcon)" + showif: + - condition: + - key: clustering_tool + value: "falcon" - displayname: Advanced Filtering Parameters paramtype: section