diff --git a/CHANGELOG.md b/CHANGELOG.md index f44da030a0..b79f7788d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ - Pin j178/prek-action action to 91fd7d7 ([#3931](https://github.com/nf-core/tools/pull/3931)) - add pre-commit hook to keep uv.lock in sync ([#3933](https://github.com/nf-core/tools/pull/3933)) - Update mcr.microsoft.com/devcontainers/miniconda Docker digest to 2be0f5a ([#3946](https://github.com/nf-core/tools/pull/3946)) +- Fix quote handling in meta.yml ([#3948](https://github.com/nf-core/tools/pull/3948)) - Fix docker errors in test ([#3924](https://github.com/nf-core/tools/pull/3924)) - Update actions/checkout digest to 8e8c483 ([#3956](https://github.com/nf-core/tools/pull/3956)) - Update GitHub Actions ([#3957](https://github.com/nf-core/tools/pull/3957)) diff --git a/nf_core/components/nfcore_component.py b/nf_core/components/nfcore_component.py index 8d2a5b9c55..a9ffc62ae6 100644 --- a/nf_core/components/nfcore_component.py +++ b/nf_core/components/nfcore_component.py @@ -205,22 +205,13 @@ def get_inputs_from_main_nf(self) -> None: log.debug(f"Could not find any inputs in {self.main_nf}") return input_data = data.split("input:")[1].split("output:")[0] + regex_keyword = r"\b(val|path)\b" for line in input_data.split("\n"): channel_elements: Any = [] line = line.split("//")[0] # remove any trailing comments - regex = r"\b(val|path)\b\s*(\(([^)]+)\)|\s*([^)\s,]+))" - matches = re.finditer(regex, line) - for _, match in enumerate(matches, start=1): - input_val = None - if match.group(3): - input_val = match.group(3).split(",")[0] # handle `files, stageAs: "inputs/*"` cases - elif match.group(4): - input_val = match.group(4).split(",")[0] # handle `files, stageAs: "inputs/*"` cases - if input_val: - input_val = re.split(r',(?=(?:[^\'"]*[\'"][^\'"]*[\'"])*[^\'"]*$)', input_val)[ - 0 - ] # Takes only first part, avoid commas in quotes - input_val = input_val.strip().strip("'").strip('"') # remove quotes and whitespaces + for match 
in re.finditer(regex_keyword, line): + if input_val := self._extract_value_from_line(line, match.end()): + input_val = self._split_first_param(input_val) channel_elements.append({input_val: {}}) if len(channel_elements) == 1: inputs.append(channel_elements[0]) @@ -244,6 +235,56 @@ def get_inputs_from_main_nf(self) -> None: log.debug(f"Found {len(inputs)} inputs in {self.main_nf}") self.inputs = inputs + def _split_first_param(self, value: str) -> str: + """ + Extract first parameter from comma-separated list, respecting quotes. + + Args: + value: String that may contain comma-separated parameters + + Returns: + First parameter with whitespace stripped + """ + result = re.split(r',(?=(?:[^\'"]*[\'"][^\'"]*[\'"])*[^\'"]*$)', value)[0] + return result.strip() + + def _extract_value_from_line(self, line: str, pos: int) -> str | None: + """ + Extract value after keyword, handling parentheses and quotes. + + Uses a simple state machine to find matching closing parenthesis + while respecting quoted strings. 
+ + Args: + line: The line to parse + pos: Position in line where keyword ends + + Returns: + Extracted value or None if not found + """ + rest = line[pos:].lstrip() + if not rest: + return None + + if not rest.startswith("("): + # No parentheses, extract until comma or newline + match = re.match(r"([^,\n]*)", rest) + return match.group(1).strip() if match else None + + # Find matching closing parentheses, respecting quotes + depth = 0 + in_quote = None + for i, char in enumerate(rest): + if char in ('"', "'") and (i == 0 or rest[i - 1] != "\\"): + in_quote = char if in_quote is None else (None if in_quote == char else in_quote) + elif char == "(" and in_quote is None: + depth += 1 + elif char == ")" and in_quote is None: + depth -= 1 + if depth == 0: + return rest[1:i] # Return content between parentheses + return None + def get_outputs_from_main_nf(self): with open(self.main_nf) as f: data = f.read() @@ -256,25 +297,16 @@ def get_outputs_from_main_nf(self): output_data = data.split("output:")[1].split("when:")[0] log.debug(f"Found output_data: {output_data}") regex_emit = r"emit:\s*([^)\s,]+)" - regex_elements = r"\b(val|path|env|stdout|eval)\b\s*(\(([^)]+)\)|\s*([^)\s,]+))" + regex_keyword = r"\b(val|path|env|stdout|eval)\b" for line in output_data.split("\n"): match_emit = re.search(regex_emit, line) - matches_elements = re.finditer(regex_elements, line) if not match_emit: continue channel_elements = [] outputs[match_emit.group(1)] = [] - for _, match_element in enumerate(matches_elements, start=1): - output_val = None - if match_element.group(3): - output_val = match_element.group(3) - elif match_element.group(4): - output_val = match_element.group(4) - if output_val: - output_val = re.split(r',(?=(?:[^\'"]*[\'"][^\'"]*[\'"])*[^\'"]*$)', output_val)[ - 0 - ] # Takes only first part, avoid commas in quotes - output_val = output_val.strip().strip("'").strip('"') # remove quotes and whitespaces + for match in re.finditer(regex_keyword, line): + if output_val := 
self._extract_value_from_line(line, match.end()): + output_val = self._split_first_param(output_val) channel_elements.append({output_val: {}}) if len(channel_elements) == 1: outputs[match_emit.group(1)].append(channel_elements[0]) @@ -312,27 +344,18 @@ def get_topics_from_main_nf(self) -> None: output_data = data.split("output:")[1].split("when:")[0] log.debug(f"Output data: {output_data}") regex_topic = r"topic:\s*([^)\s,]+)" - regex_elements = r"\b(val|path|env|stdout|eval)\b\s*(\(([^)]+)\)|\s*([^)\s,]+))" + regex_keyword = r"\b(val|path|env|stdout|eval)\b" for line in output_data.split("\n"): match_topic = re.search(regex_topic, line) - matches_elements = re.finditer(regex_elements, line) if not match_topic: continue channel_elements: list[dict[str, dict]] = [] topic_name = match_topic.group(1) if topic_name not in topics: topics[topic_name] = [] - for _, match_element in enumerate(matches_elements, start=1): - topic_val = None - if match_element.group(3): - topic_val = match_element.group(3) - elif match_element.group(4): - topic_val = match_element.group(4) - if topic_val: - topic_val = re.split(r',(?=(?:[^\'"]*[\'"][^\'"]*[\'"])*[^\'"]*$)', topic_val)[ - 0 - ] # Takes only first part, avoid commas in quotes - topic_val = topic_val.strip().strip("'").strip('"') # remove quotes and whitespaces + for match in re.finditer(regex_keyword, line): + if topic_val := self._extract_value_from_line(line, match.end()): + topic_val = self._split_first_param(topic_val) channel_elements.append({topic_val: {}}) if len(channel_elements) == 1: topics[topic_name].append(channel_elements[0]) diff --git a/nf_core/modules/lint/__init__.py b/nf_core/modules/lint/__init__.py index b05d2623e1..95f001c1d9 100644 --- a/nf_core/modules/lint/__init__.py +++ b/nf_core/modules/lint/__init__.py @@ -25,6 +25,7 @@ from nf_core.components.lint import ComponentLint, LintExceptionError, LintResult from nf_core.components.nfcore_component import NFCoreComponent from nf_core.pipelines.lint_utils 
def _find_meta_info(meta_yml: dict, element_name: str, is_output=False) -> dict:
    """Look up the metadata recorded in meta.yml for a single channel element.

    The element name parsed from main.nf may still carry its quotes
    (e.g. '"*.html"' or "'bpipe'"), while meta.yml keys normally do not, so
    both sides are passed through `unquote` before being compared.

    Returns the metadata of the first matching element, or an empty dict when
    nothing matches (or the match has no metadata at all).
    """
    wanted = unquote(element_name)

    # Old-style outputs are a list of single-channel dicts: merge them into one dict
    if is_output and isinstance(meta_yml, list):
        meta_yml = {k: v for d in meta_yml for k, v in d.items()}

    def _iter_channels():
        # Inputs are stored as a list of channels; outputs and topics as a dict
        # mapping a channel name to its list of channels.
        if isinstance(meta_yml, list):
            yield from meta_yml
        elif isinstance(meta_yml, dict):
            for channel_group in meta_yml.values():
                yield from channel_group

    for channel in _iter_channels():
        if isinstance(channel, list):
            candidates = channel
        elif isinstance(channel, dict):
            candidates = [channel]
        else:
            continue
        for candidate in candidates:
            first_key = list(candidate.keys())[0]
            if wanted == unquote(first_key):
                found = candidate[first_key]
                if found is not None:
                    return found

    return {}
@@ -422,25 +451,58 @@ def _populate_channel_elements(io_type, correct_value, meta_value, mod_io_data, if isinstance(channel, list): for j, element in enumerate(channel): element_name = list(element.keys())[0] - corrected_data[i][j][element_name] = _find_meta_info(meta_yml_io, element_name) + normalized_name = unquote(element_name) + corrected_data[i][j] = {normalized_name: _find_meta_info(meta_yml_io, element_name)} elif isinstance(channel, dict): element_name = list(channel.keys())[0] - corrected_data[i][element_name] = _find_meta_info(meta_yml_io, element_name) + normalized_name = unquote(element_name) + corrected_data[i] = {normalized_name: _find_meta_info(meta_yml_io, element_name)} else: # Output and topics structure: { name: [[ {meta:{}}, {*.bam:{}} ]], other: [ {*.fa:{}} ] } - for ch_name in corrected_data.keys(): - for i, ch_content in enumerate(corrected_data[ch_name]): + # Use the original meta_yml_io as the base to preserve all existing metadata + # Only update structure when it differs from main.nf + corrected_data = meta_yml_io.copy() if meta_yml_io else mod_io_data.copy() + + for ch_name in mod_io_data.keys(): + # Ensure channel exists in corrected_data + if ch_name not in corrected_data: + corrected_data[ch_name] = mod_io_data[ch_name] + + for i, ch_content in enumerate(mod_io_data[ch_name]): + # Ensure index exists + if i >= len(corrected_data[ch_name]): + corrected_data[ch_name].append([]) # Initialize empty, we'll populate below + if isinstance(ch_content, list): + # Rebuild list with normalized keys + normalized_list = [] for j, element in enumerate(ch_content): element_name = list(element.keys())[0] - corrected_data[ch_name][i][j][element_name] = _find_meta_info( - meta_yml_io, element_name, is_output=True - ) + normalized_name = unquote(element_name) + element_meta = _find_meta_info(meta_yml_io, element_name, is_output=True) + + # For topics, add default type and description if empty + if io_type == "topics" and not element_meta: + element_meta 
def _populate_versions_metadata(section_name: str, section_data: dict) -> None:
    """Fill in template metadata for version elements that lack it.

    Only `versions_*` channels of the "output" section and the "versions"
    channel of the "topics" section are touched. Within those, each element of
    a list-type channel that has no metadata, or has neither a "type" nor a
    "description" key, receives a copy of the `topic_metadata` entry at the
    same position. `section_data` is modified in place.
    """
    for ch_name, ch_data in section_data.items():
        is_versions_output = section_name == "output" and ch_name.startswith("versions_")
        is_versions_topic = section_name == "topics" and ch_name == "versions"
        if not (is_versions_output or is_versions_topic):
            continue

        for ch_content in ch_data:
            if not isinstance(ch_content, list):
                continue
            for j, element in enumerate(ch_content):
                normalized_name = unquote(list(element.keys())[0])
                current_meta = element.get(normalized_name, {})
                # Keep metadata that already has a type or a description
                if current_meta and any(k in current_meta for k in ["type", "description"]):
                    continue
                if j < len(topic_metadata):
                    element_meta = topic_metadata[j].copy()
                else:
                    element_meta = {}
                element[normalized_name] = element_meta
                log.debug(
                    f"Adding metadata to {section_name}.{ch_name} for '{normalized_name}' at index {j}"
                )
hasattr(corrected_meta_yml["output"][versions_key], "yaml_set_anchor"): corrected_meta_yml["output"][versions_key].yaml_set_anchor(versions_key) - corrected_meta_yml = _sort_meta_yml(corrected_meta_yml) with open(mod.meta_yml, "w") as fh: diff --git a/nf_core/modules/lint/meta_yml.py b/nf_core/modules/lint/meta_yml.py index a3ce3e7381..c64f3517be 100644 --- a/nf_core/modules/lint/meta_yml.py +++ b/nf_core/modules/lint/meta_yml.py @@ -10,6 +10,7 @@ from nf_core.components.components_differ import ComponentsDiffer from nf_core.components.lint import ComponentLint, LintExceptionError from nf_core.components.nfcore_component import NFCoreComponent +from nf_core.utils import unquote if TYPE_CHECKING: from nf_core.modules.lint import ModuleLint @@ -304,15 +305,17 @@ def obtain_inputs(_, inputs: list) -> list: Returns: formatted_inputs (dict): A dictionary containing the inputs and their elements obtained from main.nf or meta.yml files. """ - formatted_inputs = [] + formatted_inputs: list[list[str] | str] = [] for input_channel in inputs: if isinstance(input_channel, list): channel_elements = [] for element in input_channel: - channel_elements.append(list(element.keys())[0]) + key = list(element.keys())[0] + channel_elements.append(unquote(key)) formatted_inputs.append(channel_elements) else: - formatted_inputs.append(list(input_channel.keys())[0]) + key = list(input_channel.keys())[0] + formatted_inputs.append(unquote(key)) return formatted_inputs @@ -339,9 +342,11 @@ def obtain_outputs(_, outputs: dict | list) -> dict | list: if isinstance(element, list): channel_elements.append([]) for e in element: - channel_elements[-1].append(list(e.keys())[0]) + key = list(e.keys())[0] + channel_elements[-1].append(unquote(key)) else: - channel_elements.append(list(element.keys())[0]) + key = list(element.keys())[0] + channel_elements.append(unquote(key)) formatted_outputs[channel_name] = channel_elements if old_structure: @@ -368,9 +373,11 @@ def obtain_topics(_, topics: dict) -> 
def unquote(s: str) -> str:
    """
    Remove one pair of matching quotes (single or double) around a string.

    Only the outermost pair is removed, and only when the same quote character
    does not occur again inside the string. The content is returned verbatim:
    escape sequences are not interpreted and the text is never evaluated as a
    Python literal, so inputs such as "1", "[1, 2]" or "None" come back as the
    same strings instead of being turned into other objects.

    Args:
        s: String potentially wrapped in quotes

    Returns:
        String with outer quotes removed if present, otherwise the original
        value unchanged (non-string values are returned as they are)
    """
    if not isinstance(s, str):
        # e.g. a non-string YAML key: nothing to unquote
        return s

    stripped = s.strip()
    if len(stripped) >= 2 and stripped[0] == stripped[-1] and stripped[0] in ("'", '"'):
        inner = stripped[1:-1]
        # A repeated quote inside means this is not one simple quoted value
        # (e.g. '"a" + "b"'), so leave the input alone.
        if stripped[0] not in inner:
            return inner
    return s