Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ custom_content:
- input
- input_seeds
- input_network
- network_node_degree_distribution
- drugstone_link
- overlap
- jaccard_similarity
Expand Down
16 changes: 16 additions & 0 deletions assets/network_node_degree_distribution_header.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# MultiQC custom-content header for the network node degree distribution.
# bin/multiqc_formatter.py selects its formatter from the `id` value below
# and fills in the `data` key at the end of this file.
parent_id: input
id: network_node_degree_distribution
parent_name: Input
section_name: Network node degree distribution
description: Node degree distributions of the input network(s).
plot_type: linegraph
pconfig:
id: network_node_degree_distribution_line_graph
title: Node Degree Distribution
xlab: Node Degree
# One label per dataset. multiqc_formatter.py writes the datasets in the
# order [absolute counts, relative percentages], matching the order here.
data_labels:
- name: Counts
ylab: Counts
- name: Percentages
ylab: Percentage
# Intentionally empty: multiqc_formatter.py replaces it with the per-network data.
data:
48 changes: 47 additions & 1 deletion bin/graph_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@

import argparse
import csv
import json
import logging
import sys
import os
import graph_tool.all as gt
from pathlib import Path
from collections import Counter
import yaml

logger = logging.getLogger()

Expand Down Expand Up @@ -60,6 +63,42 @@ def save_multiqc(g, stem):
)


def save_node_degree_distribution(g, stem):
    """Write the node degree distribution of ``g`` to a YAML file.

    The output file ``<stem>.node_degree_distribution.yaml`` holds three keys:
    ``name`` (the stem), ``absolute`` (a list of ``[degree, count]`` pairs) and
    ``relative`` (a list of ``[degree, percentage]`` pairs), both sorted by
    ascending degree.

    Args:
        g: Graph whose vertices are inspected via ``vertices()`` and
            ``out_degree()``.
        stem: Prefix of the output file name; also stored as ``name``.
    """
    # NOTE(review): out_degree() is used as "the" degree, which presumably
    # relies on the network being undirected -- confirm against the parser.
    histogram = Counter(v.out_degree() for v in g.vertices())

    # Every vertex contributes exactly one degree, so the counts sum up to the
    # number of vertices. For an empty graph no pair exists and the division
    # below is never evaluated.
    n_vertices = sum(histogram.values())

    # Sort once and derive both series from the same ordering.
    by_degree = sorted(histogram.items())
    counts = [[degree, n] for degree, n in by_degree]
    percentages = [[degree, n / n_vertices * 100] for degree, n in by_degree]

    payload = {
        "name": stem,
        "absolute": counts,
        "relative": percentages,
    }

    with open(f"{stem}.node_degree_distribution.yaml", "w") as handle:
        yaml.safe_dump(
            payload,
            handle,
            sort_keys=False,
            default_flow_style=None,  # keeps list pairs as [x, y]
        )


def save_diamond(g, stem):
with open(f"{stem}.diamond.csv", "w") as file:
writer = csv.writer(file, lineterminator="\n")
Expand Down Expand Up @@ -108,6 +147,7 @@ def save(g, stem, format):
if format == "gt":
save_gt(g=g, stem=stem)
save_multiqc(g=g, stem=stem)
save_node_degree_distribution(g=g, stem=stem)
elif format == "diamond":
save_diamond(g=g, stem=stem)
elif format == "domino":
Expand Down Expand Up @@ -170,7 +210,13 @@ def parse_args(argv=None):
"-f",
"--format",
help="Output format (default gt). If format it gt, a summary file for multiqc will be generated as well.",
choices=("gt", "diamond", "domino", "robust", "rwr"),
choices=(
"gt",
"diamond",
"domino",
"robust",
"rwr",
),
default="gt",
)
parser.add_argument(
Expand Down
64 changes: 64 additions & 0 deletions bin/multiqc_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/usr/bin/env python
import argparse
from pathlib import Path
import yaml
import sys


def parse_args(argv=None):
    """Parse the command-line arguments of the MultiQC formatter.

    Args:
        argv: Argument list to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``input`` (list of ``Path``) and ``header``
        (``Path``).
    """
    parser = argparse.ArgumentParser(
        description="formats file for multiqc custom contents",
        # The example must only use options that exist: -i/--input and the
        # required -H/--header.
        epilog=(
            "Example: python multiqc_formatter.py "
            "-i network.node_degree_distribution.yaml "
            "-H network_node_degree_distribution_header.yaml"
        ),
    )
    parser.add_argument(
        "-i", "--input", type=Path, nargs="*", required=True, help="Input files"
    )
    parser.add_argument("-H", "--header", type=Path, required=True, help="Header file")
    return parser.parse_args(argv)


def parse_input(input_files, header_file):
    """Dispatch the input files to the formatter selected by the header's ``id``.

    Args:
        input_files: Paths of the files to merge into the MultiQC payload.
        header_file: Path of the MultiQC custom-content header (YAML).

    Raises:
        ValueError: If the header has no ``id`` or the ``id`` has no formatter.
    """
    with open(header_file, "r", encoding="utf-8") as header:
        # An empty YAML file loads as None; treat it like a header without id.
        header_data = yaml.safe_load(header) or {}
    header_id = header_data.get("id", "")

    if header_id == "network_node_degree_distribution":
        save_node_degree_distribution(input_files, header_file)
    else:
        # Fail loudly: returning silently would leave the caller without any
        # output file and without a hint about the cause.
        raise ValueError(
            f"Unsupported MultiQC header id {header_id!r} in {header_file}: "
            "expected 'network_node_degree_distribution'"
        )


def save_node_degree_distribution(input_files, header_file):
    """Merge per-network degree distributions into one MultiQC YAML file.

    The header is loaded as the base payload. Its ``data`` key is set to a
    two-element list: a mapping of network name to absolute counts, followed
    by a mapping of network name to relative frequencies. The result is
    written to ``./node_degree_distribution_mqc.yaml``.

    Args:
        input_files: Paths of the distribution YAML files, one per network.
        header_file: Path of the MultiQC custom-content header (YAML).

    Raises:
        ValueError: If an input file lacks the ``absolute`` or ``relative`` key.
    """
    with open(header_file, "r", encoding="utf-8") as header_handle:
        payload = yaml.safe_load(header_handle) or {}

    counts_by_network = {}
    percentages_by_network = {}

    for file in input_files:
        with open(file, "r", encoding="utf-8") as handle:
            content = yaml.safe_load(handle) or {}

        # Fall back to the file stem when the file carries no usable name.
        label = content.get("name") or file.stem
        counts = content.get("absolute")
        percentages = content.get("relative")

        if counts is None or percentages is None:
            raise ValueError(
                f"Invalid distribution YAML in {file}: expected keys 'absolute' and 'relative'"
            )

        counts_by_network[label] = counts
        percentages_by_network[label] = percentages

    payload["data"] = [counts_by_network, percentages_by_network]

    with open("./node_degree_distribution_mqc.yaml", "w", encoding="utf-8") as out:
        yaml.safe_dump(payload, out, sort_keys=False, default_flow_style=None)


def main():
    """Parse the command line and build the MultiQC custom-content file."""
    cli = parse_args()
    parse_input(input_files=cli.input, header_file=cli.header)


# main() returns None, so the script exits with status 0 unless an exception
# propagates.
if __name__ == "__main__":
    sys.exit(main())
22 changes: 22 additions & 0 deletions bin/network_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def run(args):
# Assign component ID to each component of the subnetwork
component_id = assign_component_ids(subnetwork)

# Add node degrees as vertex properties
add_node_degrees(subnetwork, name_to_degree_full, name_to_degree_sub)

# Save the network containing the annotations in graph-tool format
subnetwork.save(args.output_file)

Expand Down Expand Up @@ -140,5 +143,24 @@ def assign_component_ids(graph):
return graph.vp["component_id"]


def add_node_degrees(subnetwork, name_to_degree_full, name_to_degree_sub):
    """
    Annotates each vertex with its degree in the full network and in the subnetwork.

    Two integer vertex properties are created on the subnetwork:
    "degree_in_full_network" and "degree_in_module". The values are looked up
    by vertex name in the given mappings; names missing from a mapping get 0.
    Both property maps are returned as a tuple (full network first).
    """
    # Register both properties up front, full network first.
    for prop_name in ("degree_in_full_network", "degree_in_module"):
        subnetwork.vp[prop_name] = subnetwork.new_vertex_property("int")

    full_degrees = subnetwork.vp["degree_in_full_network"]
    module_degrees = subnetwork.vp["degree_in_module"]

    for vertex in subnetwork.vertices():
        vertex_name = subnetwork.vp["name"][vertex]
        full_degrees[vertex] = name_to_degree_full.get(vertex_name, 0)
        module_degrees[vertex] = name_to_degree_sub.get(vertex_name, 0)

    return subnetwork.vp["degree_in_full_network"], subnetwork.vp["degree_in_module"]


if __name__ == "__main__":
main()
1 change: 1 addition & 0 deletions bin/visualize_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def parse_args(argv=None):
def main(argv=None):
"""Coordinate argument parsing and program execution."""
args = parse_args(argv)

logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s")
if not args.module.is_file():
logger.error(f"The given input file {args.file_in} was not found!")
Expand Down
2 changes: 1 addition & 1 deletion conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
process {

// Set default container for python dependencies
container = 'ghcr.io/repo4eu/modulediscovery_python_dependencies:v0.1.0'
container = 'ghcr.io/repo4eu/modulediscovery_python_dependencies:v0.2.0'

cpus = { 1 * task.attempt }
memory = { 6.GB * task.attempt }
Expand Down
7 changes: 7 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ process {
]
}

// MultiQC formatter: publish every output to <outdir>/mqc_summaries,
// except versions.yml, which saveAs maps to null so it is not published.
withName: 'MULTIQCFORMATTER' {
publishDir = [
path: { "${params.outdir}/mqc_summaries"},
mode: params.publish_dir_mode,
saveAs: {filename -> filename.equals('versions.yml') ? null : filename}
]
}
Comment thread
JohannesKersting marked this conversation as resolved.
// Input parsing

withName: GRAPHTOOLPARSER {
Expand Down
5 changes: 3 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d

### Prepare network

The [graph-tool](https://graph-tool.skewed.de/) library is used to parse the input network(s) into the [`.gt`](https://graph-tool.skewed.de/static/docs/stable/gt_format.html) format, the internal representation used for networks within the pipeline. Additionally, it is used to generate networks in the specific formats required by the various disease module inference methods. This step also gathers summary statistics for the MultiQC report, including the number of nodes and edges, the network [diameter](https://graph-tool.skewed.de/static/docs/stable/autosummary/graph_tool.topology.pseudo_diameter.html#graph_tool.topology.pseudo_diameter), the number of connected components, the size of the largest connected component, the count of self-loops (nodes with edges to themselves), and the number of duplicate edges (multiple edges connecting the same two nodes).
The [graph-tool](https://graph-tool.skewed.de/) library is used to parse the input network(s) into the [`.gt`](https://graph-tool.skewed.de/static/docs/stable/gt_format.html) format, the internal representation used for networks within the pipeline. Additionally, it is used to generate networks in the specific formats required by the various disease module inference methods. This step also gathers summary statistics for the MultiQC report, including the number of nodes and edges, the network [diameter](https://graph-tool.skewed.de/static/docs/stable/autosummary/graph_tool.topology.pseudo_diameter.html#graph_tool.topology.pseudo_diameter), the number of connected components, the size of the largest connected component, the distribution of node degrees, the count of self-loops (nodes with edges to themselves), and the number of duplicate edges (multiple edges connecting the same two nodes).

<details markdown="1">
<summary>Output files</summary>
Expand All @@ -63,6 +63,7 @@ The [graph-tool](https://graph-tool.skewed.de/) library is used to parse the inp
- `<network>.robust.tsv`: Input network in the format required for ROBUST or ROBUST (bias-aware). Only created if the methods are used.
- `<network>.rwr.csv`: Input network in the format required for RWR. Only created if the method is used.
- `mqc_summaries/`
- `node_degree_distribution_mqc.yaml`: Network node degree distribution for the MultiQC report.
- `input_network_mqc.tsv`: Network summary statistics for the MultiQC report.

</details>
Expand All @@ -84,7 +85,7 @@ The format of the input seed file(s) is validated, and any seed nodes not presen

## Disease module inference

The inferred disease modules are exported in multiple formats, including [`.gt`](https://graph-tool.skewed.de/static/docs/stable/gt_format.html), [`.graphml`](https://de.wikipedia.org/wiki/GraphML), and node and edge lists in `.tsv`. If a method returns only a node list rather than a full network, the connecting edges are extracted from the input network. Module nodes are annotated with their seed status (`is_seed`), their subnetwork participation degree ([`spd`](https://nedrex.net/tutorial/availableFunctions.html)), and a component identifier (`component_id`) to indicate which connected component they belong to. Additionally, tool-specific node properties are added, which are explained in the sections below.
The inferred disease modules are exported in multiple formats, including [`.gt`](https://graph-tool.skewed.de/static/docs/stable/gt_format.html), [`.graphml`](https://de.wikipedia.org/wiki/GraphML), and node and edge lists in `.tsv`. If a method returns only a node list rather than a full network, the connecting edges are extracted from the input network. Module nodes are annotated with their seed status (`is_seed`), their degree within both the whole network (`degree_in_full_network`) and the disease module (`degree_in_module`), their subnetwork participation degree ([`spd`](https://nedrex.net/tutorial/availableFunctions.html)), and a component identifier (`component_id`) to indicate which connected component they belong to. Additionally, tool-specific node properties are added, which are explained in the sections below.

### Only seeds

Expand Down
7 changes: 4 additions & 3 deletions modules/local/graphtoolparser/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@ process GRAPHTOOLPARSER {
val format

output:
tuple val(meta), path("*${format}*") , emit: network
tuple val(meta), path("input_network_multiqc.tsv") , emit: multiqc, optional: true
path "versions.yml" , emit: versions
tuple val(meta), path("*${format}*") , emit: network
tuple val(meta), path("input_network_multiqc.tsv") , emit: multiqc , optional: true
tuple val(meta), path("*node_degree_distribution.yaml") , emit: node_degree , optional: true
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
Expand Down
21 changes: 21 additions & 0 deletions modules/local/multiqcformatter/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Runs multiqc_formatter.py on a header file and a set of input files and
// emits the resulting MultiQC file(s) together with a versions.yml.
process MULTIQCFORMATTER {
label 'process_single'
Comment thread
JohannesKersting marked this conversation as resolved.

input:
// header: passed to the script via -H; inputFiles: staged into input/ and passed via -i
tuple path(header), path(inputFiles, stageAs: 'input/*')
output:
// Every file whose name contains "mqc" is emitted on the multiqc channel
path("*mqc*") , emit : multiqc
path "versions.yml" , emit: versions

when:
task.ext.when == null || task.ext.when
script:
"""
multiqc_formatter.py -i $inputFiles -H $header
cat <<-END_VERSIONS > versions.yml
"${task.process}":
python: \$(python --version | sed 's/Python //g')
END_VERSIONS
"""

}
Loading
Loading