Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 60 additions & 24 deletions src/resdk/tables/qc.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,7 @@

def _filter_and_rename_columns(df, column_map):
"""Filter, apply a scaling factor and rename columns based on provided specifications."""
selected_columns = []
rename_map = {}
result = pd.DataFrame(index=df.index)

for col in column_map:
names = col.get("name", [])
Expand All @@ -77,16 +76,12 @@ def _filter_and_rename_columns(df, column_map):

for name in names:
if name in df.columns:
selected_columns.append(name)
rename_map[name] = slug
# Some column values are given a specific format (e.g. millions)
# and need to be scaled on a common scale
df[name] = df[name] * scaling_factor.get(name, 1)
result[slug] = df[name] * scaling_factor.get(name, 1)
break

df = df[selected_columns].rename(columns=rename_map)

return df
return result


def _aggregate_df(df, column_map, prefix=None):
Expand Down Expand Up @@ -119,7 +114,22 @@ def _clean_sample_name(
return sfx_pattern.sub("", filename)


def general_multiqc_parser(file_object, name, column_map):
def _is_paired_end(sample) -> Optional[bool]:
reads = list(sample.data.filter(type="data:reads:fastq:"))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[High] Blocking, synchronous network call inside the asyncio download loop. sample.data.filter(...) materialized with list() runs ResolweQuery._fetch → self.api.get(...) — a blocking slumber/requests GET. It executes inside general_multiqc_parser → _parse_file, which is called from the async def _download_file coroutine gathered by asyncio.gather (base.py), with no executor / to_thread. So it stalls the event loop and serializes the very downloads the chunked async path exists to parallelize, adding a round-trip per parsed file. It is also uncached (.filter() clones with an empty cache; no memoization on _is_paired_end).

Suggest resolving library strategy once per sample off the event loop (e.g. precompute a {sample_id: Optional[bool]} map) and passing the boolean into the parser.

origin_read_obj = reads[-1] if reads else None

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Medium] reads[-1] is non-deterministic and origin_read_obj is misleading. No ordering= is set, so the "last" element depends on undefined server ordering, yet the variable is named origin_read_obj while taking the last item. The codebase idiom (shortcuts/sample.py get_reads) pins ordering='-id' and takes [0]. Combined with the strict endswith just below, which reads object is picked actually matters. Pin an explicit order (or tie selection to the reads upstream of the MultiQC Data) and rename to match what is selected.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ordering is determined by the backend, so this is not entirely applicable.

if origin_read_obj is None:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Medium] None (unknown strategy) is silently treated as single-end. The call site is if _is_paired_end(sample):, so a None return falls into the single-end branch, which sums across both mates — re-introducing exactly the 2× inflation this PR fixes, with no warning, for any genuinely paired-end sample whose reads object can't be resolved here. Handle the three states explicitly (e.g. warn or skip the adjustment on None), consistent with the UserWarning pattern already used in base.py.

return None
if origin_read_obj._process._type.endswith("paired:"):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[High] Strict endswith('paired:') / ('single:') raises on processed reads, and the failure takes down unrelated tables. Upload types like data:reads:fastq:paired: match, but a trimmed object — e.g. data:reads:fastq:single:cutadapt: (this exact shape is in tests/unit/test_sample.py) — ends in cutadapt: and hits the else: raise ValueError. Since reads[-1] often resolves to the higher-id (trimmed) object, this is plausible rather than exotic. And because _is_paired_end runs for ~15 data types via general_multiqc_parser while asyncio.gather is called without return_exceptions=True, one such sample aborts parsing of picard / flagstat / bowtie2 / etc. for the whole chunk — coupling that did not exist before this PR.

Match the library token as a path segment instead of a strict suffix (e.g. 'paired' in type.split(':')), and make the unknown case non-fatal.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an insightful comment, as the code was written with assumptions of the user running the General RNA-seq pipeline. Will modify to make it more general.

return True
elif origin_read_obj._process._type.endswith("single:"):
return False
else:
raise ValueError(
f"Cannot determine the library strategy of sample: {sample.name}"
)


def general_multiqc_parser(file_object, name, column_map, sample):
"""General parser for MultiQC files.

If the column_map argument is not specified the function will retain the original column names
Expand All @@ -138,6 +148,34 @@ def general_multiqc_parser(file_object, name, column_map):
if df.empty:
return pd.Series(name=name)

# Sum the read count per lane separately for each read mate
# and insert the resulting value into the first column for the corresponding mate.
# This is is done in order to report the actual read pairs for samples
# sequenced in paired-end mode, while preserving the sum across lanes.
fragment_metrics = [
"estimated_fragment_count_raw",
"estimated_fragment_count_trimmed",
]
fragment_columns = df.columns.intersection(fragment_metrics)

if _is_paired_end(sample):

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Medium] _is_paired_end is called unconditionally, even when there are no fragment columns. fragment_columns is computed just above but not used to gate this call. For the ~14 non-FASTQ data types routed through this parser, fragment_columns is always empty, so the per-mate loops are a no-op — yet the blocking reads query (and the ValueError risk above) is still paid for every sample. A single guard removes the waste and confines the crash surface to the one table that needs it:

if not fragment_columns.empty:
    paired = _is_paired_end(sample)
    ...  # the whole paired/single block, only here

for col in fragment_columns:
for s in ["_1", "_2"]:
mate_mask = df.index.str.endswith(s) & df[col].notna()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[High] The _1/_2 mate-detection assumption is unvalidated and fails silently for common naming; it also crashes on non-string indexes.

(1) endswith('_1') / ('_2') matches SRA-style sample_1 / sample_2 but not Illumina sample_R1_001 / sample_R2_001 or _R1 / _R2'sample_R1'.endswith('_1') is False, and MQC_CLEANED_SFXS strips .fastq / .gz but not the _R1_001 tail. When neither matches, both masks are empty, continue fires, and _aggregate_df averages all 2L lane×mate rows → the reported value is sum / 2L instead of sum / 2, i.e. 1/L of correct (under-reporting that scales with lane count). A single-lane sample returns the right number by coincidence, so a naive fixture won't catch it. Please confirm what suffix the Genialis MultiQC config actually emits; safest is to derive mate identity from the fastq Data's R1/R2 files, or at minimum match _(?:R)?[12](?:_\d+)?$ and warn/raise when a paired sample matches neither mate instead of silently averaging.

(2) df.index.str.endswith(...) raises AttributeError: Can only use .str accessor with string values when sample names are purely numeric (read_csv(index_col=0).convert_dtypes() yields an integer index). The single-end branch isn't affected, so the failure is asymmetric. Use df.index.astype(str).str.endswith(s).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will test further

if not mate_mask.any():
continue
total = df.loc[mate_mask, col].sum(min_count=1)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Medium] "Mean of the two mate totals" is an estimator that degrades silently. When only one mate's FastQC row is present, the other mate's mask is empty and mean over [mate1_total, NA…] returns the single-mate count, not a pair count. For the trimmed column, mates are trimmed independently and can legitimately differ, so the mean is a fractional value that is neither mate's count nor a true pair count (and disagrees by design with total_read_count_*, which still sums everything). Consider defining fragment count as R1's lane-summed total (== pair count by construction, immune to mate-2 asymmetry), or document the estimator and warn when |m1 − m2| is large or a mate is missing.

first_row = df.index[mate_mask][0]
df.loc[mate_mask, col] = pd.NA
df.loc[first_row, col] = total
else:
for col in fragment_columns:
first_row = df[col].first_valid_index()
total = df[col].sum(min_count=1)
df[col] = pd.NA
if first_row is not None:
df.loc[first_row, col] = total

series = _aggregate_df(df=df, column_map=column_map)

series.name = name
Expand Down Expand Up @@ -453,27 +491,25 @@ def __init__(

def _parse_file(self, file_obj, sample_id, data_type):
"""Parse file object and return one DataFrame line."""
if data_type not in self.DATA_TYPES:
raise ValueError(f"Unknown data type: {data_type}")

sample = next((s for s in self._samples if s.id == sample_id), None)

if data_type == self.MACS_PREPEAK:
case_sample = next(
(sample for sample in self._samples if sample.id == sample_id), None
)
sample_name = _clean_sample_name(case_sample.name)
return self.DATA_TYPES[data_type]["parser"](
file_object=file_obj,
name=sample_id,
column_map=self.DATA_TYPES[data_type]["column_map"],
sample_name=sample_name,
sample_name=_clean_sample_name(sample.name),
)
else:
if data_type in self.DATA_TYPES:
parser_func = self.DATA_TYPES[data_type]["parser"]
return parser_func(
file_object=file_obj,
name=sample_id,
column_map=self.DATA_TYPES[data_type]["column_map"],
)
else:
raise ValueError(f"Unknown data type: {data_type}")

return self.DATA_TYPES[data_type]["parser"](
file_object=file_obj,
name=sample_id,
column_map=self.DATA_TYPES[data_type]["column_map"],
sample=sample,
)

def _get_data_uri(self, data: Data, data_type: str) -> str:
"""Get the file path based on data type."""
Expand Down
20 changes: 20 additions & 0 deletions src/resdk/tables/qc_mappings.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,26 @@
"agg_func": "sum",
"scaling_factor": {"fastqc_trimmed-total_sequences": 1e6},
},
{
"name": [
"FastQC (raw)_mqc-generalstats-fastqc_raw-total_sequences",
"fastqc_raw-total_sequences",
],
"slug": "estimated_fragment_count_raw",
"type": "Int64",
"agg_func": "mean",
"scaling_factor": {"fastqc_raw-total_sequences": 1e6},
},
{
"name": [
"FastQC (trimmed)_mqc-generalstats-fastqc_trimmed-total_sequences",
"fastqc_trimmed-total_sequences",
],
"slug": "estimated_fragment_count_trimmed",
"type": "Int64",
"agg_func": "mean",
"scaling_factor": {"fastqc_trimmed-total_sequences": 1e6},
},
{
"name": [
"FastQC (raw)_mqc-generalstats-fastqc_raw-percent_gc",
Expand Down
Loading