diff --git a/src/resdk/tables/qc.py b/src/resdk/tables/qc.py index 32fcccb8..5ebf4269 100644 --- a/src/resdk/tables/qc.py +++ b/src/resdk/tables/qc.py @@ -64,8 +64,7 @@ def _filter_and_rename_columns(df, column_map): """Filter, apply a scaling factor and rename columns based on provided specifications.""" - selected_columns = [] - rename_map = {} + result = pd.DataFrame(index=df.index) for col in column_map: names = col.get("name", []) @@ -77,16 +76,12 @@ def _filter_and_rename_columns(df, column_map): for name in names: if name in df.columns: - selected_columns.append(name) - rename_map[name] = slug # Some column values are given a specific format (e.g. millions) # and need to be scaled on a common scale - df[name] = df[name] * scaling_factor.get(name, 1) + result[slug] = df[name] * scaling_factor.get(name, 1) break - df = df[selected_columns].rename(columns=rename_map) - - return df + return result def _aggregate_df(df, column_map, prefix=None): @@ -119,7 +114,22 @@ def _clean_sample_name( return sfx_pattern.sub("", filename) -def general_multiqc_parser(file_object, name, column_map): +def _is_paired_end(sample) -> Optional[bool]: + reads = list(sample.data.filter(type="data:reads:fastq:")) + origin_read_obj = reads[-1] if reads else None + if origin_read_obj is None: + return None + if origin_read_obj._process._type.endswith("paired:"): + return True + elif origin_read_obj._process._type.endswith("single:"): + return False + else: + raise ValueError( + f"Cannot determine the library strategy of sample: {sample.name}" + ) + + +def general_multiqc_parser(file_object, name, column_map, sample): """General parser for MultiQC files. If the column_map argument is not specified the function will retain the original column names @@ -138,6 +148,34 @@ def general_multiqc_parser(file_object, name, column_map): if df.empty: return pd.Series(name=name) + # Sum the read count per lane separately for each read mate + # and insert the resulting value into the first column for the corresponding mate. + # This is is done in order to report the actual read pairs for samples + # sequenced in paired-end mode, while preserving the sum across lanes. + fragment_metrics = [ + "estimated_fragment_count_raw", + "estimated_fragment_count_trimmed", + ] + fragment_columns = df.columns.intersection(fragment_metrics) + + if _is_paired_end(sample): + for col in fragment_columns: + for s in ["_1", "_2"]: + mate_mask = df.index.str.endswith(s) & df[col].notna() + if not mate_mask.any(): + continue + total = df.loc[mate_mask, col].sum(min_count=1) + first_row = df.index[mate_mask][0] + df.loc[mate_mask, col] = pd.NA + df.loc[first_row, col] = total + else: + for col in fragment_columns: + first_row = df[col].first_valid_index() + total = df[col].sum(min_count=1) + df[col] = pd.NA + if first_row is not None: + df.loc[first_row, col] = total + series = _aggregate_df(df=df, column_map=column_map) series.name = name @@ -453,27 +491,25 @@ def __init__( def _parse_file(self, file_obj, sample_id, data_type): """Parse file object and return one DataFrame line.""" + if data_type not in self.DATA_TYPES: + raise ValueError(f"Unknown data type: {data_type}") + + sample = next((s for s in self._samples if s.id == sample_id), None) + if data_type == self.MACS_PREPEAK: - case_sample = next( - (sample for sample in self._samples if sample.id == sample_id), None - ) - sample_name = _clean_sample_name(case_sample.name) return self.DATA_TYPES[data_type]["parser"]( file_object=file_obj, name=sample_id, column_map=self.DATA_TYPES[data_type]["column_map"], - sample_name=sample_name, + sample_name=_clean_sample_name(sample.name), ) - else: - if data_type in self.DATA_TYPES: - parser_func = self.DATA_TYPES[data_type]["parser"] - return parser_func( - file_object=file_obj, - name=sample_id, - column_map=self.DATA_TYPES[data_type]["column_map"], - ) - else: - raise ValueError(f"Unknown data type: {data_type}") + + return self.DATA_TYPES[data_type]["parser"]( + file_object=file_obj, + name=sample_id, + column_map=self.DATA_TYPES[data_type]["column_map"], + sample=sample, + ) def _get_data_uri(self, data: Data, data_type: str) -> str: """Get the file path based on data type.""" diff --git a/src/resdk/tables/qc_mappings.py b/src/resdk/tables/qc_mappings.py index 982f8d59..9bcdeabb 100644 --- a/src/resdk/tables/qc_mappings.py +++ b/src/resdk/tables/qc_mappings.py @@ -52,6 +52,26 @@ "agg_func": "sum", "scaling_factor": {"fastqc_trimmed-total_sequences": 1e6}, }, + { + "name": [ + "FastQC (raw)_mqc-generalstats-fastqc_raw-total_sequences", + "fastqc_raw-total_sequences", + ], + "slug": "estimated_fragment_count_raw", + "type": "Int64", + "agg_func": "mean", + "scaling_factor": {"fastqc_raw-total_sequences": 1e6}, + }, + { + "name": [ + "FastQC (trimmed)_mqc-generalstats-fastqc_trimmed-total_sequences", + "fastqc_trimmed-total_sequences", + ], + "slug": "estimated_fragment_count_trimmed", + "type": "Int64", + "agg_func": "mean", + "scaling_factor": {"fastqc_trimmed-total_sequences": 1e6}, + }, { "name": [ "FastQC (raw)_mqc-generalstats-fastqc_raw-percent_gc",