genialis · marcellevstek · May 18, 2026 · gregorjerse · Jun 19, 2026 · gregorjerse
diff --git a/src/resdk/tables/qc.py b/src/resdk/tables/qc.py
@@ -64,8 +64,7 @@
 
 def _filter_and_rename_columns(df, column_map):
     """Filter, apply a scaling factor and rename columns based on provided specifications."""
-    selected_columns = []
-    rename_map = {}
+    result = pd.DataFrame(index=df.index)
 
     for col in column_map:
         names = col.get("name", [])
@@ -77,16 +76,12 @@ def _filter_and_rename_columns(df, column_map):
 
         for name in names:
             if name in df.columns:
-                selected_columns.append(name)
-                rename_map[name] = slug
                 # Some column values are given a specific format (e.g. millions)
                 # and need to be scaled on a common scale
-                df[name] = df[name] * scaling_factor.get(name, 1)
+                result[slug] = df[name] * scaling_factor.get(name, 1)
                 break
 
-    df = df[selected_columns].rename(columns=rename_map)
-
-    return df
+    return result
 
 
 def _aggregate_df(df, column_map, prefix=None):
@@ -119,7 +114,22 @@ def _clean_sample_name(
     return sfx_pattern.sub("", filename)
 
 
-def general_multiqc_parser(file_object, name, column_map):
+def _is_paired_end(sample) -> Optional[bool]:
+    reads = list(sample.data.filter(type="data:reads:fastq:"))
+    origin_read_obj = reads[-1] if reads else None
+    if origin_read_obj is None:
+        return None
+    if origin_read_obj._process._type.endswith("paired:"):
+        return True
+    elif origin_read_obj._process._type.endswith("single:"):
+        return False
+    else:
+        raise ValueError(
+            f"Cannot determine the library strategy of sample: {sample.name}"
+        )
+
+
+def general_multiqc_parser(file_object, name, column_map, sample):
     """General parser for MultiQC files.
 
     If the column_map argument is not specified the function will retain the original column names
@@ -138,6 +148,34 @@ def general_multiqc_parser(file_object, name, column_map):
     if df.empty:
         return pd.Series(name=name)
 
+    # Sum the read count per lane separately for each read mate
+    # and insert the resulting value into the first column for the corresponding mate.
+    # This is is done in order to report the actual read pairs for samples
+    # sequenced in paired-end mode, while preserving the sum across lanes.
+    fragment_metrics = [
+        "estimated_fragment_count_raw",
+        "estimated_fragment_count_trimmed",
+    ]
+    fragment_columns = df.columns.intersection(fragment_metrics)
+
+    if _is_paired_end(sample):
+        for col in fragment_columns:
+            for s in ["_1", "_2"]:
+                mate_mask = df.index.str.endswith(s) & df[col].notna()
+                if not mate_mask.any():
+                    continue
+                total = df.loc[mate_mask, col].sum(min_count=1)
+                first_row = df.index[mate_mask][0]
+                df.loc[mate_mask, col] = pd.NA
+                df.loc[first_row, col] = total
+    else:
+        for col in fragment_columns:
+            first_row = df[col].first_valid_index()
+            total = df[col].sum(min_count=1)
+            df[col] = pd.NA
+            if first_row is not None:
+                df.loc[first_row, col] = total
+
     series = _aggregate_df(df=df, column_map=column_map)
 
     series.name = name
@@ -453,27 +491,25 @@ def __init__(
 
     def _parse_file(self, file_obj, sample_id, data_type):
         """Parse file object and return one DataFrame line."""
+        if data_type not in self.DATA_TYPES:
+            raise ValueError(f"Unknown data type: {data_type}")
+
+        sample = next((s for s in self._samples if s.id == sample_id), None)
+
         if data_type == self.MACS_PREPEAK:
-            case_sample = next(
-                (sample for sample in self._samples if sample.id == sample_id), None
-            )
-            sample_name = _clean_sample_name(case_sample.name)
             return self.DATA_TYPES[data_type]["parser"](
                 file_object=file_obj,
                 name=sample_id,
                 column_map=self.DATA_TYPES[data_type]["column_map"],
-                sample_name=sample_name,
+                sample_name=_clean_sample_name(sample.name),
             )
-        else:
-            if data_type in self.DATA_TYPES:
-                parser_func = self.DATA_TYPES[data_type]["parser"]
-                return parser_func(
-                    file_object=file_obj,
-                    name=sample_id,
-                    column_map=self.DATA_TYPES[data_type]["column_map"],
-                )
-            else:
-                raise ValueError(f"Unknown data type: {data_type}")
+
+        return self.DATA_TYPES[data_type]["parser"](
+            file_object=file_obj,
+            name=sample_id,
+            column_map=self.DATA_TYPES[data_type]["column_map"],
+            sample=sample,
+        )
 
     def _get_data_uri(self, data: Data, data_type: str) -> str:
         """Get the file path based on data type."""

diff --git a/src/resdk/tables/qc_mappings.py b/src/resdk/tables/qc_mappings.py
@@ -52,6 +52,26 @@
         "agg_func": "sum",
         "scaling_factor": {"fastqc_trimmed-total_sequences": 1e6},
     },
+    {
+        "name": [
+            "FastQC (raw)_mqc-generalstats-fastqc_raw-total_sequences",
+            "fastqc_raw-total_sequences",
+        ],
+        "slug": "estimated_fragment_count_raw",
+        "type": "Int64",
+        "agg_func": "mean",
+        "scaling_factor": {"fastqc_raw-total_sequences": 1e6},
+    },
+    {
+        "name": [
+            "FastQC (trimmed)_mqc-generalstats-fastqc_trimmed-total_sequences",
+            "fastqc_trimmed-total_sequences",
+        ],
+        "slug": "estimated_fragment_count_trimmed",
+        "type": "Int64",
+        "agg_func": "mean",
+        "scaling_factor": {"fastqc_trimmed-total_sequences": 1e6},
+    },
     {
         "name": [
             "FastQC (raw)_mqc-generalstats-fastqc_raw-percent_gc",