diff --git a/src/pyprideap/cli.py b/src/pyprideap/cli.py index ce98584..6ce0ceb 100644 --- a/src/pyprideap/cli.py +++ b/src/pyprideap/cli.py @@ -92,6 +92,7 @@ def _generate_report( split: bool = False, sdrf_path: Path | None = None, no_border: bool = True, + two_sides: bool = True, ) -> Path: """Read a data file and generate a QC report.""" import pyprideap as pp @@ -114,7 +115,7 @@ def _generate_report( click.echo("Generating individual plot files...") logger.debug("Output directory: %s", output_path) - result = qc_report_split(ds, output_path, no_border=no_border) + result = qc_report_split(ds, output_path, no_border=no_border, two_sides=two_sides) n_files = len(list(result.glob("*.html"))) click.echo(f" {n_files} HTML files saved to {result}/") return result @@ -161,6 +162,11 @@ def main() -> None: @click.option( "--no-border/--border", default=True, help="Remove card borders from split plot files (default: no border)." ) +@click.option( + "--two-sides/--no-two-sides", + default=True, + help="Generate combined plot files for side-by-side panel layouts (default: enabled).", +) @click.option("-v", "--verbose", is_flag=True, default=False, help="Enable verbose logging output.") def report( input_file: str | None, @@ -170,6 +176,7 @@ def report( split: bool, sdrf: str | None, no_border: bool, + two_sides: bool, verbose: bool, ) -> None: """Generate a QC report from a data file or PAD accession.""" @@ -203,7 +210,15 @@ def report( out = Path(f"{output_path}/{stem}") else: out = Path(f"{output_path}/{stem}.html") - _generate_report(f, out, platform=platform, split=split, sdrf_path=sdrf_path, no_border=no_border) + _generate_report( + f, + out, + platform=platform, + split=split, + sdrf_path=sdrf_path, + no_border=no_border, + two_sides=two_sides, + ) except Exception as e: logger.debug("Error processing %s: %s", f.name, e, exc_info=True) click.echo(f" Skipping {f.name}: {e}", err=True) @@ -219,6 +234,7 @@ def report( split=split, sdrf_path=sdrf_path, 
no_border=no_border, + two_sides=two_sides, ) diff --git a/src/pyprideap/viz/qc/compute.py b/src/pyprideap/viz/qc/compute.py index 4d4a793..9d3878b 100644 --- a/src/pyprideap/viz/qc/compute.py +++ b/src/pyprideap/viz/qc/compute.py @@ -88,7 +88,7 @@ class DataCompletenessData: below_lod_rate: list[float] # per-sample fraction below LOD (0-1) protein_ids: list[str] = field(default_factory=list) # per-protein identifiers missing_freq: list[float] = field(default_factory=list) # per-protein fraction below LOD (0-1) - title: str = "Data Completeness" + title: str = "Sample Completeness" @dataclass diff --git a/src/pyprideap/viz/qc/render.py b/src/pyprideap/viz/qc/render.py index b8e1db5..9ff498b 100644 --- a/src/pyprideap/viz/qc/render.py +++ b/src/pyprideap/viz/qc/render.py @@ -580,7 +580,7 @@ def render_data_completeness(data: DataCompletenessData) -> Figure: fig = make_subplots( rows=2, cols=1, - subplot_titles=["Per-Sample Data Completeness", "Missing Frequency Distribution"], + subplot_titles=["Sample Completeness", "Missing Frequency"], vertical_spacing=0.35, ) @@ -670,6 +670,94 @@ def render_data_completeness(data: DataCompletenessData) -> Figure: return fig +def render_sample_completeness(data: DataCompletenessData) -> Figure: + """Per-sample stacked bar showing above/below LOD as standalone plot.""" + go, _ = _import_plotly() + + _MAX_LABEL = 20 + short_ids = [s if len(s) <= _MAX_LABEL else s[:_MAX_LABEL] + "\u2026" for s in data.sample_ids] + + fig = go.Figure() + fig.add_trace( + go.Bar( + x=short_ids, + y=[r * 100 for r in data.above_lod_rate], + name="Above LOD", + marker_color="#2ecc71", + customdata=data.sample_ids, + hovertemplate="%{customdata}
Above LOD: %{y:.1f}%", + ), + ) + fig.add_trace( + go.Bar( + x=short_ids, + y=[r * 100 for r in data.below_lod_rate], + name="Below LOD", + marker_color="#f39c12", + customdata=data.sample_ids, + hovertemplate="%{customdata}
Below LOD: %{y:.1f}%", + ), + ) + fig.update_xaxes(title_text="", tickangle=-45) + fig.update_yaxes(title_text="% of Proteins", range=[0, 100], ticksuffix="%") + fig.update_layout( + title="Sample Completeness", + barmode="stack", + height=500, + legend=dict(orientation="h", yanchor="top", y=-0.18), + margin=dict(b=100), + ) + return fig + + +def render_missing_frequency(data: DataCompletenessData) -> Figure: + """Per-protein missing frequency histogram as standalone plot.""" + go, _ = _import_plotly() + + fig = go.Figure() + if data.missing_freq: + missing_pct = [f * 100 for f in data.missing_freq] + fig.add_trace( + go.Histogram( + x=missing_pct, + nbinsx=25, + marker_color="#e74c3c", + showlegend=False, + ), + ) + n_below_30 = sum(1 for f in data.missing_freq if f < 0.30) + pct_below_30 = n_below_30 / len(data.missing_freq) * 100 if data.missing_freq else 0 + fig.add_annotation( + text=f"{pct_below_30:.1f}% of proteins below 30% missing", + xref="paper", + yref="paper", + x=0.98, + y=0.98, + xanchor="right", + yanchor="top", + showarrow=False, + font=dict(size=12), + bgcolor="rgba(255,255,255,0.8)", + ) + fig.add_vline( + x=30, + line_dash="dash", + line_color="#e67e22", + line_width=2, + annotation_text="30% threshold", + annotation_position="bottom right", + annotation_font_color="#e67e22", + ) + fig.update_xaxes(title_text="Missing Frequency (% Samples Below LOD)", range=[0, 100]) + fig.update_yaxes(title_text="Number of Proteins") + fig.update_layout( + title="Missing Frequency Distribution", + height=500, + margin=dict(b=100), + ) + return fig + + def render_cv_distribution(data: CvDistributionData) -> Figure: go, _ = _import_plotly() fig = go.Figure(data=[go.Histogram(x=data.cv_values, nbinsx=50)]) diff --git a/src/pyprideap/viz/qc/report.py b/src/pyprideap/viz/qc/report.py index 7d0565d..cd5d209 100644 --- a/src/pyprideap/viz/qc/report.py +++ b/src/pyprideap/viz/qc/report.py @@ -78,6 +78,17 @@ "of 30%%: proteins above this threshold may be unreliable and 
should be considered for filtering. " "Proteins clustered near 0%% are reliably detected; those near 100%% may need filtering." ), + "sample_completeness": ( + "Per-sample stacked bar showing Above LOD (green, reliable signal) vs Below LOD " + "(orange, measured but below detection limit). Control samples are excluded. " + "A sample with a large orange fraction may indicate low protein input or technical issues." + ), + "missing_frequency_distribution": ( + "Histogram of per-protein missing rate (%% of samples where signal is below LOD). " + "Olink recommends a missing frequency threshold of 30%%: proteins above this threshold " + "may be unreliable and should be considered for filtering. " + "Proteins clustered near 0%% are reliably detected; those near 100%% may need filtering." + ), "dimreduction": ( "Dimensionality reduction projects the high-dimensional protein expression data onto two axes. " "Use the toggle switch to switch between PCA and t-SNE. " @@ -204,7 +215,8 @@ _SECTION_ORDER = [ ("Quality Overview", ["lod_comparison", "qc_summary"]), ("Signal & Distribution", ["distribution", "lod_analysis"]), - ("Data Completeness", ["data_completeness"]), + ("Sample Completeness", ["sample_completeness"]), + ("Missing Frequency Distribution", ["missing_frequency_distribution"]), ("Sample Relationships", ["dimreduction", "correlation", "heatmap"]), ("Normalization QC", ["norm_scale"]), ("Variability", ["cv_distribution", "plate_cv"]), @@ -883,22 +895,6 @@ def _render_summary_table( rows.append(_summary_row("", "Features (assays)", str(n_features))) - # QC columns available in the dataset - _known_qc_cols = [ - "SampleQC", - "QC_Warning", - "AssayQC", - "SampleType", - "RowCheck", - "ColCheck", - "HybControlNormScale", - ] - qc_cols_found = [c for c in _known_qc_cols if c in samples.columns or c in features.columns] - if qc_cols_found: - rows.append(_summary_row("", "QC columns", ", ".join(qc_cols_found))) - else: - rows.append(_summary_row(_status_dot("amber"), "QC 
columns", "None detected — QC metrics may be limited")) - # --- LOD (right after General) --- lod_active = lod_info.get("active") lod_analysis = plot_data.get("lod_analysis") @@ -906,8 +902,6 @@ def _render_summary_table( has_any_lod = lod_active is not None or lod_analysis is not None or data_completeness is not None if has_any_lod or lod_info.get("sources"): rows.append(_summary_group("Limit of Detection")) - if lod_active is not None: - rows.append(_summary_row("", "LOD source", str(lod_active))) # Inline LOD sources overview status_icons = {"available": "✅", "unavailable": "❌", "insufficient": "⚠"} @@ -960,42 +954,43 @@ def _render_summary_table( median_per_sample = float(proteins_per_sample.median()) rows.append(_summary_row("", "Proteins per sample (median)", f"{median_per_sample:.0f}")) - # --- Missing Data --- - rows.append(_summary_group("Missing Data")) - total_cells = numeric.size - if total_cells > 0: - detection_rate = float(numeric.notna().sum().sum() / total_cells) - else: - detection_rate = 0.0 - if detection_rate > 0.90: - dr_dot = _status_dot("green") - elif detection_rate >= 0.70: - dr_dot = _status_dot("amber") - else: - dr_dot = _status_dot("red") - rows.append(_summary_row(dr_dot, "Detection rate", f"{detection_rate:.1%}")) - - # Samples with >20% missing - row_miss = numeric.isna().mean(axis=1) - n_high_miss_samples = int((row_miss > 0.2).sum()) - if n_high_miss_samples == 0: - hms_dot = _status_dot("green") - elif n_samples > 0 and n_high_miss_samples / n_samples <= 0.10: - hms_dot = _status_dot("amber") - else: - hms_dot = _status_dot("red") - rows.append(_summary_row(hms_dot, "Samples with >20% missing", str(n_high_miss_samples))) - - # Proteins with >50% missing - col_miss = numeric.isna().mean(axis=0) - n_high_miss_prot = int((col_miss > 0.5).sum()) - if n_high_miss_prot == 0: - hmp_dot = _status_dot("green") - elif n_features > 0 and n_high_miss_prot / n_features <= 0.05: - hmp_dot = _status_dot("amber") - else: - hmp_dot = 
_status_dot("red") - rows.append(_summary_row(hmp_dot, "Proteins with >50% missing", str(n_high_miss_prot))) + # --- Missing Data (only when no LOD info, since LOD-based completeness is more informative) --- + if not has_any_lod: + rows.append(_summary_group("Missing Data")) + total_cells = numeric.size + if total_cells > 0: + detection_rate = float(numeric.notna().sum().sum() / total_cells) + else: + detection_rate = 0.0 + if detection_rate > 0.90: + dr_dot = _status_dot("green") + elif detection_rate >= 0.70: + dr_dot = _status_dot("amber") + else: + dr_dot = _status_dot("red") + rows.append(_summary_row(dr_dot, "Detection rate", f"{detection_rate:.1%}")) + + # Samples with >20% missing + row_miss = numeric.isna().mean(axis=1) + n_high_miss_samples = int((row_miss > 0.2).sum()) + if n_high_miss_samples == 0: + hms_dot = _status_dot("green") + elif n_samples > 0 and n_high_miss_samples / n_samples <= 0.10: + hms_dot = _status_dot("amber") + else: + hms_dot = _status_dot("red") + rows.append(_summary_row(hms_dot, "Samples with >20% missing", str(n_high_miss_samples))) + + # Proteins with >50% missing + col_miss = numeric.isna().mean(axis=0) + n_high_miss_prot = int((col_miss > 0.5).sum()) + if n_high_miss_prot == 0: + hmp_dot = _status_dot("green") + elif n_features > 0 and n_high_miss_prot / n_features <= 0.05: + hmp_dot = _status_dot("amber") + else: + hmp_dot = _status_dot("red") + rows.append(_summary_row(hmp_dot, "Proteins with >50% missing", str(n_high_miss_prot))) # --- Variability --- cv_data = plot_data.get("cv_distribution") @@ -1031,22 +1026,6 @@ def _render_summary_table( pi_dot = _status_dot("red") rows.append(_summary_row(pi_dot, "Median inter-plate CV", f"{med_inter:.1%}")) - # --- Expression --- - rows.append(_summary_group("Expression")) - vals = numeric.values.flatten() - vals_clean = vals[~np.isnan(vals)] if len(vals) > 0 else vals - if len(vals_clean) > 0: - med_expr = float(np.median(vals_clean)) - dyn_range = float(np.max(vals_clean) - 
np.min(vals_clean)) - sd_expr = float(np.std(vals_clean)) - rows.append(_summary_row("", "Median expression", f"{med_expr:.2f}")) - rows.append(_summary_row("", "Dynamic range", f"{dyn_range:.2f}")) - rows.append(_summary_row("", "SD of expression", f"{sd_expr:.2f}")) - else: - rows.append(_summary_row("", "Median expression", "N/A")) - rows.append(_summary_row("", "Dynamic range", "N/A")) - rows.append(_summary_row("", "SD of expression", "N/A")) - # --- QC Status (Olink only) --- if "SampleQC" in samples.columns: rows.append(_summary_group("QC Status")) @@ -1090,40 +1069,24 @@ def _render_summary_table( from pyprideap.core import Platform as _Platform iqr_median_data = plot_data.get("iqr_median_qc") - uniprot_dup_data = plot_data.get("uniprot_duplicates") - has_olink_qc = isinstance(iqr_median_data, IqrMedianQcData) or isinstance(uniprot_dup_data, UniProtDuplicateData) - if has_olink_qc: + if isinstance(iqr_median_data, IqrMedianQcData): rows.append(_summary_group("Assay QC")) - - if isinstance(iqr_median_data, IqrMedianQcData): - n_outlier = iqr_median_data.n_outlier_samples - n_total = iqr_median_data.n_total_samples - outlier_rate = n_outlier / n_total if n_total > 0 else 0.0 - if n_outlier == 0: - iqr_dot = _status_dot("green") - elif outlier_rate < 0.10: - iqr_dot = _status_dot("amber") - else: - iqr_dot = _status_dot("red") - rows.append( - _summary_row( - iqr_dot, - "IQR/Median outlier samples", - f"{n_outlier} / {n_total}", - ) - ) - - if isinstance(uniprot_dup_data, UniProtDuplicateData): - n_proteins = uniprot_dup_data.n_unique_proteins - n_assays = uniprot_dup_data.n_total_assays - dup_dot = _status_dot("green") # informational - rows.append( - _summary_row( - dup_dot, - "Unique proteins / Assays", - f"{n_proteins} / {n_assays}", - ) + n_outlier = iqr_median_data.n_outlier_samples + n_total = iqr_median_data.n_total_samples + outlier_rate = n_outlier / n_total if n_total > 0 else 0.0 + if n_outlier == 0: + iqr_dot = _status_dot("green") + elif 
outlier_rate < 0.10: + iqr_dot = _status_dot("amber") + else: + iqr_dot = _status_dot("red") + rows.append( + _summary_row( + iqr_dot, + "IQR/Median outlier samples", + f"{n_outlier} / {n_total}", ) + ) # --- SomaScan QC Flags --- if dataset.platform == _Platform.SOMASCAN: @@ -1371,7 +1334,8 @@ def qc_report( "lod_analysis": (LodAnalysisData, R.render_lod_analysis), "heatmap": (HeatmapData, R.render_heatmap), "correlation": (CorrelationData, R.render_correlation), - "data_completeness": (DataCompletenessData, R.render_data_completeness), + "sample_completeness": (DataCompletenessData, R.render_sample_completeness), + "missing_frequency_distribution": (DataCompletenessData, R.render_missing_frequency), "cv_distribution": (CvDistributionData, R.render_cv_distribution), "plate_cv": (PlateCvData, R.render_plate_cv), "norm_scale": (NormScaleData, R.render_norm_scale), @@ -1392,6 +1356,12 @@ def qc_report( rendered: dict[str, tuple[str, ...]] = {} # key -> (title, html[, extra_header]) first_key = None + # Map split renderer keys to their compute_all data keys + _DATA_KEY_MAP = { + "sample_completeness": "data_completeness", + "missing_frequency_distribution": "data_completeness", + } + # Handle combined dimensionality reduction (PCA + t-SNE in one panel with toggle) _pca_raw = plot_data.get("pca") _umap_raw = plot_data.get("umap") # actually t-SNE data @@ -1404,7 +1374,8 @@ def qc_report( if key == "dimreduction": first_key = "dimreduction" break - if key in _RENDERERS and plot_data.get(key) is not None: + dk = _DATA_KEY_MAP.get(key, key) + if key in _RENDERERS and plot_data.get(dk) is not None: first_key = key break @@ -1456,12 +1427,14 @@ def qc_report( # Find first key if not already set if first_key is None: for key in display_order: - if key in _RENDERERS and plot_data.get(key) is not None: + data_key = _DATA_KEY_MAP.get(key, key) + if key in _RENDERERS and plot_data.get(data_key) is not None: first_key = key break for key, (dtype, renderer) in _RENDERERS.items(): 
- data = plot_data.get(key) + data_key = _DATA_KEY_MAP.get(key, key) + data = plot_data.get(data_key) if data is None or not isinstance(data, dtype): continue fig = renderer(data) # type: ignore[operator] @@ -1476,7 +1449,11 @@ def qc_report( plot_height = f"{fig.layout.height}px" js = "cdn" if key == first_key else False plot_html = fig.to_html(full_html=False, include_plotlyjs=js, default_height=plot_height) - rendered[key] = (data.title, plot_html) # type: ignore[attr-defined] + if key in _DATA_KEY_MAP: + title = key.replace("_", " ").title() + else: + title = getattr(data, "title", key.replace("_", " ").title()) + rendered[key] = (title, plot_html) # type: ignore[attr-defined] logger.debug("qc_report: rendered %s plot", key) # SDRF-driven differential expression volcano plots @@ -1682,6 +1659,7 @@ def qc_report_split( output_dir: str | Path, no_border: bool = False, strip_plot_title: bool = True, + two_sides: bool = True, ) -> Path: """Generate individual QC plot HTML files in a directory. @@ -1697,6 +1675,13 @@ def qc_report_split( Directory to write individual HTML files into. Created if it doesn't exist. no_border : bool If True, remove card borders and shadows from standalone plot files. + strip_plot_title : bool + Remove the Plotly figure title from each plot (default ``True``). + two_sides : bool + When ``True``, also generate combined HTML files that stack + logically related plots so that each combined file is + height-matched for side-by-side panel layouts + (e.g. Dataset Summary on the left, QC + LOD overview on the right). 
Returns ------- @@ -1725,7 +1710,8 @@ def qc_report_split( "lod_analysis": (LodAnalysisData, R.render_lod_analysis), "heatmap": (HeatmapData, R.render_heatmap), "correlation": (CorrelationData, R.render_correlation), - "data_completeness": (DataCompletenessData, R.render_data_completeness), + "sample_completeness": (DataCompletenessData, R.render_sample_completeness), + "missing_frequency_distribution": (DataCompletenessData, R.render_missing_frequency), "cv_distribution": (CvDistributionData, R.render_cv_distribution), "plate_cv": (PlateCvData, R.render_plate_cv), "norm_scale": (NormScaleData, R.render_norm_scale), @@ -1737,10 +1723,29 @@ def qc_report_split( } written: list[str] = [] + # Collect body fragments keyed by plot name for two_sides combinations + bodies: dict[str, str] = {} + + # Keys that will be merged into combined files (skip individual output) + _TWO_SIDES_COMBOS = [ + ("qc_lod_overview", "QC & LOD Overview", ["qc_summary", "lod_analysis"]), + ("iqr_platecv", "Outlier & Plate CV", ["iqr_median_qc", "plate_cv"]), + ] + combined_keys: set[str] = set() + if two_sides: + for _, _, keys in _TWO_SIDES_COMBOS: + combined_keys.update(keys) + + # Map split renderer keys to their compute_all data keys + _DATA_KEY_MAP = { + "sample_completeness": "data_completeness", + "missing_frequency_distribution": "data_completeness", + } # Render each plot as a standalone HTML file for key, (dtype, renderer) in _RENDERERS.items(): - data = plot_data.get(key) + data_key = _DATA_KEY_MAP.get(key, key) + data = plot_data.get(data_key) if data is None or not isinstance(data, dtype): continue fig = renderer(data) # type: ignore[operator] @@ -1751,7 +1756,10 @@ def qc_report_split( fig.update_layout(height=500) plot_height = f"{fig.layout.height}px" plot_html = fig.to_html(full_html=False, include_plotlyjs=False, default_height=plot_height) - title = getattr(data, "title", key.replace("_", " ").title()) + if key in _DATA_KEY_MAP: + title = key.replace("_", " ").title() + 
else: + title = getattr(data, "title", key.replace("_", " ").title()) help_html = _HELP_TEXT.get(key, "") help_toggle = '' help_block = f'
{help_html}
' if help_html else "" @@ -1759,6 +1767,11 @@ def qc_report_split( f'

{title}

' f"{help_toggle}
{help_block}{plot_html}
" ) + bodies[key] = body + # Skip individual file when this key will be in a combined file + if key in combined_keys: + logger.debug("qc_report_split: %s deferred to combined file", key) + continue page = _wrap_standalone_html(f"{title} — {platform_label}", body, no_border=no_border) (output_dir / f"{key}.html").write_text(page, encoding="utf-8") written.append(key) @@ -1815,17 +1828,42 @@ def qc_report_split( f'
{help_text}
' f"{combined_html}" ) - page = _wrap_standalone_html(f"Dimensionality Reduction — {platform_label}", body, no_border=no_border) - (output_dir / "dimreduction.html").write_text(page, encoding="utf-8") - written.append("dimreduction") + bodies["dimreduction"] = body + if "dimreduction" not in combined_keys: + page = _wrap_standalone_html( + f"Dimensionality Reduction — {platform_label}", + body, + no_border=no_border, + ) + (output_dir / "dimreduction.html").write_text(page, encoding="utf-8") + written.append("dimreduction") + else: + logger.debug("qc_report_split: dimreduction deferred to combined file") # Summary table (includes LOD sources inline) summary_html = _render_summary_table(dataset, plot_data, lod_info) + bodies["summary"] = summary_html page = _wrap_standalone_html( f"Dataset Summary — {platform_label}", summary_html, include_plotlyjs=False, no_border=no_border ) (output_dir / "summary.html").write_text(page, encoding="utf-8") written.append("summary") + # --- two_sides: generate combined HTML files for side-by-side layout --- + if two_sides: + for filename, page_title, keys in _TWO_SIDES_COMBOS: + parts = [bodies[k] for k in keys if k in bodies] + if not parts: + continue + combined_body = "\n".join(parts) + page = _wrap_standalone_html( + f"{page_title} — {platform_label}", + combined_body, + no_border=no_border, + ) + (output_dir / f"{filename}.html").write_text(page, encoding="utf-8") + written.append(filename) + logger.debug("qc_report_split: two_sides combined %s", filename) + logger.debug("qc_report_split: written %d files to %s", len(written), output_dir) return output_dir diff --git a/tests/test_qc_pipeline.py b/tests/test_qc_pipeline.py index 3914612..0644ed5 100644 --- a/tests/test_qc_pipeline.py +++ b/tests/test_qc_pipeline.py @@ -339,7 +339,7 @@ def test_contains_summary_table(self, tmp_path): qc_report(ds, output) content = output.read_text() assert "Dataset Summary" in content - assert "Detection rate" in content + assert "Features 
(assays)" in content assert "Median CV" in content # At least one traffic-light status dot should be present assert any(dot in content for dot in ["dot-green", "dot-amber", "dot-red"]) @@ -363,7 +363,7 @@ def test_split_report_creates_individual_files(self, tmp_path): # Summary should contain the table summary = (output_dir / "summary.html").read_text() assert "Dataset Summary" in summary - assert "Detection rate" in summary + assert "Features (assays)" in summary def test_somascan_report(self, tmp_path): ds = _make_somascan_dataset()