diff --git a/README.rst b/README.rst index a35dcb0..43f7f43 100644 --- a/README.rst +++ b/README.rst @@ -38,6 +38,8 @@ Python implementation of common RNA-seq normalization methods: - CUF_ (Counts adjusted with UQ factors) - TMM_ (Trimmed mean of M-values) - CTF_ (Counts adjusted with TMM factors) +- RLE_ (Relative log expression) +- CRF (Counts adjusted with RLE factors) For in-depth description of methods see documentation_. @@ -47,6 +49,7 @@ For in-depth description of methods see documentation_. .. _CUF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/ .. _TMM: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25 .. _CTF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/ +.. _RLE: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106 .. _documentation: https://rnanorm.readthedocs.io/ diff --git a/docs/changelog.rst b/docs/changelog.rst index 8371496..e996271 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -5,6 +5,22 @@ Change Log All notable changes to this project are documented in this file. +================== +2.3.0 - 2025-08-07 +================== + +Added +----- +- Implementation of the following methods: + + - RLE + - CRF + +Changed +------- +- Emit a warning when a sample has an upper-quartile of zero during + UQ normalisation; previously this silently produced Inf + ================== 2.2.0 - 2025-01-29 ================== diff --git a/docs/ref.rst b/docs/ref.rst index 5216570..cd15ad0 100644 --- a/docs/ref.rst +++ b/docs/ref.rst @@ -20,6 +20,8 @@ Normalization methods CUF TMM CTF + RLE + CRF Datasets diff --git a/src/rnanorm/__init__.py b/src/rnanorm/__init__.py index 2a2b98a..509e990 100644 --- a/src/rnanorm/__init__.py +++ b/src/rnanorm/__init__.py @@ -1,6 +1,6 @@ """RNA-seq normalization methods.""" -from .methods.between_sample import CTF, CUF, TMM, UQ +from .methods.between_sample import CRF, CTF, CUF, RLE, TMM, UQ from .methods.utils import LibrarySize from .methods.within_sample import CPM, FPKM, TPM @@ -8,9 +8,11 @@ "CPM", "CTF", "CUF", + "CRF", "FPKM", "LibrarySize", "TMM", + "RLE", "TPM", "UQ", ) diff --git a/src/rnanorm/cli.py b/src/rnanorm/cli.py index 24f7ad8..f4b7dbe 100644 --- a/src/rnanorm/cli.py +++ b/src/rnanorm/cli.py @@ -9,9 +9,9 @@ import click import pandas as pd -from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ +from rnanorm import CPM, CRF, CTF, CUF, FPKM, RLE, TMM, TPM, UQ -method_type = Type[Union[CPM, FPKM, TMM, TPM, UQ]] +method_type = Type[Union[CPM, FPKM, TMM, RLE, TPM, UQ]] file_type = Union[io.TextIOWrapper, Path] @@ -186,6 +186,20 @@ def ctf(exp: pd.DataFrame, out: file_type, force: bool, m_trim: float, a_trim: f CLWrapper(CTF, m_trim=m_trim, a_trim=a_trim).handle(exp, out, force) +@click.command(short_help="Reliative log expression") +@common_params +def rle(exp: pd.DataFrame, out: file_type, force: bool) -> None: + """Compute RLE.""" + CLWrapper(RLE).handle(exp, out, force) + + +@click.command(short_help="Counts adjusted with RLE factors") +@common_params +def crf(exp: pd.DataFrame, out: file_type, force: bool) -> None: + """Compute CRF.""" + CLWrapper(CRF).handle(exp, out, force) + + @click.group() def main() -> None: """Common RNA-seq normalization methods.""" @@ -200,3 +214,5 @@ def main() -> None: main.add_command(cuf) main.add_command(tmm) main.add_command(ctf) +main.add_command(rle) +main.add_command(crf) diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py index e09874a..ab01a5c 100644 --- a/src/rnanorm/methods/between_sample.py +++ b/src/rnanorm/methods/between_sample.py @@ -1,5 +1,6 @@ """Between sample normalizations.""" +import warnings from typing import Any, Optional import numpy as np @@ -70,6 +71,26 @@ class UQ(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): Sample_4 20000.0 30000.0 50000.0 200000.0 200000.0 """ + def _warn_zero_uq(self, factors: Numeric1D, X: Numeric2D) -> None: + """Emit a warning if any sample has a zero upper-quartile.""" + zero_uq_mask = factors == 0 + if not np.any(zero_uq_mask): + return + + zero_uq_samples = np.where(zero_uq_mask)[0] + + # If output is pandas, show sample names instead of numeric indices + config = _get_output_config("transform", self) + if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame): + zero_uq_samples = X.index[zero_uq_samples] + + warnings.warn( + "Upper-quartile is 0 for the following sample(s): " + f"{zero_uq_samples.tolist()}. This will set their " + "UQ factors to 0 and break geometric-mean centering " + "returning Inf values." + ) + def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: """Get UQ normalization factors (un-normalized with geometric mean). @@ -89,6 +110,7 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: arr=X, per=75, ) + return upper_quartiles / lib_size def get_norm_factors(self, X: Numeric2D) -> Numeric1D: @@ -117,9 +139,10 @@ def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> :return: Self """ self._reset() - X = validate_data(self, X, ensure_all_finite=True, reset=True) + Xv = validate_data(self, X, ensure_all_finite=True, reset=True) - factors = self._get_norm_factors(X) + factors = self._get_norm_factors(Xv) + self._warn_zero_uq(factors, X) self.geometric_mean_ = gmean(factors) return self @@ -322,9 +345,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: return factors def get_norm_factors(self, X: Numeric2D) -> Numeric1D: - """Get UQ normalization factors (normalized with geometric mean). + """Get TMM normalization factors (normalized with geometric mean). - :param X: Expression raw count matrix (n_samples, n_features)s + :param X: Expression raw count matrix (n_samples, n_features) """ check_is_fitted(self) X2 = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float) @@ -439,3 +462,195 @@ def transform(self, X: Numeric2D) -> Numeric2D: X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float) return X / factors[:, np.newaxis] + + +class RLE(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): + """Relative Log Expression (RLE) normalization. + + In RNA-seq experiments, a small subset of genes may be highly overexpressed + in certain samples but not in others. This can artificially inflate library + size and therefore (after library size normalization) cause the remaining + genes to be considered under-sampled in those samples. Unless this effect + is adjusted for, those genes may falsely appear to be down-regulated in + that sample. To address this issue Anders & Huber developed an alternative + normalization method to TMM for the DESeq package, known as the + median-of-ratios method. EdgeR also implements this method under the name + Relative Log Expression (RLE). + Note that by edgeR convention, the Anders–Huber “size factors” are first + divided by each sample’s total library size and then rescaled so that their + geometric mean equals one. This yields normalization factors which, when + multiplied by the library size, produce effective library sizes. + Although the Anders–Huber size factors are linearly proportional to these + effective library sizes, the two packages differ in output scaling: DESeq2 + returns normalized counts on the scale of the original library sizes, + whereas edgeR reports counts per million (CPM). + + Procedure for normalization is described in `Anders & Huber, 2010 + `_, but in short: + + - Use raw counts + - Compute the reference pseudo-sample (``self.ref_``) as the geometric + mean of all samples for each gene. + - Compute scaling factors: + - For each sample, compute gene-wise ratios relative to the + reference pseudo-sample. + - Size factor = median of these ratios. + - Rescale factors so that their geometric mean is 1 + - "Adjusted library size" = library size * normalization factors + - Compute CPM normalization with “Adjusted library size” + + .. rubric:: Examples + + >>> from rnanorm.datasets import load_toy_data + >>> from rnanorm import RLE + >>> X = load_toy_data().exp + >>> X + Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 + Sample_1 200 300 500 2000 7000 + Sample_2 400 600 1000 4000 14000 + Sample_3 200 300 500 2000 17000 + Sample_4 200 300 500 2000 2000 + >>> RLE().set_output(transform="pandas").fit_transform(X) + Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 + Sample_1 20000.0 30000.0 50000.0 200000.0 700000.0 + Sample_2 20000.0 30000.0 50000.0 200000.0 700000.0 + Sample_3 20000.0 30000.0 50000.0 200000.0 1700000.0 + Sample_4 20000.0 30000.0 50000.0 200000.0 200000.0 + """ + + def _get_norm_factors(self, X: Numeric2D) -> Numeric1D: + """Get RLE normalization factors (un-normalized with geometric mean). + + :param X: Expression raw count matrix (n_samples, n_features) + """ + # Make sure that global set_config(transform_output="pandas") + # does not affect this method - we need numpy output here. + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) + + # Copy the stored reference pseudo-sample so we don’t modify self.ref_ + # and mask zeros to NaN to avoid dividing by zero + ref = self.ref_.copy() + ref[ref == 0] = np.nan + + # Compute gene-wise ratios to the reference for genes with nonzero + # counts in X. + # This handles cases where the fitted reference and current X may + # differ in which genes are zero. + ratio = np.where(X > 0, X / ref, np.nan) + + median_of_ratios = np.nanmedian(ratio, axis=1).astype(float) + + return median_of_ratios / lib_size # normalize by library size + + def _reset(self) -> None: + """Reset internal data-dependent state.""" + if hasattr(self, "geometric_mean_"): + del self.geometric_mean_ + del self.ref_ + + def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self: + """Fit. + + Learn gene‑wise geometric means from the training set. + Genes that contain any zero propagate NaN and are excluded + from downstream size‑factor calculations. + + :param X: Expression raw count matrix (n_samples, n_features) + :return: Self + """ + + self._reset() + X = validate_data(self, X, ensure_all_finite=True, reset=True, dtype=float) + + # compute a reference pseudo-sample by performing a geometric mean + # over all samples for each gene + self.ref_ = gmean(X, axis=0) + + factors = self._get_norm_factors(X) + self.geometric_mean_ = gmean(factors) # this is the geometric mean of size factors + return self + + def get_norm_factors(self, X: Numeric2D) -> Numeric1D: + """Get RLE normalization factors (normalized with geometric mean). + + :param X: Expression raw count matrix (n_samples, n_features) + """ + check_is_fitted(self) + factors = self._get_norm_factors(X) + factors = factors / self.geometric_mean_ + + config = _get_output_config("transform", self) + if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame): + return pd.Series(factors, index=X.index) + return factors + + def transform(self, X: Numeric2D) -> Numeric2D: + """Transform. + + :param X: Expression raw count matrix (n_samples, n_features) + :return: Normalized expression matrix (n_samples, n_features) + """ + # Compute effective library sizes + factors = self.get_norm_factors(X) + if isinstance(factors, pd.Series): + factors = factors.to_numpy() + + lib_size = LibrarySize().set_output(transform="default").fit_transform(X) + effective_lib_size = lib_size * factors + + # Method ``check_is_fitted`` is not called here, since it is + # called in self.get_norm_factors + X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float) + + # Make CPM, but with effective library size + return X / effective_lib_size[:, np.newaxis] * 1e6 + + +class CRF(RLE): + """Counts adjusted with RLE factors normalization. + + This method extends the CUF/CTF procedures to RLE factors, + ensuring consistency across RNAnorm methods. + Although not covered in Johnson & Krishnan (2022) + `_, + it is included for package-wide consistency. In short: + + - Compute normalization factors same as in RLE + - Divide raw counts with these factors + + .. rubric:: Examples + + >>> from rnanorm.datasets import load_toy_data + >>> from rnanorm import CRF + >>> X = load_toy_data().exp + >>> X + Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 + Sample_1 200 300 500 2000 7000 + Sample_2 400 600 1000 4000 14000 + Sample_3 200 300 500 2000 17000 + Sample_4 200 300 500 2000 2000 + >>> CRF().set_output(transform="pandas").fit_transform(X) + Gene_1 Gene_2 Gene_3 Gene_4 Gene_5 + Sample_1 200.0 300.0 500.0 2000.0 7000.0 + Sample_2 400.0 600.0 1000.0 4000.0 14000.0 + Sample_3 400.0 600.0 1000.0 4000.0 34000.0 + Sample_4 100.0 150.0 250.0 1000.0 1000.0 + + """ + + def transform(self, X: Numeric2D) -> Numeric2D: + """Transform. + + :param X: Expression raw count matrix (n_samples, n_features) + :return: Normalized expression matrix (n_samples, n_features) + """ + # Just divide raw counts with normalization factors + factors = self.get_norm_factors(X) + if isinstance(factors, pd.Series): + factors = factors.to_numpy() + + # Method ``check_is_fitted`` is not called here, since it is + # called in self.get_norm_factors + X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float) + + return X / factors[:, np.newaxis] diff --git a/tests/files/gtex.lung.30.chr21.rle_factors_edgeR.tsv b/tests/files/gtex.lung.30.chr21.rle_factors_edgeR.tsv new file mode 100644 index 0000000..f06e82a --- /dev/null +++ b/tests/files/gtex.lung.30.chr21.rle_factors_edgeR.tsv @@ -0,0 +1,48 @@ +# This file was obtained with the following code +# +# 1. Prepare file +# from rnanorm.datasets import load_gtex +# ds = load_gtex(as_frame=True) +# ds.exp.T.to_csv("gtex.lung.30.chr21.tsv", sep="\t") +# +# 2. Run edgeR docker +# docker run -i -t -v /Users/user/code/RNAnorm:/data/pegi3s/r_edger --platform linux/amd64 pegi3s/r_edger:3.36.0 /bin/bash +# +# 3. Start R, Run edgeR on data and save RLE factors +# R +# library("edgeR") +# countData = read.delim("gtex.lung.30.chr21.tsv", sep="\t", header=TRUE, check.names=FALSE, row.names=1) +# y <- DGEList(counts=countData) +# y = calcNormFactors(y, method="RLE") +# write.table(y$samples, file='gtex.lung.30.chr21.rle_factors_edgeR.tsv', sep='\t', quote=F, col.names=NA) + group lib.size norm.factors +GTEX-111CU-0326-SM-5GZXO 1 566151 0.962192453671794 +GTEX-111FC-1126-SM-5GZWU 1 428119 1.12409791748014 +GTEX-111VG-0726-SM-5GIDC 1 826255 0.834915667250542 +GTEX-111YS-0626-SM-5GZXV 1 531025 1.01922751980661 +GTEX-1122O-0126-SM-5GICA 1 480370 1.19263550320138 +GTEX-1128S-0726-SM-5N9D6 1 480527 1.03162235607412 +GTEX-117YW-0526-SM-5H11C 1 575937 0.860930245430436 +GTEX-117YX-1326-SM-5H125 1 526901 1.10833415257362 +GTEX-11DXX-0626-SM-5Q5AG 1 494979 1.04525845724813 +GTEX-11DXZ-0726-SM-5N9C4 1 492329 1.18026405107029 +GTEX-11DZ1-0426-SM-5H11A 1 574263 0.842255242954767 +GTEX-11EI6-0826-SM-5985V 1 418580 0.97953310151101 +GTEX-11EMC-0126-SM-5EGKV 1 482736 0.954910672173466 +GTEX-11EQ9-0226-SM-5A5JX 1 489476 1.12484504548183 +GTEX-11GSP-0726-SM-5986L 1 514957 0.848212426315 +GTEX-11I78-0126-SM-5HL6F 1 459716 1.17854515592899 +GTEX-11LCK-0426-SM-5A5M8 1 562545 0.985425359730958 +GTEX-11NSD-0326-SM-5A5LS 1 436715 1.27166891245838 +GTEX-11NUK-0826-SM-5HL4U 1 520950 1.13641010752219 +GTEX-11NV4-1126-SM-5HL6J 1 456046 0.769477972566189 +GTEX-11O72-1326-SM-5BC5A 1 551438 0.85801920626528 +GTEX-11OF3-1126-SM-5986C 1 460745 0.735036890319363 +GTEX-11P7K-0326-SM-59871 1 444387 1.27309094677382 +GTEX-11P81-0226-SM-5HL5M 1 490514 1.19108334036948 +GTEX-11PRG-0926-SM-5EGI8 1 529154 0.967054489076335 +GTEX-11TT1-1626-SM-5EQL7 1 545627 1.03185431932918 +GTEX-11TUW-0526-SM-5LU9A 1 464700 0.916329772320974 +GTEX-11UD2-0726-SM-5EQ69 1 394140 1.08713748837276 +GTEX-11WQC-0626-SM-5EQMF 1 463745 0.94182307098083 +GTEX-11WQK-1226-SM-5GU5Z 1 687312 0.863116191715343 diff --git a/tests/test_cli.py b/tests/test_cli.py index 344a6de..a8264af 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,7 +5,7 @@ import pandas as pd import pytest -from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ +from rnanorm import CPM, CRF, CTF, CUF, FPKM, RLE, TMM, TPM, UQ from rnanorm.annotation import GTF @@ -190,3 +190,27 @@ def test_ctf(exp, exp_path, out_path): pd.read_csv(out_path, index_col=0), CTF().set_output(transform="pandas").fit_transform(exp), ) + + +def test_rle(exp, exp_path, out_path): + assert not out_path.is_file() + + cp = subprocess.run(["rnanorm", "rle", str(exp_path), "--out", str(out_path)]) + assert out_path.is_file() + assert cp.returncode == 0 + pd.testing.assert_frame_equal( + pd.read_csv(out_path, index_col=0), + RLE().set_output(transform="pandas").fit_transform(exp), + ) + + +def test_crf(exp, exp_path, out_path): + assert not out_path.is_file() + + cp = subprocess.run(["rnanorm", "crf", str(exp_path), "--out", str(out_path)]) + assert out_path.is_file() + assert cp.returncode == 0 + pd.testing.assert_frame_equal( + pd.read_csv(out_path, index_col=0), + CRF().set_output(transform="pandas").fit_transform(exp), + ) diff --git a/tests/test_crf.py b/tests/test_crf.py new file mode 100644 index 0000000..4e5d045 --- /dev/null +++ b/tests/test_crf.py @@ -0,0 +1,46 @@ +import numpy as np +import pandas as pd +import pytest + +from rnanorm import CRF + + +@pytest.fixture +def expected_crf(exp): + return pd.DataFrame( + [ + [200, 300, 500, 2000, 7000], + [400, 600, 1000, 4000, 14000], + [400, 600, 1000, 4000, 34000], + [100, 150, 250, 1000, 1000], + ], + index=exp.index, + columns=exp.columns, + dtype=np.float64, + ) + + +def test_crf(exp, expected_factors, expected_crf): + """Test UQ normalization.""" + # Simple case: Fit some data, transform same data. + transformer = CRF() + transformer.fit(exp) + factors = transformer.get_norm_factors(exp) + transformed_data = transformer.transform(exp) + + np.testing.assert_allclose(factors, expected_factors, rtol=1e-4) + np.testing.assert_allclose(transformed_data, expected_crf, rtol=1e-4) + + # Advanced case: Fit some samples, transform different ones. + samples_to_fit = ["Sample_1", "Sample_3", "Sample_4"] + exp1 = exp.loc[samples_to_fit] + transformer.fit(exp1) + factors = transformer.get_norm_factors(exp1) + transformed_data = transformer.transform(exp.loc[["Sample_2"]]) + + np.testing.assert_allclose(factors, [1, 0.5, 2.0], rtol=1e-4) + np.testing.assert_allclose( + transformed_data, + expected_crf.loc[["Sample_2"]], + rtol=1e-3, + ) diff --git a/tests/test_rle.py b/tests/test_rle.py new file mode 100644 index 0000000..25da255 --- /dev/null +++ b/tests/test_rle.py @@ -0,0 +1,73 @@ +from pathlib import Path + +import numpy as np +import pandas as pd +import pytest + +from rnanorm import RLE +from rnanorm.datasets import load_gtex + + +@pytest.fixture +def expected_rle(exp): + return pd.DataFrame( + [ + [20000, 30000, 50000, 200000, 700000], + [20000, 30000, 50000, 200000, 700000], + [20000, 30000, 50000, 200000, 1700000], + [20000, 30000, 50000, 200000, 200000], + ], + index=exp.index, + columns=exp.columns, + dtype=np.float64, + ) + + +def test_rle(exp, expected_factors, expected_rle): + """Test RLE normalization.""" + # Simple case: Fit some data, transform same data. + transformer = RLE() + transformer.fit(exp) + factors = transformer.get_norm_factors(exp) + transformed_data = transformer.transform(exp) + + np.testing.assert_allclose(factors, expected_factors, rtol=1e-4) + np.testing.assert_allclose(transformed_data, expected_rle, rtol=1e-4) + + # Advanced case: Fit some samples, transform different ones. + samples_to_fit = ["Sample_1", "Sample_3", "Sample_4"] + exp1 = exp.loc[samples_to_fit] + transformer.fit(exp1) + factors = transformer.get_norm_factors(exp1) + transformed_data = transformer.transform(exp.loc[["Sample_2"]]) + + np.testing.assert_allclose(factors, [1, 0.5, 2.0], rtol=1e-4) + np.testing.assert_allclose( + transformed_data, + expected_rle.loc[["Sample_2"]], + rtol=1e-3, + ) + + +def test_rle_rnanorm_edger(): + """Test our results against EdgeR's RLE implementation.""" + # Load saved scaling factors from edgeR + files_dir = Path(__file__).parent / "files" + edger_factors = pd.read_csv( + files_dir / "gtex.lung.30.chr21.rle_factors_edgeR.tsv", + sep="\t", + comment="#", + index_col=0, + usecols=[0, 2, 3], + ) + + # Compute scaling factors here + ds = load_gtex() + rnanorm_factors = RLE().fit(ds.exp).get_norm_factors(ds.exp) + + # Compare loaded and computed scaling factors + np.testing.assert_array_almost_equal( + edger_factors["norm.factors"].values, + rnanorm_factors, + decimal=14, + )