genialis · ftucos · Jul 1, 2025 · Jul 2, 2025 · Jul 2, 2025 · Jul 2, 2025
diff --git a/README.rst b/README.rst
@@ -38,6 +38,8 @@ Python implementation of common RNA-seq normalization methods:
 - CUF_ (Counts adjusted with UQ factors)
 - TMM_ (Trimmed mean of M-values)
 - CTF_ (Counts adjusted with TMM factors)
+- RLE_ (Relative log expression)
+- CRF (Counts adjusted with RLE factors)
 
 For in-depth description of methods see documentation_.
 
@@ -47,6 +49,7 @@ For in-depth description of methods see documentation_.
 .. _CUF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
 .. _TMM: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
 .. _CTF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
+.. _RLE: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106
 .. _documentation: https://rnanorm.readthedocs.io/
 
 

diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -5,6 +5,22 @@ Change Log
 All notable changes to this project are documented in this file.
 
 
+==================
+2.3.0 - 2025-08-07
+==================
+
+Added
+-----
+- Implementation of the following methods:
+
+    - RLE
+    - CRF
+
+Changed
+-------
+- Emit a warning when a sample has an upper-quartile of zero during 
+  UQ normalisation; previously this silently produced Inf
+
 ==================
 2.2.0 - 2025-01-29
 ==================

diff --git a/docs/ref.rst b/docs/ref.rst
@@ -20,6 +20,8 @@ Normalization methods
    CUF
    TMM
    CTF
+   RLE
+   CRF
 
 
 Datasets

diff --git a/src/rnanorm/__init__.py b/src/rnanorm/__init__.py
@@ -1,16 +1,18 @@
 """RNA-seq normalization methods."""
 
-from .methods.between_sample import CTF, CUF, TMM, UQ
+from .methods.between_sample import CRF, CTF, CUF, RLE, TMM, UQ
 from .methods.utils import LibrarySize
 from .methods.within_sample import CPM, FPKM, TPM
 
 __all__ = (
     "CPM",
     "CTF",
     "CUF",
+    "CRF",
     "FPKM",
     "LibrarySize",
     "TMM",
+    "RLE",
     "TPM",
     "UQ",
 )
diff --git a/src/rnanorm/cli.py b/src/rnanorm/cli.py
@@ -9,9 +9,9 @@
 import click
 import pandas as pd
 
-from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
+from rnanorm import CPM, CRF, CTF, CUF, FPKM, RLE, TMM, TPM, UQ
 
-method_type = Type[Union[CPM, FPKM, TMM, TPM, UQ]]
+method_type = Type[Union[CPM, FPKM, TMM, RLE, TPM, UQ]]
 file_type = Union[io.TextIOWrapper, Path]
 
 
@@ -186,6 +186,20 @@ def ctf(exp: pd.DataFrame, out: file_type, force: bool, m_trim: float, a_trim: f
     CLWrapper(CTF, m_trim=m_trim, a_trim=a_trim).handle(exp, out, force)
 
 
+@click.command(short_help="Reliative log expression")
+@common_params
+def rle(exp: pd.DataFrame, out: file_type, force: bool) -> None:
+    """Compute RLE."""
+    CLWrapper(RLE).handle(exp, out, force)
+
+
+@click.command(short_help="Counts adjusted with RLE factors")
+@common_params
+def crf(exp: pd.DataFrame, out: file_type, force: bool) -> None:
+    """Compute CRF."""
+    CLWrapper(CRF).handle(exp, out, force)
+
+
 @click.group()
 def main() -> None:
     """Common RNA-seq normalization methods."""
@@ -200,3 +214,5 @@ def main() -> None:
 main.add_command(cuf)
 main.add_command(tmm)
 main.add_command(ctf)
+main.add_command(rle)
+main.add_command(crf)
diff --git a/src/rnanorm/methods/between_sample.py b/src/rnanorm/methods/between_sample.py
@@ -1,5 +1,6 @@
 """Between sample normalizations."""
 
+import warnings
 from typing import Any, Optional
 
 import numpy as np
@@ -70,6 +71,26 @@ class UQ(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
     Sample_4  20000.0  30000.0  50000.0  200000.0   200000.0
     """
 
+    def _warn_zero_uq(self, factors: Numeric1D, X: Numeric2D) -> None:
+        """Emit a warning if any sample has a zero upper-quartile."""
+        zero_uq_mask = factors == 0
+        if not np.any(zero_uq_mask):
+            return
+
+        zero_uq_samples = np.where(zero_uq_mask)[0]
+
+        # If output is pandas, show sample names instead of numeric indices
+        config = _get_output_config("transform", self)
+        if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
+            zero_uq_samples = X.index[zero_uq_samples]
+
+        warnings.warn(
+            "Upper-quartile is 0 for the following sample(s): "
+            f"{zero_uq_samples.tolist()}. This will set their "
+            "UQ factors to 0 and break geometric-mean centering "
+            "returning Inf values."
+        )
+
     def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         """Get UQ normalization factors (un-normalized with geometric mean).
 
@@ -89,6 +110,7 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
             arr=X,
             per=75,
         )
+
         return upper_quartiles / lib_size
 
     def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
@@ -117,9 +139,10 @@ def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) ->
         :return: Self
         """
         self._reset()
-        X = validate_data(self, X, ensure_all_finite=True, reset=True)
+        Xv = validate_data(self, X, ensure_all_finite=True, reset=True)
 
-        factors = self._get_norm_factors(X)
+        factors = self._get_norm_factors(Xv)
+        self._warn_zero_uq(factors, X)
         self.geometric_mean_ = gmean(factors)
 
         return self
@@ -322,9 +345,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
         return factors
 
     def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
-        """Get UQ normalization factors (normalized with geometric mean).
+        """Get TMM normalization factors (normalized with geometric mean).
 
-        :param X: Expression raw count matrix (n_samples, n_features)s
+        :param X: Expression raw count matrix (n_samples, n_features)
         """
         check_is_fitted(self)
         X2 = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)
@@ -439,3 +462,195 @@ def transform(self, X: Numeric2D) -> Numeric2D:
         X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)
 
         return X / factors[:, np.newaxis]
+
+
+class RLE(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
+    """Relative Log Expression (RLE) normalization.
+
+    In RNA-seq experiments, a small subset of genes may be highly overexpressed
+    in certain samples but not in others. This can artificially inflate library
+    size and therefore (after library size normalization) cause the remaining
+    genes to be considered under-sampled in those samples. Unless this effect
+    is adjusted for, those genes may falsely appear to be down-regulated in
+    that sample. To address this issue Anders & Huber developed an alternative
+    normalization method to TMM for the DESeq package, known as the
+    median-of-ratios method. EdgeR also implements this method under the name
+    Relative Log Expression (RLE).
+    Note that by edgeR convention, the Anders–Huber “size factors” are first
+    divided by each sample’s total library size and then rescaled so that their
+    geometric mean equals one. This yields normalization factors which, when
+    multiplied by the library size, produce effective library sizes.
+    Although the Anders–Huber size factors are linearly proportional to these
+    effective library sizes, the two packages differ in output scaling: DESeq2
+    returns normalized counts on the scale of the original library sizes,
+    whereas edgeR reports counts per million (CPM).
+
+    Procedure for normalization is described in `Anders & Huber, 2010
+    <https://doi.org/10.1186/gb-2010-11-10-r106>`_, but in short:
+
+        - Use raw counts
+        - Compute the reference pseudo-sample (``self.ref_``) as the geometric
+          mean of all samples for each gene.
+        - Compute scaling factors:
+            - For each sample, compute gene-wise ratios relative to the
+              reference pseudo-sample.
+            - Size factor = median of these ratios.
+            - Rescale factors so that their geometric mean is 1
+        - "Adjusted library size" = library size * normalization factors
+        - Compute CPM normalization with “Adjusted library size”
+
+    .. rubric:: Examples
+
+    >>> from rnanorm.datasets import load_toy_data
+    >>> from rnanorm import RLE
+    >>> X = load_toy_data().exp
+    >>> X
+              Gene_1  Gene_2  Gene_3  Gene_4  Gene_5
+    Sample_1     200     300     500    2000    7000
+    Sample_2     400     600    1000    4000   14000
+    Sample_3     200     300     500    2000   17000
+    Sample_4     200     300     500    2000    2000
+    >>> RLE().set_output(transform="pandas").fit_transform(X)
+               Gene_1   Gene_2   Gene_3    Gene_4     Gene_5
+    Sample_1  20000.0  30000.0  50000.0  200000.0   700000.0
+    Sample_2  20000.0  30000.0  50000.0  200000.0   700000.0
+    Sample_3  20000.0  30000.0  50000.0  200000.0  1700000.0
+    Sample_4  20000.0  30000.0  50000.0  200000.0   200000.0
+    """
+
+    def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
+        """Get RLE normalization factors (un-normalized with geometric mean).
+
+        :param X: Expression raw count matrix (n_samples, n_features)
+        """
+        # Make sure that global set_config(transform_output="pandas")
+        # does not affect this method - we need numpy output here.
+        lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
+
+        # Copy the stored reference pseudo-sample so we don’t modify self.ref_
+        # and mask zeros to NaN to avoid dividing by zero
+        ref = self.ref_.copy()
+        ref[ref == 0] = np.nan
+
+        # Compute gene-wise ratios to the reference for genes with nonzero
+        # counts in X.
+        # This handles cases where the fitted reference and current X may
+        # differ in which genes are zero.
+        ratio = np.where(X > 0, X / ref, np.nan)
+
+        median_of_ratios = np.nanmedian(ratio, axis=1).astype(float)
+
+        return median_of_ratios / lib_size  # normalize by library size
+
+    def _reset(self) -> None:
+        """Reset internal data-dependent state."""
+        if hasattr(self, "geometric_mean_"):
+            del self.geometric_mean_
+            del self.ref_
+
+    def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
+        """Fit.
+
+        Learn gene‑wise geometric means from the training set.
+        Genes that contain any zero propagate NaN and are excluded
+        from downstream size‑factor calculations.
+
+        :param X: Expression raw count matrix (n_samples, n_features)
+        :return: Self
+        """
+
+        self._reset()
+        X = validate_data(self, X, ensure_all_finite=True, reset=True, dtype=float)
+
+        # compute a reference pseudo-sample by performing a geometric mean
+        # over all samples for each gene
+        self.ref_ = gmean(X, axis=0)
+
+        factors = self._get_norm_factors(X)
+        self.geometric_mean_ = gmean(factors)  # this is the geometric mean of size factors
+        return self
+
+    def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
+        """Get RLE normalization factors (normalized with geometric mean).
+
+        :param X: Expression raw count matrix (n_samples, n_features)
+        """
+        check_is_fitted(self)
+        factors = self._get_norm_factors(X)
+        factors = factors / self.geometric_mean_
+
+        config = _get_output_config("transform", self)
+        if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
+            return pd.Series(factors, index=X.index)
+        return factors
+
+    def transform(self, X: Numeric2D) -> Numeric2D:
+        """Transform.
+
+        :param X: Expression raw count matrix (n_samples, n_features)
+        :return: Normalized expression matrix (n_samples, n_features)
+        """
+        # Compute effective library sizes
+        factors = self.get_norm_factors(X)
+        if isinstance(factors, pd.Series):
+            factors = factors.to_numpy()
+
+        lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
+        effective_lib_size = lib_size * factors
+
+        # Method ``check_is_fitted`` is not called here, since it is
+        # called in self.get_norm_factors
+        X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)
+
+        # Make CPM, but with effective library size
+        return X / effective_lib_size[:, np.newaxis] * 1e6
+
+
+class CRF(RLE):
+    """Counts adjusted with RLE factors normalization.
+
+    This method extends the CUF/CTF procedures to RLE factors,
+    ensuring consistency across RNAnorm methods.
+    Although not covered in Johnson & Krishnan (2022)
+    <https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9>`_,
+    it is included for package-wide consistency. In short:
+
+        - Compute normalization factors same as in RLE
+        - Divide raw counts with these factors
+
+    .. rubric:: Examples
+
+    >>> from rnanorm.datasets import load_toy_data
+    >>> from rnanorm import CRF
+    >>> X = load_toy_data().exp
+    >>> X
+              Gene_1  Gene_2  Gene_3  Gene_4  Gene_5
+    Sample_1     200     300     500    2000    7000
+    Sample_2     400     600    1000    4000   14000
+    Sample_3     200     300     500    2000   17000
+    Sample_4     200     300     500    2000    2000
+    >>> CRF().set_output(transform="pandas").fit_transform(X)
+              Gene_1  Gene_2  Gene_3  Gene_4   Gene_5
+    Sample_1   200.0   300.0   500.0  2000.0   7000.0
+    Sample_2   400.0   600.0  1000.0  4000.0  14000.0
+    Sample_3   400.0   600.0  1000.0  4000.0  34000.0
+    Sample_4   100.0   150.0   250.0  1000.0   1000.0
+
+    """
+
+    def transform(self, X: Numeric2D) -> Numeric2D:
+        """Transform.
+
+        :param X: Expression raw count matrix (n_samples, n_features)
+        :return: Normalized expression matrix (n_samples, n_features)
+        """
+        # Just divide raw counts with normalization factors
+        factors = self.get_norm_factors(X)
+        if isinstance(factors, pd.Series):
+            factors = factors.to_numpy()
+
+        # Method ``check_is_fitted`` is not called here, since it is
+        # called in self.get_norm_factors
+        X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)
+
+        return X / factors[:, np.newaxis]
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,8 @@ Normalization methods @@
        CUF
        TMM
        CTF
+       RLE
+       CRF
     Datasets
@@ Expand Down @@