Skip to content
Open
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ Python implementation of common RNA-seq normalization methods:
- CUF_ (Counts adjusted with UQ factors)
- TMM_ (Trimmed mean of M-values)
- CTF_ (Counts adjusted with TMM factors)
- RLE_ (Relative log expression)
- CRF (Counts adjusted with RLE factors)

For in-depth description of methods see documentation_.

Expand All @@ -47,6 +49,7 @@ For in-depth description of methods see documentation_.
.. _CUF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
.. _TMM: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-3-r25
.. _CTF: https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9/
.. _RLE: https://genomebiology.biomedcentral.com/articles/10.1186/gb-2010-11-10-r106
.. _documentation: https://rnanorm.readthedocs.io/


Expand Down
16 changes: 16 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,22 @@ Change Log
All notable changes to this project are documented in this file.


==================
2.3.0 - 2025-08-07
==================

Added
-----
- Implementation of the following methods:

- RLE
- CRF

Changed
-------
- Emit a warning when a sample has an upper-quartile of zero during
UQ normalisation; previously this silently produced Inf

==================
2.2.0 - 2025-01-29
==================
Expand Down
2 changes: 2 additions & 0 deletions docs/ref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ Normalization methods
CUF
TMM
CTF
RLE
CRF


Datasets
Expand Down
4 changes: 3 additions & 1 deletion src/rnanorm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
"""RNA-seq normalization methods."""

from .methods.between_sample import CTF, CUF, TMM, UQ
from .methods.between_sample import CRF, CTF, CUF, RLE, TMM, UQ
from .methods.utils import LibrarySize
from .methods.within_sample import CPM, FPKM, TPM

__all__ = (
"CPM",
"CTF",
"CUF",
"CRF",
"FPKM",
"LibrarySize",
"TMM",
"RLE",
"TPM",
"UQ",
)
20 changes: 18 additions & 2 deletions src/rnanorm/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import click
import pandas as pd

from rnanorm import CPM, CTF, CUF, FPKM, TMM, TPM, UQ
from rnanorm import CPM, CRF, CTF, CUF, FPKM, RLE, TMM, TPM, UQ

method_type = Type[Union[CPM, FPKM, TMM, TPM, UQ]]
method_type = Type[Union[CPM, FPKM, TMM, RLE, TPM, UQ]]
file_type = Union[io.TextIOWrapper, Path]


Expand Down Expand Up @@ -186,6 +186,20 @@ def ctf(exp: pd.DataFrame, out: file_type, force: bool, m_trim: float, a_trim: f
CLWrapper(CTF, m_trim=m_trim, a_trim=a_trim).handle(exp, out, force)


@click.command(short_help="Reliative log expression")
@common_params
def rle(exp: pd.DataFrame, out: file_type, force: bool) -> None:
"""Compute RLE."""
CLWrapper(RLE).handle(exp, out, force)


@click.command(short_help="Counts adjusted with RLE factors")
@common_params
def crf(exp: pd.DataFrame, out: file_type, force: bool) -> None:
"""Compute CRF."""
CLWrapper(CRF).handle(exp, out, force)


@click.group()
def main() -> None:
"""Common RNA-seq normalization methods."""
Expand All @@ -200,3 +214,5 @@ def main() -> None:
main.add_command(cuf)
main.add_command(tmm)
main.add_command(ctf)
main.add_command(rle)
main.add_command(crf)
223 changes: 219 additions & 4 deletions src/rnanorm/methods/between_sample.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Between sample normalizations."""

import warnings
from typing import Any, Optional

import numpy as np
Expand Down Expand Up @@ -70,6 +71,26 @@ class UQ(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
Sample_4 20000.0 30000.0 50000.0 200000.0 200000.0
"""

def _warn_zero_uq(self, factors: Numeric1D, X: Numeric2D) -> None:
"""Emit a warning if any sample has a zero upper-quartile."""
zero_uq_mask = factors == 0
if not np.any(zero_uq_mask):
return

zero_uq_samples = np.where(zero_uq_mask)[0]

# If output is pandas, show sample names instead of numeric indices
config = _get_output_config("transform", self)
if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
zero_uq_samples = X.index[zero_uq_samples]

warnings.warn(
"Upper-quartile is 0 for the following sample(s): "
f"{zero_uq_samples.tolist()}. This will set their "
"UQ factors to 0 and break geometric-mean centering "
"returning Inf values."
)

def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
"""Get UQ normalization factors (un-normalized with geometric mean).

Expand All @@ -89,6 +110,7 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
arr=X,
per=75,
)

return upper_quartiles / lib_size

def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
Expand Down Expand Up @@ -117,9 +139,10 @@ def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) ->
:return: Self
"""
self._reset()
X = validate_data(self, X, ensure_all_finite=True, reset=True)
Xv = validate_data(self, X, ensure_all_finite=True, reset=True)

factors = self._get_norm_factors(X)
factors = self._get_norm_factors(Xv)
self._warn_zero_uq(factors, X)
self.geometric_mean_ = gmean(factors)

return self
Expand Down Expand Up @@ -322,9 +345,9 @@ def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
return factors

def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
"""Get UQ normalization factors (normalized with geometric mean).
"""Get TMM normalization factors (normalized with geometric mean).

:param X: Expression raw count matrix (n_samples, n_features)s
:param X: Expression raw count matrix (n_samples, n_features)
"""
check_is_fitted(self)
X2 = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)
Expand Down Expand Up @@ -439,3 +462,195 @@ def transform(self, X: Numeric2D) -> Numeric2D:
X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)

return X / factors[:, np.newaxis]


class RLE(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
"""Relative Log Expression (RLE) normalization.

In RNA-seq experiments, a small subset of genes may be highly overexpressed
in certain samples but not in others. This can artificially inflate library
size and therefore (after library size normalization) cause the remaining
genes to be considered under-sampled in those samples. Unless this effect
is adjusted for, those genes may falsely appear to be down-regulated in
that sample. To address this issue Anders & Huber developed an alternative
normalization method to TMM for the DESeq package, known as the
median-of-ratios method. EdgeR also implements this method under the name
Relative Log Expression (RLE).
Note that by edgeR convention, the Anders–Huber “size factors” are first
divided by each sample’s total library size and then rescaled so that their
geometric mean equals one. This yields normalization factors which, when
multiplied by the library size, produce effective library sizes.
Although the Anders–Huber size factors are linearly proportional to these
effective library sizes, the two packages differ in output scaling: DESeq2
returns normalized counts on the scale of the original library sizes,
whereas edgeR reports counts per million (CPM).

Procedure for normalization is described in `Anders & Huber, 2010
<https://doi.org/10.1186/gb-2010-11-10-r106>`_, but in short:

- Use raw counts
- Compute the reference pseudo-sample (``self.ref_``) as the geometric
mean of all samples for each gene.
- Compute scaling factors:
- For each sample, compute gene-wise ratios relative to the
reference pseudo-sample.
- Size factor = median of these ratios.
- Rescale factors so that their geometric mean is 1
- "Adjusted library size" = library size * normalization factors
- Compute CPM normalization with “Adjusted library size”

.. rubric:: Examples

>>> from rnanorm.datasets import load_toy_data
>>> from rnanorm import RLE
>>> X = load_toy_data().exp
>>> X
Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
Sample_1 200 300 500 2000 7000
Sample_2 400 600 1000 4000 14000
Sample_3 200 300 500 2000 17000
Sample_4 200 300 500 2000 2000
>>> RLE().set_output(transform="pandas").fit_transform(X)
Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
Sample_1 20000.0 30000.0 50000.0 200000.0 700000.0
Sample_2 20000.0 30000.0 50000.0 200000.0 700000.0
Sample_3 20000.0 30000.0 50000.0 200000.0 1700000.0
Sample_4 20000.0 30000.0 50000.0 200000.0 200000.0
"""

def _get_norm_factors(self, X: Numeric2D) -> Numeric1D:
"""Get RLE normalization factors (un-normalized with geometric mean).

:param X: Expression raw count matrix (n_samples, n_features)
"""
# Make sure that global set_config(transform_output="pandas")
# does not affect this method - we need numpy output here.
lib_size = LibrarySize().set_output(transform="default").fit_transform(X)

# Copy the stored reference pseudo-sample so we don’t modify self.ref_
# and mask zeros to NaN to avoid dividing by zero
ref = self.ref_.copy()
ref[ref == 0] = np.nan

# Compute gene-wise ratios to the reference for genes with nonzero
# counts in X.
# This handles cases where the fitted reference and current X may
# differ in which genes are zero.
ratio = np.where(X > 0, X / ref, np.nan)

median_of_ratios = np.nanmedian(ratio, axis=1).astype(float)

return median_of_ratios / lib_size # normalize by library size

def _reset(self) -> None:
"""Reset internal data-dependent state."""
if hasattr(self, "geometric_mean_"):
del self.geometric_mean_
del self.ref_

def fit(self, X: Numeric2D, y: Optional[Numeric1D] = None, **fit_params: Any) -> Self:
"""Fit.

Learn gene‑wise geometric means from the training set.
Genes that contain any zero propagate NaN and are excluded
from downstream size‑factor calculations.

:param X: Expression raw count matrix (n_samples, n_features)
:return: Self
"""

self._reset()
X = validate_data(self, X, ensure_all_finite=True, reset=True, dtype=float)

# compute a reference pseudo-sample by performing a geometric mean
# over all samples for each gene
self.ref_ = gmean(X, axis=0)

factors = self._get_norm_factors(X)
self.geometric_mean_ = gmean(factors) # this is the geometric mean of size factors
return self

def get_norm_factors(self, X: Numeric2D) -> Numeric1D:
"""Get RLE normalization factors (normalized with geometric mean).

:param X: Expression raw count matrix (n_samples, n_features)
"""
check_is_fitted(self)
factors = self._get_norm_factors(X)
factors = factors / self.geometric_mean_

config = _get_output_config("transform", self)
if config.get("dense", None) == "pandas" and isinstance(X, pd.DataFrame):
return pd.Series(factors, index=X.index)
return factors

def transform(self, X: Numeric2D) -> Numeric2D:
"""Transform.

:param X: Expression raw count matrix (n_samples, n_features)
:return: Normalized expression matrix (n_samples, n_features)
"""
# Compute effective library sizes
factors = self.get_norm_factors(X)
if isinstance(factors, pd.Series):
factors = factors.to_numpy()

lib_size = LibrarySize().set_output(transform="default").fit_transform(X)
effective_lib_size = lib_size * factors

# Method ``check_is_fitted`` is not called here, since it is
# called in self.get_norm_factors
X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)

# Make CPM, but with effective library size
return X / effective_lib_size[:, np.newaxis] * 1e6


class CRF(RLE):
"""Counts adjusted with RLE factors normalization.

This method extends the CUF/CTF procedures to RLE factors,
ensuring consistency across RNAnorm methods.
Although not covered in Johnson & Krishnan (2022)
<https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02568-9>`_,
it is included for package-wide consistency. In short:

- Compute normalization factors same as in RLE
- Divide raw counts with these factors

.. rubric:: Examples

>>> from rnanorm.datasets import load_toy_data
>>> from rnanorm import CRF
>>> X = load_toy_data().exp
>>> X
Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
Sample_1 200 300 500 2000 7000
Sample_2 400 600 1000 4000 14000
Sample_3 200 300 500 2000 17000
Sample_4 200 300 500 2000 2000
>>> CRF().set_output(transform="pandas").fit_transform(X)
Gene_1 Gene_2 Gene_3 Gene_4 Gene_5
Sample_1 200.0 300.0 500.0 2000.0 7000.0
Sample_2 400.0 600.0 1000.0 4000.0 14000.0
Sample_3 400.0 600.0 1000.0 4000.0 34000.0
Sample_4 100.0 150.0 250.0 1000.0 1000.0

"""

def transform(self, X: Numeric2D) -> Numeric2D:
"""Transform.

:param X: Expression raw count matrix (n_samples, n_features)
:return: Normalized expression matrix (n_samples, n_features)
"""
# Just divide raw counts with normalization factors
factors = self.get_norm_factors(X)
if isinstance(factors, pd.Series):
factors = factors.to_numpy()

# Method ``check_is_fitted`` is not called here, since it is
# called in self.get_norm_factors
X = validate_data(self, X, ensure_all_finite=True, reset=False, dtype=float)

return X / factors[:, np.newaxis]
Loading