[WIP] Mp/dispersion smoothing #145

Open · wants to merge 59 commits into base: development from mp/dispersion_smoothing

Changes from all commits
c001fef
added sctransform vst wrapper
picciama May 27, 2022
9f768db
fixed trailing whitespace
picciama May 27, 2022
3fe5524
removed return statements
picciama May 27, 2022
74155f0
cast np.finfo to float explicitly
picciama May 27, 2022
47c8341
resolved comments by Ilan Gold in #145
picciama Jun 19, 2022
6e347f7
bugfix: also check for "no_smoothing"
picciama Jun 19, 2022
235e8a5
added nbdeviance from edgeR
picciama Jul 15, 2022
66b3ed5
added aveLogCPM from edgeR
picciama Jul 15, 2022
17360a0
added calcNormFactors from edgeR
picciama Jul 15, 2022
6e1e530
added residDF from edgeR
picciama Jul 15, 2022
21f6570
added wleb wrapper from edgeR
picciama Jul 15, 2022
63444d1
added maximizeInterpolant from edgeR
picciama Jul 15, 2022
9756c36
added external.py for external imports
picciama Jul 15, 2022
9dd2d38
added squeezeVar from limma
picciama Jul 15, 2022
b09b853
added glm_one_group from edgeR
picciama Jul 15, 2022
0b0dcef
bugfixes in tmmwsp fixed
picciama Jul 19, 2022
1541d7d
added deps for newly included edgeR procedures
picciama Jul 23, 2022
c6dcf99
compute() dask arrays before calculating nb_dev
picciama Jul 23, 2022
7701c78
added levenberg-marquardt estimator as in edgeR
picciama Jul 23, 2022
bb78dab
added qr_decomposition for param init as in edgeR
picciama Jul 23, 2022
ae5a8bd
integrated batchglm model / estimator environment
picciama Aug 6, 2022
5280a08
added documentation, bugfixing and use c_utils
picciama Aug 6, 2022
737e099
added documentation and minor fixes
picciama Aug 6, 2022
ce34637
documentation and cleanup
picciama Aug 6, 2022
d307bd4
cleanup
picciama Aug 6, 2022
4547fdd
documentation / cleanup
picciama Aug 6, 2022
5d54b1f
cleanup / bugfixing /integration in estimator
picciama Aug 6, 2022
8c4e96c
added imports
picciama Aug 6, 2022
56d9220
bugfixing and cleanup
picciama Aug 6, 2022
281336a
added effects function
picciama Aug 6, 2022
291f2f2
added module init
picciama Aug 6, 2022
2829929
added fit_f_dist function
picciama Aug 6, 2022
9b4bb0b
added adjusted profile likelihood
picciama Aug 6, 2022
8d4ac3b
cleanup and integration of one_group_glm
picciama Aug 6, 2022
0e78480
added prior degrees of freedom
picciama Aug 6, 2022
dc4ad4c
added basic estimateDisp function
picciama Aug 6, 2022
f2e0831
added c_utility functions
picciama Aug 6, 2022
26861d7
cleanup
picciama Aug 7, 2022
dd7c053
added support for summed nb_deviance
picciama Aug 7, 2022
2520732
bugfix: leverages not correctly returned
picciama Aug 7, 2022
6abe5c9
removed print statement
picciama Aug 7, 2022
3d2e1f3
ignore irrelevant log division errors
picciama Aug 7, 2022
9595dd9
delayed levenberg theta_loc init + dask support
picciama Aug 7, 2022
ca7e50d
added sizefactor and dask/nodask support
picciama Aug 7, 2022
e1228ee
added newest mypy and pytest versions
picciama Sep 13, 2022
0a21e4a
updated pre-commit yaml
picciama Sep 13, 2022
0594375
bumped version of black to support newer click
picciama Sep 13, 2022
09475f0
typo fixed
picciama Sep 13, 2022
0e53b11
added dask support to typehint
picciama Sep 13, 2022
5cd4676
make constraints a mandatory argument
picciama Sep 13, 2022
8730ba7
fix mypy related problems + black reformatting
picciama Sep 13, 2022
a0636ca
index using np.ndarray for mypy to succeed
picciama Sep 20, 2022
5856631
return all, not just prior_df
picciama Sep 20, 2022
d961166
minor refactoring and bugfixing
picciama Sep 20, 2022
4a230b5
fixed pre-commit hooks and mypy
picciama Sep 22, 2022
fbc5e91
increased the param clipping limits
picciama Sep 22, 2022
16915b7
Merge branch 'development' into mp/dispersion_smoothing
picciama Oct 7, 2022
13a7800
fix mypy / batchglm API after merging development
picciama Oct 7, 2022
dd63af4
added missing import and init file
picciama Oct 8, 2022
77 changes: 77 additions & 0 deletions batchglm/external/edgeR/adjProfileLik.py
@@ -0,0 +1,77 @@
import dask.array
import numpy as np
import scipy.linalg.lapack
import scipy.special

from .estimator import NBEstimator


def adjusted_profile_likelihood(
    estimator: NBEstimator,
    adjust: bool = True,
):
    """
    Featurewise Cox-Reid adjusted profile log-likelihoods for the dispersion.
    The dispersion can be a scalar or a featurewise vector.
    Computationally, the dispersion can also be a matrix, but the APL is still computed tagwise.
    This is a numpy-vectorized python version of edgeR's adjProfileLik function implemented in C++.
    """
    low_value = 1e-10
    log_low_value = np.log(low_value)

    estimator.train(maxit=250, tolerance=1e-10)
    model = estimator._model_container
    poisson_idx = np.where(1 / model.scale < 0)[0]
    if isinstance(poisson_idx, dask.array.core.Array):
        poisson_idx = poisson_idx.compute()

    if len(poisson_idx) == model.num_features:
        # All features are treated as Poisson: use the Poisson log-likelihood directly.
        loglik = model.x * np.log(model.location) - model.location - scipy.special.gammaln(model.x + 1)
    elif len(poisson_idx) == 0:
        loglik = model.ll
    else:
        # Mixed case: Poisson log-likelihood for the Poisson-like features,
        # the NB log-likelihood for the rest.
        loglik = np.zeros_like(model.x)

        poisson_x = model.x[:, poisson_idx]
        poisson_loc = model.location_j(poisson_idx)

        loglik[:, poisson_idx] = poisson_x * np.log(poisson_loc) - poisson_loc - scipy.special.gammaln(poisson_x + 1)

        non_poisson_idx = np.where(model.theta_scale > 0)[0]
        loglik[:, non_poisson_idx] = model.ll_j(non_poisson_idx)

    sum_loglik = np.sum(loglik, axis=0)

    if adjust:
        w = -model.fim_weight_location_location

        adj = np.zeros(model.num_features)
        n_loc_params = model.design_loc.shape[1]
        if n_loc_params == 1:
            # Intercept-only model: the Fisher information is a scalar per feature.
            adj = np.sum(w, axis=0)
            adj = np.log(np.abs(adj))
            if isinstance(adj, dask.array.core.Array):
                adj = adj.compute()
        else:
            # Per-feature Fisher information matrices X^T W X via einsum.
            xh = model.xh_loc
            xhw = np.einsum("ob,of->fob", xh, w)
            fim = np.einsum("fob,oc->fbc", xhw, xh)
            if isinstance(fim, dask.array.core.Array):
                fim = fim.compute()
            for i in range(fim.shape[0]):
                # Log-determinant via LDL^T factorization, as in edgeR's C++ code.
                ldu, _, info = scipy.linalg.lapack.dsytrf(lower=0, a=fim[i])
                if info < 0:
                    adj[i] = 0
                    print(f"LDL factorization failed for feature {i}")
                else:
                    ldu_diag = np.diag(ldu)
                    adj[i] = np.sum(
                        np.where((ldu_diag < low_value) | np.isinf(ldu_diag), log_low_value, np.log(ldu_diag))
                    )

        adj /= 2
        sum_loglik -= adj

    return sum_loglik
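The per-feature loop above computes the Cox-Reid adjustment, 0.5 · log|XᵀWX|, through a LAPACK LDLᵀ factorization. A minimal sketch of the same quantity using `np.linalg.slogdet` (the helper name and the random data are hypothetical, not part of the PR):

```python
import numpy as np

def cox_reid_adjustment(xh: np.ndarray, w: np.ndarray) -> float:
    """Cox-Reid adjustment 0.5 * log|X^T W X| for a single feature.

    xh: (n_obs, n_params) location design matrix
    w:  (n_obs,) positive Fisher-information weights for this feature
    """
    fim = xh.T @ (w[:, None] * xh)         # Fisher information X^T W X
    sign, logdet = np.linalg.slogdet(fim)  # numerically stable log-determinant
    return 0.5 * logdet

rng = np.random.default_rng(0)
xh = np.column_stack([np.ones(10), rng.normal(size=10)])  # intercept + 1 covariate
w = rng.uniform(0.5, 2.0, size=10)                        # positive weights
adj = cox_reid_adjustment(xh, w)
```

For a well-conditioned positive-definite Fisher information this agrees with summing the logs of the LDLᵀ pivots as done in the loop above; the PR's clipping of small or infinite pivots only matters near singularity.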
80 changes: 80 additions & 0 deletions batchglm/external/edgeR/aveLogCPM.py
@@ -0,0 +1,80 @@
from typing import Optional, Union

import dask.array
import numpy as np

from .external import InputDataGLM, ModelContainer, NBModel
from .glm_one_group import fit_single_group, get_single_group_start


def calculate_avg_log_cpm(
    x: np.ndarray,
    size_factors: Optional[np.ndarray] = None,
    dispersion: Union[np.ndarray, float] = 0.05,
    prior_count: int = 2,
    weights: Optional[Union[np.ndarray, float]] = None,
    maxit: int = 50,
    tolerance: float = 1e-10,
    chunk_size_cells=1e6,
    chunk_size_genes=1e6,
):
    """
    Computes the average log2 counts per million per feature over all observations.
    The method is a python derivative of edgeR's aveLogCPM method.

    :param x: the counts data.
    :param size_factors: Optional size factors. This is equivalent to edgeR's offsets.
    :param dispersion: Optional fixed dispersion parameter used during the calculation.
    :param prior_count: The count added to x prior to the calculation.
    :param weights: Optional weights per feature (currently unsupported and ignored).
    :param maxit: The maximum number of iterations during Newton-Raphson approximation.
    :param tolerance: The minimal change used as a stopping criterion during NR approximation.
    :param chunk_size_cells: chunk size over the observation axis when using dask.
    :param chunk_size_genes: chunk size over the feature axis when using dask.
    """

    if weights is None:
        weights = 1.0
    if isinstance(dispersion, float):
        dispersion = np.full((1, x.shape[1]), dispersion, dtype=float)
    if size_factors is None:
        size_factors = np.full((x.shape[0], 1), np.log(1.0))

    adjusted_prior, adjusted_size_factors = add_priors(prior_count, size_factors)
    x = x + adjusted_prior
    avg_cpm_model = NBModel(
        InputDataGLM(
            data=x,
            design_loc=np.ones((x.shape[0], 1)),
            design_loc_names=["Intercept"],
            size_factors=adjusted_size_factors,
            design_scale=np.ones((x.shape[0], 1)),
            design_scale_names=["Intercept"],
            as_dask=isinstance(x, dask.array.core.Array),
            chunk_size_cells=chunk_size_cells,
            chunk_size_genes=chunk_size_genes,
        )
    )
    avg_cpm_model_container = ModelContainer(
        model=avg_cpm_model,
        init_theta_location=get_single_group_start(avg_cpm_model.x, avg_cpm_model.size_factors),
        init_theta_scale=np.log(1 / dispersion),
        chunk_size_genes=chunk_size_genes,
        dtype=x.dtype,
    )

    fit_single_group(avg_cpm_model_container, maxit=maxit, tolerance=tolerance)
    # Convert the natural-log intercept to the log2 counts-per-million scale.
    output = (avg_cpm_model_container.theta_location + np.log(1e6)) / np.log(2)

    return output


def add_priors(prior_count: int, size_factors: np.ndarray):
    """Scale the prior count by each observation's relative size factor and adjust the size factors accordingly."""
    factors = np.exp(size_factors)
    avg_factors = np.mean(factors)
    adjusted_priors = prior_count * factors / avg_factors
    adjusted_size_factors = np.log(factors + 2 * adjusted_priors)

    return adjusted_priors, adjusted_size_factors
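A worked example of the two conversions in this file (all numbers hypothetical): `add_priors` scales the prior count by each observation's relative library size, so the priors average out to `prior_count`, and the fitted natural-log intercept is then shifted to the log2 counts-per-million scale:

```python
import numpy as np

# Hypothetical library sizes; size_factors are their logs, matching the
# offset convention used by calculate_avg_log_cpm above.
lib_sizes = np.array([1e4, 2e4, 5e3, 1e4])
size_factors = np.log(lib_sizes)[:, None]

prior_count = 2
factors = np.exp(size_factors)
adjusted_priors = prior_count * factors / np.mean(factors)    # as in add_priors
adjusted_size_factors = np.log(factors + 2 * adjusted_priors)

# Final conversion used above: natural-log intercept -> log2 CPM.
mu, lib = 250.0, 1e4                    # hypothetical fitted mean and library size
theta_location = np.log(mu / lib)
log2_cpm = (theta_location + np.log(1e6)) / np.log(2)
```

Larger libraries receive a proportionally larger prior, which keeps the prior's influence on log-CPM comparable across observations; the `(theta + ln 1e6) / ln 2` step is just `log2(mu / lib * 1e6)` expressed in natural logs.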