Implement IntervalCensoredDistribution (#2090)

vanAmsterdam · vanamsterdam · web-flow · commit d49f71825691 · 2025-11-01T20:28:10.000+01:00
* start implementing intervalcensored

* minor updates, passing non-interval specific tests

* add logp test for intervalcensored

* allow exact observations in intervalcensored; update tests for intervalcensored

* update docs

* implement interval censoring validate_sample

* use log1mexp for numerical stability

---------

Co-authored-by: vanamsterdam <w.a.c.vanamsterdam@gmail.com>
diff --git a/docs/source/distributions.rst b/docs/source/distributions.rst
@@ -766,21 +766,28 @@ Censored Distributions
 -----------------------
 
 LeftCensoredDistribution
-^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^
 .. autoclass:: numpyro.distributions.censored.LeftCensoredDistribution
     :members:
     :undoc-members:
     :show-inheritance:
     :member-order: bysource
 
 RightCensoredDistribution
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^
 .. autoclass:: numpyro.distributions.censored.RightCensoredDistribution
     :members:
     :undoc-members:
     :show-inheritance:
     :member-order: bysource
 
+IntervalCensoredDistribution
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+.. autoclass:: numpyro.distributions.censored.IntervalCensoredDistribution
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :member-order: bysource
 
 TensorFlow Distributions
 ------------------------
diff --git a/numpyro/distributions/__init__.py b/numpyro/distributions/__init__.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from numpyro.distributions.censored import (
+    IntervalCensoredDistribution,
     LeftCensoredDistribution,
     RightCensoredDistribution,
 )
@@ -200,6 +201,7 @@
     "RightTruncatedDistribution",
     "LeftCensoredDistribution",
     "RightCensoredDistribution",
+    "IntervalCensoredDistribution",
     "SineBivariateVonMises",
     "SineSkewed",
     "SoftLaplace",
diff --git a/numpyro/distributions/censored.py b/numpyro/distributions/censored.py
@@ -3,6 +3,9 @@
 
 
 from typing import Optional
+import warnings
+
+import numpy as np
 
 import jax
 from jax import lax
@@ -12,10 +15,8 @@
 from numpyro._typing import ConstraintT, DistributionT
 from numpyro.distributions import constraints
 from numpyro.distributions.distribution import Distribution
-from numpyro.distributions.util import (
-    promote_shapes,
-    validate_sample,
-)
+from numpyro.distributions.util import log1mexp, promote_shapes, validate_sample
+from numpyro.util import find_stack_level, not_jax_tracer
 
 
 class LeftCensoredDistribution(Distribution):
@@ -249,3 +250,246 @@ def log_survival_censored(x):
             log_survival_censored(value),  # censored observations: log S(t)
             self.base_dist.log_prob(value),  # observed values: log f(t)
         )
+
+
+class IntervalCensoredDistribution(Distribution):
+    r"""
+    Distribution wrapper for interval-censored outcomes.
+
+    This distribution augments a base distribution with interval censoring,
+    so that the likelihood contribution depends on whether the observation is
+    exactly observed, left-censored, right-censored, interval-censored, or
+    doubly-censored (i.e., known to lie outside the observed interval).
+
+    :param base_dist: Parametric distribution for the *uncensored* values
+            (e.g., Exponential, Weibull, LogNormal, Normal, etc.).
+            This distribution must implement a ``cdf`` method.
+    :type base_dist: numpyro.distributions.Distribution
+    :param left_censored: Indicator per observation:
+            1 → observation is left-censored at the reported upper bound
+            0 → not left-censored
+    :type left_censored: array-like of {0,1}
+    :param right_censored: Indicator per observation:
+            1 → observation is right-censored at the reported lower bound
+            0 → not right-censored
+    :type right_censored: array-like of {0,1}
+    :param validate_args: When truthy, the constructor additionally evaluates
+            ``base_dist.cdf`` once and raises ``TypeError`` if that call raises
+            ``NotImplementedError`` or ``AttributeError``.
+    :type validate_args: bool, optional
+
+    .. note::
+            The ``log_prob(value)`` method expects the last axis of ``value`` to
+            have size 2 and to hold ``(lower, upper)``, e.g. an array of shape
+            ``(batch_size, 2)``.
+            The contribution to the log-likelihood is determined as follows::
+
+                    log F(upper)                    if left_censored == 1 and right_censored == 0
+                    log (1 - F(lower))              if right_censored == 1 and left_censored == 0
+                    log (F(upper) - F(lower))       if both == 0  (interval-censored)
+                    log (1 - (F(upper) - F(lower))) if both == 1  (doubly-censored)
+                    log f(lower)                    if both == 0 and lower ≈ upper (point interval)
+
+            where f is the density and F the cumulative distribution function of ``base_dist``.
+
+            The bound that is not used by a one-sided censored row is ignored:
+            the lower bound of a left-censored row and the upper bound of a
+            right-censored row may hold any placeholder (e.g. ``inf``).
+
+            This is commonly used in survival analysis, where event times are positive,
+            but the approach is general and can be applied to any distribution
+            with a cumulative distribution function, regardless of support.
+
+            In R's ``survival`` package notation, this corresponds to
+            ``Surv(l, r, type = 'interval2')``.
+
+            Example::
+
+                    Surv(l = c(2, 4, 6), r = c(5, Inf, 9), type = 'interval2')
+
+            means::
+
+                    subject 1 had an event in (2, 5]
+                    subject 2 was right-censored at 4
+                    subject 3 had an event in (6, 9]
+
+    **Example:**
+
+    .. doctest::
+
+            >>> from jax import numpy as jnp
+            >>> from numpyro import distributions as dist
+            >>> base = dist.Weibull(concentration=2.0, scale=3.0)
+            >>> left_censored = jnp.array([0, 0, 0])
+            >>> right_censored = jnp.array([0, 1, 0])
+            >>> surv_dist = dist.IntervalCensoredDistribution(base, left_censored, right_censored)
+            >>> values = jnp.array([
+            ...     [2.0, 5.0],
+            ...     [4.0, jnp.inf],
+            ...     [6.0, 9.0],
+            ... ])
+            >>> loglik = surv_dist.log_prob(values)
+    """
+
+    arg_constraints = {
+        "left_censored": constraints.boolean,
+        "right_censored": constraints.boolean,
+    }
+    pytree_data_fields = ("base_dist", "left_censored", "right_censored", "_support")
+
+    def __init__(
+        self,
+        base_dist: DistributionT,
+        left_censored: ArrayLike,
+        right_censored: ArrayLike,
+        *,
+        validate_args: Optional[bool] = None,
+    ):
+        # Optionally test that cdf actually works (in validate_args mode)
+        if validate_args:
+            try:
+                test_val = base_dist.support.feasible_like(jnp.array(0.0))
+                _ = base_dist.cdf(test_val)
+            except (NotImplementedError, AttributeError) as e:
+                raise TypeError(
+                    f"{type(base_dist).__name__}.cdf() is not properly implemented."
+                ) from e
+        # The batch shape is the broadcast of the base distribution's batch
+        # shape with both indicator arrays; everything is promoted to it.
+        batch_shape = lax.broadcast_shapes(
+            base_dist.batch_shape, jnp.shape(left_censored), jnp.shape(right_censored)
+        )
+        self.base_dist = jax.tree.map(
+            lambda p: promote_shapes(p, shape=batch_shape)[0], base_dist
+        )
+        self.left_censored = jnp.array(
+            promote_shapes(left_censored, shape=batch_shape)[0], dtype=jnp.bool
+        )
+        self.right_censored = jnp.array(
+            promote_shapes(right_censored, shape=batch_shape)[0], dtype=jnp.bool
+        )
+        self._support = base_dist.support
+        # Each observation is a (lower, upper) pair, hence event_shape=(2,).
+        super().__init__(batch_shape, event_shape=(2,), validate_args=validate_args)
+
+    def sample(
+        self, key: Optional[jax.dtypes.prng_key], sample_shape: tuple[int, ...] = ()
+    ) -> ArrayLike:
+        """
+        Draw samples from the uncensored base distribution.
+
+        No censoring is applied: the call is forwarded to ``base_dist``
+        expanded to this distribution's batch shape.
+
+        :param key: PRNG key passed through to ``base_dist.sample``.
+        :param sample_shape: leading sample shape passed through to
+            ``base_dist.sample``.
+        :return: whatever ``base_dist.sample`` returns.
+        """
+        return self.base_dist.expand(self.batch_shape).sample(key, sample_shape)
+
+    @constraints.dependent_property(is_discrete=False, event_dim=1)
+    def support(self) -> ConstraintT:
+        """Support of the wrapped base distribution, captured at construction."""
+        return self._support
+
+    def _get_censoring_masks(self, value):
+        """
+        Split observations into mutually exclusive censoring categories.
+
+        :param value: array whose last axis holds ``(lower, upper)``.
+        :return: boolean masks ``(m_left, m_right, m_int, m_double, m_point)``.
+            ``m_point`` marks rows with both indicators off whose bounds are
+            (numerically) equal; those rows are excluded from ``m_int``.
+        """
+
+        x1 = jnp.take(value, 0, axis=-1)  # left bound
+        x2 = jnp.take(value, 1, axis=-1)  # right bound
+
+        m_left = self.left_censored & (~self.right_censored)  # left-censored only
+        m_right = self.right_censored & (~self.left_censored)  # right-censored only
+        m_int = (~self.left_censored) & (~self.right_censored)  # interval censored
+        m_double = self.left_censored & self.right_censored  # doubly censored
+        m_point = jnp.isclose(x1, x2) & m_int  # point observation
+        m_int = m_int & (~m_point)  # update interval mask to exclude point obs
+        return m_left, m_right, m_int, m_double, m_point
+
+    @validate_sample
+    def log_prob(self, value):
+        """
+        Evaluate the censored log-likelihood of ``(lower, upper)`` pairs.
+
+        :param value: array whose last axis has size 2 and holds
+            ``(lower, upper)`` for each observation.
+        :return: log-likelihood contribution per observation; which expression
+            is used for a row depends on its censoring indicators (see the
+            class docstring).
+        """
+        dtype = jnp.result_type(value, float)
+        minval = 100.0 * jnp.finfo(dtype).tiny  # floor for values close to 0
+        eps = jnp.finfo(dtype).eps  # margin that keeps values below 1
+
+        x1 = jnp.take(value, 0, axis=-1)  # left bound
+        x2 = jnp.take(value, 1, axis=-1)  # right bound
+
+        # make masks based on censoring indicators
+        m_left, m_right, m_int, m_double, m_point = self._get_censoring_masks(value)
+
+        # Replace potential out-of-support values with a feasible placeholder
+        # BEFORE calling into base_dist (value doesn't matter; the result on
+        # those rows is overwritten below).
+        feasible_value = self.support.feasible_like(x1)
+        x1_finite = jnp.where(m_left, feasible_value, x1)
+        x2_finite = jnp.where(m_right, feasible_value, x2)
+
+        # CDF on safe values, overwritten with the limit values on censored
+        # rows. Left-censored: F1 := 0. Right-censored: F2 := 1.
+        F1 = jnp.where(m_left, 0.0, self.base_dist.cdf(x1_finite))
+        F2 = jnp.where(m_right, 1.0, self.base_dist.cdf(x2_finite))
+
+        # Stabilize against log(0) and tiny intervals
+        F1 = jnp.clip(F1, minval, 1.0 - eps)
+        F2 = jnp.clip(F2, minval, 1.0 - eps)
+        logF1 = jnp.log(F1)
+        logF2 = jnp.log(F2)
+
+        # log(F2 - F1) = logF2 + log(1 - exp(logF1 - logF2)).
+        # log1mexp is used for the last term: with a plain log1p(-exp(.)) the
+        # clip to -minval has no effect, because exp(-minval) rounds to 1 and
+        # the result is -inf anyway.
+        lp_interval = logF2 + log1mexp(jnp.clip(logF1 - logF2, max=-minval))
+
+        # doubly censored: the value is NOT in the interval, i.e. 1 - exp(lp_interval)
+        lp_double = log1mexp(lp_interval)
+        # left: log F(x2)
+        lp_left = logF2
+        # right: log (1 - F(x1)) = log1p(-F1)
+        lp_right = jnp.log1p(-F1)
+        # point intervals (x1 ≈ x2): log density instead of log probability.
+        # Point rows are never left-censored, so x1_finite equals x1 on them.
+        lp_point = self.base_dist.log_prob(x1_finite)
+
+        # Select the right expression per row. The five masks are mutually
+        # exclusive. m_int excludes point rows, so they need their own
+        # selection; otherwise they would keep the initial value 0.
+        logp = jnp.zeros_like(logF1)
+        logp = jnp.where(m_left, lp_left, logp)
+        logp = jnp.where(m_right, lp_right, logp)
+        logp = jnp.where(m_int, lp_interval, logp)
+        logp = jnp.where(m_double, lp_double, logp)
+        logp = jnp.where(m_point, lp_point, logp)
+        return logp
+
+    def _validate_sample(self, value: ArrayLike) -> ArrayLike:
+        """
+        Check ``(lower, upper)`` pairs against the censoring pattern.
+
+        Only the bounds that a row actually uses are checked against the
+        support of ``base_dist``; interval- and doubly-censored rows must
+        additionally satisfy ``upper > lower``. Each failed check emits its
+        own warning (skipped while tracing).
+
+        :param value: array whose last axis has size 2 and holds
+            ``(lower, upper)``.
+        :return: boolean mask that is ``True`` for rows passing every check
+            that applies to them.
+        :raises ValueError: if the last axis of ``value`` does not have size 2.
+        """
+        shape = jnp.shape(value)
+        if len(shape) == 0 or shape[-1] != 2:
+            raise ValueError(
+                f"Expected last dimension of `value` to be 2 (lower, upper), but got shape {shape}"
+            )
+        x1 = jnp.take(value, 0, axis=-1)  # left bound
+        x2 = jnp.take(value, 1, axis=-1)  # right bound
+        m_left, m_right, m_int, m_double, m_point = self._get_censoring_masks(value)
+
+        # Validity of each bound under base_dist. Its own warnings are
+        # silenced because an unused bound is allowed to be out of support.
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            x1_mask = self.base_dist._validate_sample(x1)
+            x2_mask = self.base_dist._validate_sample(x2)
+
+        # (rows violating the check, warning to emit). Every check is
+        # evaluated on its own rows so that a failure in one category does not
+        # trigger the warnings of the others.
+        checks = (
+            (
+                m_left & ~x2_mask,
+                "For left-censored observations, "
+                "upper bound should be within the support of base_dist.",
+            ),
+            (
+                m_right & ~x1_mask,
+                "For right-censored observations, "
+                "lower bound should be within the support of base_dist.",
+            ),
+            (
+                (m_int | m_double | m_point) & ~(x1_mask & x2_mask),
+                "For interval-censored, doubly-censored, or exact observations, "
+                "both bounds should be within the support of base_dist.",
+            ),
+            (
+                (m_int | m_double) & ~(x2 > x1),
+                "For interval-censored and doubly-censored observations, "
+                "upper bound should be greater than lower bound.",
+            ),
+        )
+
+        mask = jnp.ones_like(x1, dtype=jnp.bool)
+        for violated, message in checks:
+            if not_jax_tracer(violated) and np.any(violated):
+                warnings.warn(message, stacklevel=find_stack_level())
+            mask = mask & ~violated
+        return mask
diff --git a/test/test_distributions.py b/test/test_distributions.py