feat: Cell Ranger 8.0.1

10XGenomics · Aug 21, 2024 · a039816
1 parent 5b0acf8
commit a039816
Show file tree

Hide file tree

Showing 39 changed files with 4,406 additions and 2,584 deletions.
diff --git a/conda_spec.bzl b/conda_spec.bzl
diff --git a/lib/python/cellranger/cell_calling.py b/lib/python/cellranger/cell_calling.py
@@ -18,6 +18,7 @@
 from cellranger.chemistry import (
     CHEMISTRY_DESCRIPTION_FIELD,
     CHEMISTRY_SC3P_LT,
+    HT_CHEMISTRIES,
     SC3P_V4_CHEMISTRIES,
     SC5P_V3_CHEMISTRIES,
 )
@@ -132,8 +133,12 @@ def get_empty_drops_range(chemistry_description: str, num_probe_bcs: int | None)
     # The chips used with V4 have roughly double the GEMs as the older V3 chips
     v4_chemistries = SC3P_V4_CHEMISTRIES + SC5P_V3_CHEMISTRIES
     v4_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in v4_chemistries]
+    ht_chem_names = [chem[CHEMISTRY_DESCRIPTION_FIELD] for chem in HT_CHEMISTRIES]
+
     if chemistry_description == CHEMISTRY_SC3P_LT[CHEMISTRY_DESCRIPTION_FIELD]:
         N_PARTITIONS = 9000
+    elif chemistry_description in ht_chem_names:
+        N_PARTITIONS = 160000
     elif chemistry_description in v4_chem_names:
         N_PARTITIONS = 80000 * num_probe_bcs if num_probe_bcs and num_probe_bcs > 1 else 160000
     else:

diff --git a/lib/python/cellranger/cell_calling_helpers.py b/lib/python/cellranger/cell_calling_helpers.py
@@ -960,7 +960,7 @@ def filter_cellular_barcodes_fixed_cutoff(bc_counts, cutoff: int):
     top_n = min(cutoff, nonzero_bcs)
     top_bc_idx = np.sort(np.argsort(bc_counts, kind=NP_SORT_KIND)[::-1][0:top_n])
     metrics = BarcodeFilterResults.init_with_constant_call(top_n)
-    metrics.filtered_bcs_cutoff = np.sort(bc_counts)[::-1][top_n]
+    metrics.filtered_bcs_cutoff = np.sort(bc_counts)[::-1][top_n - 1]
     return top_bc_idx, metrics, None
 
 

diff --git a/lib/python/cellranger/chemistry.py b/lib/python/cellranger/chemistry.py
@@ -20,6 +20,9 @@
     open(os.path.join(os.path.dirname(__file__), "chemistry_defs.json"))
 )
 
+# HT chemistries
+HT_CHEMISTRIES = [CHEMISTRY_DEFS["SC3Pv3HT"], CHEMISTRY_DEFS["SC5PHT"]]
+
 # LT v3 Chemistry
 CHEMISTRY_SC3P_LT = CHEMISTRY_DEFS["SC3Pv3LT"]
 

diff --git a/lib/python/cellranger/chemistry_defs.json b/lib/python/cellranger/chemistry_defs.json
@@ -234,37 +234,6 @@
             }
         ]
     },
-    "SC3Pv4HT": {
-        "barcode": [
-            {
-                "kind": "gel_bead",
-                "length": 16,
-                "offset": 0,
-                "read_type": "R1",
-                "whitelist": {
-                    "name": "3M-3pgex-may-2023"
-                }
-            }
-        ],
-        "description": "Single Cell 3' v4 HT",
-        "endedness": "three_prime",
-        "name": "SC3Pv4HT",
-        "rna": {
-            "length": null,
-            "offset": 0,
-            "read_type": "R2"
-        },
-        "rna2": null,
-        "strandedness": "+",
-        "umi": [
-            {
-                "length": 12,
-                "min_length": 10,
-                "offset": 16,
-                "read_type": "R1"
-            }
-        ]
-    },
     "SC3Pv3LT": {
         "barcode": [
             {
@@ -352,7 +321,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }
@@ -414,7 +383,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }
@@ -494,7 +463,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }
@@ -531,37 +500,6 @@
             }
         ]
     },
-    "SC5PHT-v3": {
-        "barcode": [
-            {
-                "kind": "gel_bead",
-                "length": 16,
-                "offset": 0,
-                "read_type": "R1",
-                "whitelist": {
-                    "name": "3M-5pgex-jan-2023"
-                }
-            }
-        ],
-        "description": "Single Cell 5' HT v3",
-        "endedness": "five_prime",
-        "name": "SC5PHT-v3",
-        "rna": {
-            "length": null,
-            "offset": 0,
-            "read_type": "R2"
-        },
-        "rna2": null,
-        "strandedness": "-",
-        "umi": [
-            {
-                "length": 12,
-                "min_length": null,
-                "offset": 16,
-                "read_type": "R1"
-            }
-        ]
-    },
     "SC5P-PE": {
         "barcode": [
             {
@@ -626,7 +564,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }
@@ -696,7 +634,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }
@@ -758,7 +696,7 @@
         "umi": [
             {
                 "length": 12,
-                "min_length": null,
+                "min_length": 10,
                 "offset": 16,
                 "read_type": "R1"
             }

diff --git a/lib/python/cellranger/feature/feature_assigner.py b/lib/python/cellranger/feature/feature_assigner.py
@@ -44,7 +44,6 @@
 UMI_NUM_TRIES = 10  # Number of initial points to try for GMM-fitting
 UMI_MIX_INIT_SD = 0.25  # Initial standard deviation for GMM components
 MIN_COUNTS_PER_ANTIBODY = 1000  # Filter out background antibodies
-# TODO: revise after background correction is implemented
 
 # Filtering feature assignments based on UMI thresholds and correlation with other tags
 COUNTS_DYNAMIC_RANGE = 50.0
@@ -210,10 +209,10 @@ def _calculate_n_let_diversity_probs(n_let: int, n_tags: int) -> np.ndarray:
     return probs
 
 
-def call_presence_with_gmm_ab(umi_counts: np.ndarray, *, umi_threshold: int = 1) -> np.ndarray:
+def call_presence_with_gmm_ab(umi_counts: np.ndarray, *, min_umi_threshold: int = 1) -> np.ndarray:
     """Given the UMI counts for a specific antibody, separate signal from background.
 
-    A cell must have at least `umi_threshold` UMIs for this feature to be considered positive.
+    A cell must have at least `min_umi_threshold` UMIs for this feature to be considered positive.
     """
     if np.max(umi_counts) == 0 or max(umi_counts.shape) < 2:
         # there are no UMIs, or only one UMI, each barcode has 0 count
@@ -227,7 +226,7 @@ def call_presence_with_gmm_ab(umi_counts: np.ndarray, *, umi_threshold: int = 1)
     positive_component = np.argmax(gmm.means_)
 
     # Classify each cell
-    return (umi_counts >= umi_threshold) & (gmm.predict(log_umi_counts) == positive_component)
+    return (umi_counts >= min_umi_threshold) & (gmm.predict(log_umi_counts) == positive_component)
 
 
 # This cannot use a namedtuple because those are immutable.
@@ -675,7 +674,13 @@ def create_feature_assignments_matrix(self) -> FeatureAssignmentsMatrix:
 class GuideAssigner(FeatureAssigner):
     """Sub-class of FeatureAssigner specific to CRISPR Library features."""
 
-    def __init__(self, matrix: cr_matrix.CountMatrix, feature_type: str):
+    def __init__(
+        self,
+        matrix: cr_matrix.CountMatrix,
+        *,
+        feature_type: str = rna_library.CRISPR_LIBRARY_TYPE,
+        min_crispr_umi_threshold: int,
+    ):
         super().__init__(
             matrix,
             feature_type,
@@ -687,6 +692,7 @@ def __init__(self, matrix: cr_matrix.CountMatrix, feature_type: str):
 
         self.feature_mol_name = "guide"
         self.feature_bc_name = "protospacer"
+        self.min_crispr_umi_threshold = min_crispr_umi_threshold
 
         self.method = "GMM"
         assert self.method in SUPPORTED_METHODS, f"Method {self.method} not supported"
@@ -711,22 +717,29 @@ def get_guide_assignments(self) -> dict[bytes, FeatureAssignments]:
                 log_transform=False, list_feature_ids=[feature_id]
             )
 
-            in_high_umi_component = GuideAssigner._call_presence(umi_counts, self.method)
+            in_high_umi_component = GuideAssigner._call_presence(
+                umi_counts, self.method, min_crispr_umi_threshold=self.min_crispr_umi_threshold
+            )
             assignments[feature_id] = FeatureAssignments(
                 np.flatnonzero(np.array(in_high_umi_component)), sum(umi_counts), False, None
             )
         return assignments
 
     @staticmethod
-    def _call_presence(counts: np.ndarray, method: str = "GMM") -> np.ndarray:
+    def _call_presence(
+        counts: np.ndarray,
+        method: str = "GMM",
+        *,
+        min_crispr_umi_threshold: int,
+    ) -> np.ndarray:
         """Classify each cell as positive/negative for a CRISPR feature using a GMM.
 
-        A cell must have at least 3 UMIs for this CRISPR feature to be considered positive.
-        This threshold is used to exclude CRISPR features with only background signal and
-        no foreground signal. Without this filter, a CRISPR feature with 0 or 1 UMIs in each cell
-        would call all the cells with one UMI as positive, which renders meaningless the metric
-        `Cells with one or more protospacers detected`. The threshold value was chosen by being
-        the smallest sufficient value on experimental data.
+        A cell must have at least `min_crispr_umi_threshold` UMIs for this CRISPR feature to be
+        considered positive. This threshold is used to exclude CRISPR features with only background
+        signal and no foreground signal. Without this filter, a CRISPR feature with 0 or 1 UMIs
+        in each cell would call all the cells with one UMI as positive, which renders meaningless
+        the metric `Cells with one or more protospacers detected`. The threshold value was chosen
+        by being the smallest sufficient value on experimental data.
 
         Args:
             counts: feature counts
@@ -736,7 +749,7 @@ def _call_presence(counts: np.ndarray, method: str = "GMM") -> np.ndarray:
             Booleans indicating whether feature is present above background
         """
         if method == "GMM":
-            return call_presence_with_gmm_ab(counts, umi_threshold=3)
+            return call_presence_with_gmm_ab(counts, min_umi_threshold=min_crispr_umi_threshold)
         raise ValueError(f"Method {method} is not supported")
 
     def create_guide_assignments_matrix(self) -> FeatureAssignmentsMatrix:

diff --git a/lib/python/cellranger/feature_ref.py b/lib/python/cellranger/feature_ref.py
@@ -20,7 +20,7 @@
 import cellranger.hdf5 as cr_h5
 from cellranger.feature.antigen.specificity import MHC_ALLELE, TARGETING_ANTIGEN
 from cellranger.rna.library import ANTIGEN_LIBRARY_TYPE
-from cellranger.targeted.targeted_constants import PROBE_ID_IGNORE_PREFIXES
+from cellranger.targeted.targeted_constants import EXCLUDED_PROBE_ID_PREFIXES
 
 FEATURE_TYPE = "feature_type"
 # Required HDF5 datasets
@@ -228,18 +228,18 @@ def __ne__(self, other):
 
     def get_feature_ids_excluding_deprecated_probes(self) -> list[bytes]:
         """Return the list of feature IDs excluding deprecated probes."""
-        return [f.id for f in self.feature_defs if not f.id.startswith(PROBE_ID_IGNORE_PREFIXES)]
+        return [f.id for f in self.feature_defs if not f.id.startswith(EXCLUDED_PROBE_ID_PREFIXES)]
 
     def has_deprecated_probes(self) -> bool:
         """Return true if there are deprecated probes in features."""
-        return any(f.id.startswith(PROBE_ID_IGNORE_PREFIXES) for f in self.feature_defs)
+        return any(f.id.startswith(EXCLUDED_PROBE_ID_PREFIXES) for f in self.feature_defs)
 
     def get_feature_types_excluding_deprecated_probes(self) -> list[str]:
         """Return the list of feature types excluding deprecated probes."""
         return [
             f.feature_type
             for f in self.feature_defs
-            if not f.id.startswith(PROBE_ID_IGNORE_PREFIXES)
+            if not f.id.startswith(EXCLUDED_PROBE_ID_PREFIXES)
         ]
 
     def get_antigen_control(self) -> tuple | None:

diff --git a/lib/python/cellranger/preflight.py b/lib/python/cellranger/preflight.py
@@ -354,7 +354,7 @@ class _VersionCmd(NamedTuple):
 _PACKAGE_VERSION_CMDS = [
     _VersionCmd(name="mro", cmd=["mro", "--version"]),
     _VersionCmd(name="mrp", cmd=["mrp", "--version"]),
-    _VersionCmd(name="Anaconda", cmd=["python", "--version"]),
+    _VersionCmd(name="python", cmd=["python", "--version"]),
     _VersionCmd(name="numpy", cmd=["python", "-c", "import numpy; print(numpy.__version__)"]),
     _VersionCmd(name="scipy", cmd=["python", "-c", "import scipy; print(scipy.__version__)"]),
     _VersionCmd(name="pysam", cmd=["python", "-c", "import pysam; print(pysam.__version__)"]),

diff --git a/lib/python/cellranger/targeted/targeted_constants.py b/lib/python/cellranger/targeted/targeted_constants.py
@@ -100,13 +100,12 @@ def get_targeting_method_from_metadata(cls, metadata):
 ALL_TARGETING_METHODS = list(TARGETING_METHOD_FILE_NAMES.keys())
 
 # List of gene/probe ID prefixes that are excluded from the filtered_feature_bc_matrix.
-PROBE_ID_IGNORE_PREFIXES = (
+# Ensure that the corresponding Python and Rust constants are identical.
+EXCLUDED_PROBE_ID_PREFIXES = (
     b"DEPRECATED",
     b"Hum-",
     b"IGNORE",
-    b"INTERGENIC",
-    b"IR",
-    b"NC",
-    b"VAR",
-    b"VDJ",
+    b"NC-",
+    b"VAR_",
+    b"VDJ_",
 )
diff --git a/lib/python/tenkit/preflight.py b/lib/python/tenkit/preflight.py
@@ -564,7 +564,7 @@ def fun():
 _PACKAGE_VERSION_CMDS: list[_VersionCmd] = [
     _VersionCmd(name="mro", cmd=_call(["mro", "--version"])),
     _VersionCmd(name="mrp", cmd=_call(["mrp", "--version"])),
-    _VersionCmd(name="Anaconda", cmd=_call(["python", "--version"])),
+    _VersionCmd(name="python", cmd=_call(["python", "--version"])),
     _VersionCmd(
         name="numpy", cmd=_call(["python", "-c", "import numpy; print(numpy.__version__)"])
     ),