openpipelines-bio · dorien-er · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,6 +38,8 @@
 
 * `integrate/scarches` and `workflows/annotate/scanvi_scarches`: Enable correction for technical variability by multiple continuous and categorical covariates.
 
+* Various components and workflows in `integrate`, `annotate`, `workflows/integration` and `workflows/annotation`: Make feature name sanitization (removal of Ensembl ID version numbers) optional through the new `--sanitize_ensembl_ids` argument (PR #1084).
+
 * `genetic_demux/scsplit`: bump python to `3.13` and unpin pandas and numpy (were pinned to `<2.0` and `<2` respectively) (PR #1096).
 
 ## BUG FIXES

diff --git a/src/annotate/celltypist/config.vsh.yaml b/src/annotate/celltypist/config.vsh.yaml
@@ -38,7 +38,11 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
-
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
   - name: Reference
     description: Arguments related to the reference dataset.
     arguments:
@@ -61,11 +65,6 @@ argument_groups:
         required: false
         description: |
           The name of the adata var column in the reference data containing gene names; when no gene_name_layer is provided, the var index will be used.
-      - name: "--reference_var_input"
-        type: string
-        required: false
-        description: |
-          .var column containing highly variable genes. By default, do not subset genes.
 
   - name: Model arguments
     description: Model arguments.

diff --git a/src/annotate/celltypist/script.py b/src/annotate/celltypist/script.py
@@ -18,7 +18,6 @@
     "reference_layer": "log_normalized",
     "input_reference_gene_overlap": 100,
     "reference_obs_target": "cell_ontology_class",
-    "reference_var_input": None,
     "feature_selection": True,
     "majority_voting": True,
     "output_compression": "gzip",
@@ -38,7 +37,6 @@
 from setup_logger import setup_logger
 from cross_check_genes import cross_check_genes
 from set_var_index import set_var_index
-from subset_vars import subset_vars
 
 logger = setup_logger()
 use_gpu = cuda_is_available()
@@ -59,7 +57,9 @@ def main(par):
 
     # Provide correct format of query data for celltypist annotation
     ## Sanitize gene names and set as index
-    input_modality = set_var_index(input_modality, par["input_var_gene_names"])
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
     ## Fetch lognormalized counts
     lognorm_counts = (
         input_modality.layers[par["input_layer"]].copy()
@@ -83,16 +83,12 @@ def main(par):
     elif par["reference"]:
         reference_modality = mu.read_h5mu(par["reference"]).mod[par["modality"]]
 
-        # subset to HVG if required
-        if par["reference_var_input"]:
-            reference_modality = subset_vars(
-                reference_modality, par["reference_var_input"]
-            )
-
         # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
         # CellTypist requires query gene names to be in index
         reference_modality = set_var_index(
-            reference_modality, par["reference_var_gene_names"]
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
         )
 
         # Ensure enough overlap between genes in query and reference

diff --git a/src/annotate/onclass/config.vsh.yaml b/src/annotate/onclass/config.vsh.yaml
@@ -39,6 +39,10 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
 
   - name: Ontology
     description: Ontology input files

diff --git a/src/annotate/onclass/script.py b/src/annotate/onclass/script.py
@@ -123,7 +123,9 @@ def main():
     input_modality = input_adata.copy()
 
     # Set var names to the desired gene name format (gene symbol, ensembl id, etc.)
-    input_modality = set_var_index(input_modality, par["input_var_gene_names"])
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
     input_matrix = (
         input_modality.layers[par["input_layer"]]
         if par["input_layer"]
@@ -156,7 +158,9 @@ def main():
         reference_mudata = mu.read_h5mu(par["reference"])
         reference_modality = reference_mudata.mod[par["modality"]].copy()
         reference_modality = set_var_index(
-            reference_modality, par["reference_var_gene_names"]
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
         )
 
         # subset to HVG if required

diff --git a/src/annotate/onclass/test.py b/src/annotate/onclass/test.py
@@ -29,12 +29,12 @@ def test_simple_execution(run_component, random_h5mu_path):
         [
             "--input",
             input_file,
-            "--input_var_gene_names",
-            "gene_symbol",
             "--reference",
             reference_file,
             "--reference_obs_target",
             "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
             "--cl_nlp_emb_file",
             cl_nlp_emb_file,
             "--cl_ontology_file",
@@ -70,12 +70,12 @@ def test_custom_obs(run_component, random_h5mu_path):
         [
             "--input",
             input_file,
-            "--input_var_gene_names",
-            "gene_symbol",
             "--reference",
             reference_file,
             "--reference_obs_target",
             "cell_ontology_class",
+            "--reference_var_gene_names",
+            "ensemblid",
             "--output_obs_predictions",
             "dummy_pred_1",
             "--output_obs_probability",
@@ -116,8 +116,6 @@ def test_no_model_no_reference_error(run_component, random_h5mu_path):
             [
                 "--input",
                 input_file,
-                "--input_var_gene_names",
-                "gene_symbol",
                 "--output",
                 output_file,
                 "--cl_nlp_emb_file",
@@ -128,6 +126,8 @@ def test_no_model_no_reference_error(run_component, random_h5mu_path):
                 cl_obo_file,
                 "--reference_obs_target",
                 "cell_ontology_class",
+                "--reference_var_gene_names",
+                "ensemblid",
             ]
         )
     assert re.search(
@@ -145,6 +145,8 @@ def test_pretrained_model(run_component, random_h5mu_path):
             input_file,
             "--input_var_gene_names",
             "gene_symbol",
+            "--sanitize_ensembl_ids",
+            "False",
             "--cl_nlp_emb_file",
             cl_nlp_emb_file,
             "--cl_ontology_file",

diff --git a/src/annotate/random_forest_annotation/config.vsh.yaml b/src/annotate/random_forest_annotation/config.vsh.yaml
@@ -35,6 +35,10 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
 
   - name: Reference
     description: Arguments related to the reference dataset.

diff --git a/src/annotate/random_forest_annotation/script.py b/src/annotate/random_forest_annotation/script.py
@@ -47,7 +47,9 @@ def main():
     input_mudata = mu.read_h5mu(par["input"])
     input_adata = input_mudata.mod[par["modality"]]
     input_modality = input_adata.copy()
-    input_modality = set_var_index(input_modality, par["input_var_gene_names"])
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
 
     # Handle max_features parameter
     max_features_conversion = {
@@ -100,7 +102,9 @@ def main():
         reference_mudata = mu.read_h5mu(par["reference"])
         reference_modality = reference_mudata.mod[par["modality"]].copy()
         reference_modality = set_var_index(
-            reference_modality, par["reference_var_gene_names"]
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
         )
 
         # subset to HVG if required

diff --git a/src/annotate/scanvi/config.vsh.yaml b/src/annotate/scanvi/config.vsh.yaml
@@ -48,7 +48,11 @@ argument_groups:
         default: "Unknown"
         description: | 
           Value in the --obs_labels field that indicates unlabeled observations
-
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
+
   - name: scVI Model
     arguments:
       - name: "--scvi_model"

diff --git a/src/annotate/scanvi/script.py b/src/annotate/scanvi/script.py
@@ -50,7 +50,9 @@ def main():
         adata_subset = adata.copy()
 
     # Sanitize gene names and set as index of the AnnData object
-    adata_subset = set_var_index(adata_subset, par["var_gene_names"])
+    adata_subset = set_var_index(
+        adata_subset, par["var_gene_names"], par["sanitize_ensembl_ids"]
+    )
 
     logger.info(f"Loading pre-trained scVI model from {par['scvi_model']}")
     scvi_model = scvi.model.SCVI.load(

diff --git a/src/annotate/singler/config.vsh.yaml b/src/annotate/singler/config.vsh.yaml
@@ -121,7 +121,10 @@ argument_groups:
           If set to True, an additional output .obs field `--output_obs_pruned_predictions` will be added to the `--output`,
           containing labels where 'low-quality' labels are replaced with NA's.
           Labels are considered 'low-quality' when their delta score (stored in `--output_obs_delta_next`) fall more than 3 median absolute deviations below the median for that label type.
-
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
   - name: Outputs
     description: Output arguments.
     arguments:

diff --git a/src/annotate/singler/script.R b/src/annotate/singler/script.R
@@ -20,7 +20,7 @@ par <- list(
   output = "singler_output.h5mu",
   output_compression = "gzip",
   output_obs_predictions = "singler_labels",
-  output_obs_probability = "singlr_proba",
+  output_obs_probability = "singler_proba",
   output_obsm_scores = "single_r_scores",
   output_obs_delta_next = "singler_delta_next",
   output_obs_pruned_predictions = "singler_pruned_labels",
@@ -61,21 +61,31 @@ get_layer <- function(adata, layer, var_gene_names) {
   }
 
   # Set matrix dimnames
-  input_gene_names <- sanitize_gene_names(adata, var_gene_names)
+  input_gene_names <- sanitize_ensembl_ids(adata, var_gene_names)
   dimnames(data) <- list(adata$obs_names, input_gene_names)
 
   # return output
   data
 }
 
-sanitize_gene_names <- function(adata, gene_symbol = NULL) {
+sanitize_ensembl_ids <- function(adata, gene_symbol = NULL) {
   if (is.null(gene_symbol)) {
     gene_names <- adata$var_names
   } else {
     gene_names <- adata$var[[gene_symbol]]
   }
-  # Remove version numbers (dot followed by digits at end of string)
-  sanitized <- gsub("\\.[0-9]+$", "", gene_names)
+
+  # Pattern matches Ensembl IDs: starts with ENS, followed by any characters,
+  # then an eleven digit number, optionally followed by .version_number
+  ensembl_pattern <- "^(ENS.*\\d{11})(?:\\.\\d+)?$"
+
+  # Remove version numbers for ensembl ids only
+  sanitized <- ifelse(
+    grepl(ensembl_pattern, gene_names, perl = TRUE),
+    gsub(ensembl_pattern, "\\1", gene_names, perl = TRUE),
+    as.character(gene_names)
+  )
+
   sanitized
 }
 

diff --git a/src/annotate/singler/test.py b/src/annotate/singler/test.py
@@ -79,14 +79,14 @@ def test_params(run_component, random_h5mu_path):
         [
             "--input",
             input_file,
-            "--input_var_gene_names",
-            "gene_symbol",
             "--reference",
             reference_file,
             "--reference_obs_target",
             "cell_ontology_class",
             "--input_reference_gene_overlap",
             "1000",
+            "--reference_var_gene_names",
+            "ensemblid",
             "--reference_var_input",
             "highly_variable",
             "de_n_genes",

diff --git a/src/annotate/svm_annotation/config.vsh.yaml b/src/annotate/svm_annotation/config.vsh.yaml
@@ -35,6 +35,10 @@ argument_groups:
         min: 1
         description: | 
           The minimum number of genes present in both the reference and query datasets.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
 
   - name: Reference
     description: Arguments related to the reference dataset.

diff --git a/src/annotate/svm_annotation/script.py b/src/annotate/svm_annotation/script.py
@@ -51,7 +51,9 @@ def main():
     input_mudata = mu.read_h5mu(par["input"])
     input_adata = input_mudata.mod[par["modality"]]
     input_modality = input_adata.copy()
-    input_modality = set_var_index(input_modality, par["input_var_gene_names"])
+    input_modality = set_var_index(
+        input_modality, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
 
     if par["model"]:
         logger.info("Loading a pre-trained model")
@@ -82,7 +84,9 @@ def main():
         reference_mudata = mu.read_h5mu(par["reference"])
         reference_modality = reference_mudata.mod[par["modality"]].copy()
         reference_modality = set_var_index(
-            reference_modality, par["reference_var_gene_names"]
+            reference_modality,
+            par["reference_var_gene_names"],
+            par["sanitize_ensembl_ids"],
         )
 
         # subset to HVG if required

diff --git a/src/integrate/scarches/config.vsh.yaml b/src/integrate/scarches/config.vsh.yaml
@@ -66,6 +66,10 @@ argument_groups:
           (i.e., the model tries to minimize their effects on the latent space). Thus, these should not be
           used for biologically-relevant factors that you do _not_ want to correct for.
           Important: the order of the continuous covariates matters and should match the order of the covariates in the trained reference model.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
 
   - name: Reference
     arguments:

diff --git a/src/integrate/scarches/script.py b/src/integrate/scarches/script.py
@@ -133,7 +133,9 @@ def _align_query_with_registry(adata_query, model_path):
 
     # Sanitize gene names and set as index of the AnnData object
     # all scArches VAE models expect gene names to be in the .var index
-    adata_query = set_var_index(adata_query, par["input_var_gene_names"])
+    adata_query = set_var_index(
+        adata_query, par["input_var_gene_names"], par["sanitize_ensembl_ids"]
+    )
 
     # align layer
     query_layer = (

diff --git a/src/integrate/scvi/config.vsh.yaml b/src/integrate/scvi/config.vsh.yaml
@@ -73,6 +73,10 @@ argument_groups:
           addition to the batch covariate and are also treated as nuisance factors
           (i.e., the model tries to minimize their effects on the latent space). Thus, these should not be
           used for biologically-relevant factors that you do _not_ want to correct for.
+      - name: "--sanitize_ensembl_ids"
+        type: boolean
+        description: Whether to sanitize ensembl ids by removing version numbers.
+        default: true
   - name: Outputs
     arguments:
       - name: "--output"

diff --git a/src/integrate/scvi/script.py b/src/integrate/scvi/script.py
@@ -84,7 +84,9 @@ def main():
         adata_subset = adata.copy()
 
     # Sanitize gene names and set as index of the AnnData object
-    adata_subset = set_var_index(adata_subset, par["var_gene_names"])
+    adata_subset = set_var_index(
+        adata_subset, par["var_gene_names"], par["sanitize_ensembl_ids"]
+    )
 
     check_validity_anndata(
         adata_subset,