openproblems-bio · mumichae · Aug 5, 2025 · Aug 5, 2025 · Aug 12, 2025 · Aug 28, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 * Added `metrics/kbet_pg` and `metrics/kbet_pg_label` components (PR #52).
 
+* Added `metrics/ksim` component (PR #75).
+
 ## Minor changes
 
 * Un-pin the scPRINT version and update parameters (PR #51)

diff --git a/src/metrics/ksim/config.vsh.yaml b/src/metrics/ksim/config.vsh.yaml
@@ -0,0 +1,63 @@
+__merge__: ../../api/comp_metric.yaml
+name: ksim
+info:
+  metrics:
+
+    - name: ksim
+      label: kSIM
+      summary: "The kSIM acceptance rate evaluates whether cells of the same known cell type remain grouped together in their neighborhoods after batch correction."
+      description: |
+        The kSIM acceptance rate uses prior knowledge of cell type labels to assess local neighborhood consistency. For each cell, we look at its nearest neighbors (including itself) and check how many share the same cell type. 
+        A cell is accepted if the fraction of its neighbors that share its cell type is at least `min_rate` (0.9 by default). The acceptance rate is the overall fraction of accepted cells in the dataset.
+        A high kSIM value means that cells of the same type remain locally clustered after correction, while a low value suggests that the correction has disrupted true biological structure—for example, by overcorrecting batch effects. 
+      references:
+        doi: 
+          - 10.1038/s41592-020-0905-x
+      links:
+        documentation: https://pegasus.readthedocs.io/en/stable/api/pegasus.calc_kSIM.html#pegasus.calc_kSIM
+        repository: https://github.com/lilab-bcb/pegasus
+      min: 0
+      max: 1
+      maximize: true
+
+arguments:
+  - name: "--K"
+    type: "integer"
+    default: 24
+    description: The number of nearest neighbors to be considered.
+  - name: "--min_rate"
+    type: "double"
+    default: 0.9
+    description: Acceptance rate threshold. A cell is accepted if its kSIM rate is larger than or equal to min_rate.
+  - name: "--n_jobs"
+    type: "integer"
+    default: -1
+    description: Number of threads used. If -1, use all physical CPU cores.
+  - name: "--random_state"
+    type: "integer"
+    default: 0
+    description: Random seed set for reproducing results.
+  - name: "--use_cache"
+    type: "boolean"
+    default: True
+    description: Whether to reuse cached kNN results.
+
+resources:
+  - type: python_script
+    path: script.py
+  - path: /src/utils/read_anndata_partial.py
+
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+        - type: python
+          pypi:
+          - pegasuspy==1.10.2
+
+
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime,midmem,midcpu]
diff --git a/src/metrics/ksim/script.py b/src/metrics/ksim/script.py
@@ -0,0 +1,46 @@
+import anndata as ad
+import sys
+import pegasus as pg
+import pegasusio
+from scipy.sparse import csr_matrix
+
+
+# Metric script: computes the kSIM acceptance rate of the integrated embedding.
+# NOTE(review): `par` and `meta` are not defined in this file; presumably injected by the viash runner -- confirm.
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print('Reading input files', flush=True)
+adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')
+# obs is replaced by the solution's obs; the solution's uns entries are merged in (overriding on key clashes).
+adata.obs = read_anndata(par['input_solution'], obs='obs').obs
+adata.uns |= read_anndata(par['input_solution'], uns='uns').uns
+print(adata)
+
+print('Convert to pegasusio.MultimodalData...', flush=True)
+# X is set to an empty sparse matrix of matching shape; presumably only obs/obsm are needed by kSIM.
+adata.X = csr_matrix(adata.shape)
+mmdata = pegasusio.MultimodalData(adata)
+
+print('Compute metrics', flush=True)
+# calc_kSIM returns (mean kSIM rate, acceptance rate); only the acceptance rate is reported as the metric.
+_, score = pg.calc_kSIM(
+    mmdata, attr='cell_type', rep='emb',
+    K=par["K"], min_rate=par["min_rate"],
+    n_jobs=par["n_jobs"],
+    random_state=par["random_state"],
+    use_cache=par["use_cache"],
+)
+print("score:", score)
+
+print('Create output AnnData object', flush=True)
+output = ad.AnnData(uns={
+    'dataset_id': adata.uns['dataset_id'],
+    'normalization_id': adata.uns['normalization_id'],
+    'method_id': adata.uns['method_id'],
+    'metric_ids': [ meta['name'] ],
+    'metric_values': [ score ]
+})
+
+print("Write output AnnData to file", flush=True)
+output.write_h5ad(par['output'], compression='gzip')