k2-fsa · danpovey · Feb 24, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 11, 2026
diff --git a/egs/librispeech/ASR/RESULTS.md b/egs/librispeech/ASR/RESULTS.md
@@ -1,5 +1,30 @@
 ## Results
 
+### zapformer (zapformer + pruned-transducer w/ CTC)
+
+Note: --num-real-epochs 40 takes about the same time as 20 epochs of the zipformer CR-CTC recipe
+(each epoch is really 3 epochs due to speed-perturb), so the training time will be roughly 40%
+of that of the old zipformer recipe.  The "--epoch 13" reported below is the last epoch; the smaller
+number of epochs has to do with --min-copies and --max-copies, and we will add this to the
+report later (later epochs take more real computation time because they make different SpecAug
+copies of the data).
+
+    # (non-streaming)
+    ./zapformer/train.py --world-size 4 \
+      --min-copies 1 --max-copies 8 --num-real-epochs 40 \
+      --base-lr=0.023 --batches-per-epoch 2400 --start-epoch 1 --use-fp16 1 \
+      --exp-dir zapformer/exp \
+      --use-ctc 1 --use-transducer 1 \
+      --base-dim 64 --ctc-loss-scale 0.2 \
+      --full-libri 1 --max-duration 1200 --master-port 43039
+
+| decoding method                      | test-clean | test-other | comment             |
+|--------------------------------------|------------|------------|---------------------|
+| greedy_search                        | 1.81       | 3.73       | --epoch 13 --avg 3  |
+
+Note on other results: dev-clean=1.73, dev-other=3.55, giga test=16.69, giga dev=1.733 (i.e. with the model trained on Libri only).
+
+
 ### zipformer (zipformer + pruned-transducer w/ CR-CTC)
 
 See <https://github.com/k2-fsa/icefall/pull/1766> for more details.

diff --git a/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py b/egs/librispeech/ASR/tdnn_lstm_ctc/asr_datamodule.py
@@ -18,13 +18,21 @@
 
 import argparse
 import inspect
+import glob
 import logging
+import re
+
 from functools import lru_cache
 from pathlib import Path
 from typing import Any, Dict, Optional
 
+import numpy as np  # to set its random seed
+
 import torch
+import lhotse
+
 from lhotse import CutSet, Fbank, FbankConfig, load_manifest, load_manifest_lazy
+
 from lhotse.dataset import (  # noqa F401 for PrecomputedFeatures
     CutConcatenate,
     CutMix,
@@ -497,3 +505,107 @@ def gigaspeech_dev_cuts(self) -> CutSet:
     def gigaspeech_test_cuts(self) -> CutSet:
         logging.info("About to get Gigaspeech test cuts")
         return load_manifest_lazy(self.args.manifest_dir / "cuts_TEST.jsonl.gz")
+
+
+class GigaSpeech:
+    def __init__(self, manifest_dir: str):
+        """
+        Args:
+          manifest_dir:
+            Directory holding the manifests that the methods of this class read:
+
+                - gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz
+                - gigaspeech_cuts_XL.jsonl.gz
+                - gigaspeech_cuts_L.jsonl.gz
+                - gigaspeech_cuts_M.jsonl.gz
+                - gigaspeech_cuts_S.jsonl.gz
+                - gigaspeech_cuts_XS.jsonl.gz
+                - gigaspeech_cuts_DEV.jsonl.gz
+                - gigaspeech_cuts_TEST.jsonl.gz
+        """
+        self.manifest_dir = Path(manifest_dir)
+
+    def train_XL_cuts_split(self) -> CutSet:
+        """Combine all XL split manifests, ordered by their numeric split index."""
+        logging.info("About to get train-XL cuts")
+        filenames = glob.glob(
+            f"{self.manifest_dir}/gigaspeech_XL_split_2000/gigaspeech_cuts_XL.*.jsonl.gz"  # noqa
+        )
+
+        # The dots are escaped so that they match only a literal "."; a globbed
+        # file without a purely numeric index is skipped instead of raising.
+        pattern = re.compile(r"gigaspeech_cuts_XL\.([0-9]+)\.jsonl\.gz")
+        matches = [(pattern.search(f), f) for f in filenames]
+        idx_filenames = [(int(m.group(1)), f) for m, f in matches if m]
+        idx_filenames.sort(key=lambda x: x[0])
+        sorted_filenames = [f for _, f in idx_filenames]
+        logging.info(f"Loading {len(sorted_filenames)} splits")
+        return lhotse.combine(lhotse.load_manifest_lazy(p) for p in sorted_filenames)
+
+    # Each method below opens exactly one manifest file under ``manifest_dir``.
+    def train_XL_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_XL.jsonl.gz"
+        logging.info(f"About to get train-XL cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_L_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_L.jsonl.gz"
+        logging.info(f"About to get train-L cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_M_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_M.jsonl.gz"
+        logging.info(f"About to get train-M cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_S_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_S.jsonl.gz"
+        logging.info(f"About to get train-S cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def train_XS_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_XS.jsonl.gz"
+        logging.info(f"About to get train-XS cuts from {f}")
+        return CutSet.from_jsonl_lazy(f)
+
+    def test_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_TEST.jsonl.gz"
+        logging.info(f"About to get TEST cuts from {f}")
+        return load_manifest_lazy(f)
+
+    def dev_cuts(self) -> CutSet:
+        f = self.manifest_dir / "gigaspeech_cuts_DEV.jsonl.gz"
+        logging.info(f"About to get DEV cuts from {f}")
+        return load_manifest_lazy(f)
+
+
+class CommonVoice:
+    def __init__(self, manifest_dir: str):
+        """
+        Args:
+          manifest_dir:
+            Directory in which the methods of this class look for these files:
+
+                - cv22-en_cuts_train.jsonl.gz
+                - cv22-en_cuts_dev.jsonl.gz
+                - cv22-en_cuts_test.jsonl.gz
+        """
+        self.manifest_dir = Path(manifest_dir)
+
+    def train_cuts(self) -> CutSet:
+        """Log the request, then load the train manifest with load_manifest_lazy."""
+        train_manifest = self.manifest_dir / "cv22-en_cuts_train.jsonl.gz"
+        logging.info("CommonVoice: About to get train cuts")
+        return load_manifest_lazy(train_manifest)
+
+    def dev_cuts(self) -> CutSet:
+        """Log the request, then load the dev manifest with load_manifest_lazy."""
+        dev_manifest = self.manifest_dir / "cv22-en_cuts_dev.jsonl.gz"
+        logging.info("CommonVoice: About to get dev cuts")
+        return load_manifest_lazy(dev_manifest)
+
+    def test_cuts(self) -> CutSet:
+        """Log the request, then load the test manifest with load_manifest_lazy."""
+        test_manifest = self.manifest_dir / "cv22-en_cuts_test.jsonl.gz"
+        logging.info("CommonVoice: About to get test cuts")
+        return load_manifest_lazy(test_manifest)
diff --git a/egs/librispeech/ASR/zapformer/.gitignore b/egs/librispeech/ASR/zapformer/.gitignore
@@ -0,0 +1 @@
+swoosh.pdf