k2-fsa · yfyeung · Jun 21, 2026 · Jun 21, 2026 · Jun 21, 2026 · coderabbitai
diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py
@@ -16,12 +16,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import argparse
 import logging
 from pathlib import Path
 
 import torch
 from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 
+from icefall.utils import str2bool
+
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
@@ -30,7 +33,22 @@
 torch.set_num_interop_threads(1)
 
 
-def compute_fbank_gigaspeech():
+def get_args():
+    """Parse this script's command-line options (currently only --on-the-fly)."""
+    help_text = (
+        "When True, do not compute and store fbank features; only "
+        "produce the trimmed cut manifests so that features are extracted "
+        "on-the-fly during training."
+    )
+    help_formatter = argparse.ArgumentDefaultsHelpFormatter
+    arg_parser = argparse.ArgumentParser(formatter_class=help_formatter)
+    arg_parser.add_argument(
+        "--on-the-fly", type=str2bool, default=True, help=help_text
+    )
+    return arg_parser.parse_args()
+
+
+def compute_fbank_gigaspeech(on_the_fly: bool = True):
     in_out_dir = Path("data/fbank")
 
     # number of workers in dataloader
@@ -51,7 +69,11 @@ def compute_fbank_gigaspeech():
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
-    extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
+
+    # on-the-fly mode does not need the extractor (and kaldifeat may be absent)
+    extractor = None
+    if not on_the_fly:
+        extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
 
     logging.info(f"device: {device}")
 
@@ -66,15 +88,21 @@ def compute_fbank_gigaspeech():
         logging.info(f"Loading {raw_cuts_path}")
         cut_set = CutSet.from_file(raw_cuts_path)
 
-        logging.info("Computing features")
+        if on_the_fly:
+            logging.info(
+                "on-the-fly is enabled - skipping feature extraction, "
+                "only saving the trimmed cut manifest"
+            )
+        else:
+            logging.info("Computing features")
+            cut_set = cut_set.compute_and_store_features_batch(
+                extractor=extractor,
+                storage_path=f"{in_out_dir}/gigaspeech_feats_{partition}",
+                num_workers=num_workers,
+                batch_duration=batch_duration,
+                overwrite=True,
+            )
 
-        cut_set = cut_set.compute_and_store_features_batch(
-            extractor=extractor,
-            storage_path=f"{in_out_dir}/gigaspeech_feats_{partition}",
-            num_workers=num_workers,
-            batch_duration=batch_duration,
-            overwrite=True,
-        )
         cut_set = cut_set.trim_to_supervisions(
             keep_overlapping=False, min_duration=None
         )
@@ -88,7 +116,8 @@ def main():
     formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
     logging.basicConfig(format=formatter, level=logging.INFO)
 
-    compute_fbank_gigaspeech()
+    args = get_args()
+    compute_fbank_gigaspeech(on_the_fly=args.on_the_fly)
 
 
 if __name__ == "__main__":

diff --git a/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py b/egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
@@ -19,11 +19,14 @@
 import argparse
 import logging
 import os
+from concurrent.futures import ProcessPoolExecutor
 from pathlib import Path
 
 import torch
 from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig
 
+from icefall.utils import str2bool
+
 # Torch's multithreaded behavior needs to be disabled or
 # it wastes a lot of CPU and slow things down.
 # Do this outside of main() in case it needs to take effect
@@ -71,9 +74,36 @@ def get_args():
         default=-1,
         help="Stop processing pieces until this number (exclusive).",
     )
+
+    parser.add_argument(
+        "--on-the-fly",
+        type=str2bool,
+        default=True,
+        help="When True, do not compute and store fbank features; only "
+        "produce the trimmed cut manifests so that features are extracted "
+        "on-the-fly during training.",
+    )
     return parser.parse_args()
 
 
+def trim_split(idx: str, output_dir: Path) -> None:
+    """Trim one raw split to supervisions and save it (no feature extraction).
+
+    The trimmed manifest is first written to a temporary file and renamed
+    into place only once it is complete. ``cuts_path`` doubles as the
+    "already done" marker checked below, so an interrupted run must never
+    leave a truncated manifest under that name: later runs would skip it.
+
+    Args:
+      idx:
+        Zero-padded index of the split. It selects
+        ``gigaspeech_cuts_XL_raw.{idx}.jsonl.gz`` as the input and
+        ``gigaspeech_cuts_XL.{idx}.jsonl.gz`` as the output.
+      output_dir:
+        Directory that holds the raw split manifests and receives the
+        trimmed ones.
+    """
+    cuts_path = output_dir / f"gigaspeech_cuts_XL.{idx}.jsonl.gz"
+    if cuts_path.is_file():
+        logging.info(f"{cuts_path} exists - skipping")
+        return
+
+    raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{idx}.jsonl.gz"
+    if not raw_cuts_path.is_file():
+        logging.info(f"{raw_cuts_path} does not exist - skipping it")
+        return
+
+    cut_set = CutSet.from_file(raw_cuts_path)
+    cut_set = cut_set.trim_to_supervisions(keep_overlapping=False, min_duration=None)
+
+    # The temporary name keeps the ".jsonl.gz" suffix (lhotse picks the
+    # serializer from the file extension) but uses a different prefix so it
+    # is never mistaken for a finished "gigaspeech_cuts_XL.*" manifest.
+    tmp_path = output_dir / f"tmp_gigaspeech_cuts_XL.{idx}.jsonl.gz"
+    try:
+        cut_set.to_file(tmp_path)
+        # Source and target share a directory, so on POSIX this rename is
+        # atomic: readers see either no manifest or a complete one.
+        tmp_path.replace(cuts_path)
+    except BaseException:
+        # Do not leave partial output behind, not even on Ctrl-C.
+        if tmp_path.exists():
+            tmp_path.unlink()
+        raise
+    logging.info(f"Saved to {cuts_path}")
+
+
 def compute_fbank_gigaspeech_splits(args):
     num_splits = args.num_splits
     output_dir = "data/fbank/gigaspeech_XL_split"
@@ -87,13 +117,29 @@ def compute_fbank_gigaspeech_splits(args):
 
     stop = min(stop, num_splits)
 
+    num_digits = 8  # num_digits is fixed by lhotse split-lazy
+
+    if args.on_the_fly:
+        # on-the-fly does not compute features, so the per-split work is pure
+        # CPU and independent -- fan it out across processes instead of the
+        # serial loop the GPU path needs.
+        logging.info(
+            f"on-the-fly is enabled - trimming splits in parallel "
+            f"with {args.num_workers} workers"
+        )
+        idxs = [f"{i}".zfill(num_digits) for i in range(start, stop)]
+        with ProcessPoolExecutor(max_workers=args.num_workers) as ex:
+            futures = [ex.submit(trim_split, idx, output_dir) for idx in idxs]
+            for f in futures:
+                f.result()
+        return
+
     device = torch.device("cpu")
     if torch.cuda.is_available():
         device = torch.device("cuda", 0)
     extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
     logging.info(f"device: {device}")
 
-    num_digits = 8  # num_digits is fixed by lhotse split-lazy
     for i in range(start, stop):
         idx = f"{i}".zfill(num_digits)
         logging.info(f"Processing {idx}/{num_splits}")

diff --git a/egs/gigaspeech/ASR/prepare.sh b/egs/gigaspeech/ASR/prepare.sh
@@ -15,6 +15,13 @@ stop_stage=8
 start=0
 stop=-1  # -1 means until the end
 
+# If true, skip storing GigaSpeech fbank features; only produce the (trimmed)
+# cut manifests for on-the-fly feature extraction during training.
+on_the_fly=true
+
+# If false (default), skip all musan steps (download, manifest, fbank).
+use_musan=false
+
 # Note: This script just prepares the minimal requirements needed by a
 # transducer training with bpe units.
 #
@@ -138,7 +145,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
   #
   #   ln -svf /path/to/musan $dl_dir/
   #
-  if [ ! -d $dl_dir/musan ]; then
+  if [ $use_musan == true ] && [ ! -d $dl_dir/musan ]; then
     lhotse download musan $dl_dir
   fi
 fi
@@ -156,25 +163,25 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
     $dl_dir/GigaSpeech data/manifests
 fi
 
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+# musan is optional: this stage runs only when use_musan=true.
+if [ "$use_musan" = true ] && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
   log "Stage 2: Prepare musan manifest"
   # We assume that you have downloaded the musan corpus
   # to $dl_dir/musan
   mkdir -p data/manifests
   lhotse prepare musan $dl_dir/musan data/manifests
 fi
 
 if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
-  log "State 3: Preprocess GigaSpeech manifest"
+  log "Stage 3: Preprocess GigaSpeech manifest"
+  # data/fbank/.preprocess_complete is a marker file: it is touched after the
+  # preprocessing command has run, and the step is skipped once it exists.
   if [ ! -f data/fbank/.preprocess_complete ]; then
     python3 ./local/preprocess_gigaspeech.py
     touch data/fbank/.preprocess_complete
   fi
 fi
 
 if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
-  log "Stage 4: Compute features for DEV, TEST, L, M, S, and XS subsets of GigaSpeech."
-  python3 ./local/compute_fbank_gigaspeech.py
+  log "Stage 4: Compute features for DEV and TEST subsets of GigaSpeech."
+  # With on_the_fly=true the script stores no fbank features and only
+  # writes the trimmed cut manifests.
+  python3 ./local/compute_fbank_gigaspeech.py --on-the-fly $on_the_fly
 fi
 
 if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
@@ -196,13 +203,14 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
     --batch-duration 600 \
     --num-splits $num_splits \
     --start $start \
-    --stop $stop
+    --stop $stop \
+    --on-the-fly $on_the_fly
 fi
 
-if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+# musan is optional: this stage runs only when use_musan=true.
+if [ "$use_musan" = true ] && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
   log "Stage 7: Compute fbank for musan"
   mkdir -p data/fbank
   ./local/compute_fbank_musan.py
 fi
 
 if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then

diff --git a/egs/gigaspeech/ASR/zipformer/asr_datamodule.py b/egs/gigaspeech/ASR/zipformer/asr_datamodule.py
@@ -134,7 +134,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
         group.add_argument(
             "--on-the-fly-feats",
             type=str2bool,
-            default=False,
+            default=True,
             help="When enabled, use on-the-fly cut mixing and feature "
             "extraction. Will drop existing precomputed feature manifests "
             "if available.",
@@ -164,11 +164,19 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
         group.add_argument(
             "--num-workers",
             type=int,
-            default=2,
+            default=8,
             help="The number of training dataloader workers that "
             "collect the batches.",
         )
 
+        group.add_argument(
+            "--prefetch-factor",
+            type=int,
+            default=8,
+            help="Number of batches each worker prefetches in advance. "
+            "Ignored when --num-workers is 0.",
+        )
+
         group.add_argument(
             "--enable-spec-aug",
             type=str2bool,
@@ -189,7 +197,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
         group.add_argument(
             "--enable-musan",
             type=str2bool,
-            default=True,
+            default=False,
             help="When enabled, select noise from MUSAN and mix it"
             "with training dataset. ",
         )
@@ -343,8 +351,12 @@ def train_dataloaders(
             sampler=train_sampler,
             batch_size=None,
             num_workers=self.args.num_workers,
-            persistent_workers=False,
+            persistent_workers=self.args.num_workers > 0,
             worker_init_fn=worker_init_fn,
+            pin_memory=True,
+            prefetch_factor=self.args.prefetch_factor
+            if self.args.num_workers > 0
+            else None,
         )
 
         return train_dl
@@ -389,8 +401,11 @@ def valid_dataloaders(
             validate,
             sampler=valid_sampler,
             batch_size=None,
-            num_workers=2,
+            num_workers=self.args.num_workers,
             persistent_workers=False,
+            prefetch_factor=self.args.prefetch_factor
+            if self.args.num_workers > 0
+            else None,
         )
 
         return valid_dl