k2-fsa · KarelVesely84 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
diff --git a/.github/scripts/docker/Dockerfile b/.github/scripts/docker/Dockerfile
@@ -44,6 +44,7 @@ RUN pip install --no-cache-dir \
       espnet_tts_frontend \
       graphviz \
       kaldi-decoder \
+      kaldi_native_fbank \
       kaldi_native_io \
       kaldialign \
       kaldifst \
@@ -61,6 +62,7 @@ RUN pip install --no-cache-dir \
       piper_phonemize -f https://k2-fsa.github.io/icefall/piper_phonemize.html \
       pypinyin==0.50.0 \
       pytest \
+      rknn_toolkit2 \
       sentencepiece>=0.1.96 \
       six \
       tensorboard \

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -102,6 +102,7 @@ jobs:
               cd ../transducer_lstm
               pytest -v -s
 
+              pip install kaldi_native_fbank rknn_toolkit2
               cd ../zipformer
               pytest -v -s
 

diff --git a/egs/iwslt22_ta/ASR/local/prepare_transcripts.py b/egs/iwslt22_ta/ASR/local/prepare_transcripts.py
@@ -1 +1 @@
-/exp/ahussein/tmp/icefall/egs/iwslt22_ta/ST/local/prepare_transcripts.py
+../../ST/local/prepare_transcripts.py
diff --git a/egs/iwslt22_ta/ST/local/prepare_transcripts.py b/egs/iwslt22_ta/ST/local/prepare_transcripts.py
@@ -1,66 +1,66 @@
-# Copyright 2023 Johns Hopkins University  (Amir Hussein)
-
-#!/usr/bin/python
-"""
-This script prepares transcript_words.txt from cutset
-"""
-
-from lhotse import CutSet
-import argparse
-import logging
-import pdb
-from pathlib import Path
-import os
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        "--cut",
-        type=str,
-        default="",
-        help="Cutset file",
-    )
-    parser.add_argument(
-        "--src-langdir",
-        type=str,
-        default="",
-        help="name of the source lang-dir",
-    )
-    parser.add_argument(
-        "--tgt-langdir",
-        type=str,
-        default=None,
-        help="name of the target lang-dir",
-    )
-    return parser
-     
-
-def main():
-
-    parser = get_parser()
-    args = parser.parse_args()
-
-    logging.info("Reading the cuts")
-    cuts = CutSet.from_file(args.cut)
-    if args.tgt_langdir != None:
-        logging.info("Target dir is not None")
-        langdirs = [Path(args.src_langdir), Path(args.tgt_langdir)]
-    else:
-        langdirs = [Path(args.src_langdir)]
-    
-    for langdir in langdirs:
-        if not os.path.exists(langdir):
-            os.makedirs(langdir)
-
-    with open(langdirs[0] / "transcript_words.txt", 'w') as src, open(langdirs[1] / "transcript_words.txt", 'w') as tgt:
-        for c in cuts:
-            src_txt = c.supervisions[0].text
-            tgt_txt = c.supervisions[0].custom['translated_text']['eng']
-            src.write(src_txt + '\n')
-            tgt.write(tgt_txt + '\n')
-
-if __name__ == "__main__":
-    main()
+#!/usr/bin/python
+# Copyright 2023 Johns Hopkins University  (Amir Hussein)
+
+"""
+This script prepares transcript_words.txt from cutset
+"""
+
+from lhotse import CutSet
+import argparse
+import logging
+import pdb
+from pathlib import Path
+import os
+
+
+def get_parser():
+    """Build the command-line parser for this script.
+
+    Options: --cut (cutset file), --src-langdir and --tgt-langdir
+    (source / target lang-dir names; the target defaults to None).
+
+    Returns:
+      The configured argparse.ArgumentParser.
+    """
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    # (flag, default, help) for each string-valued option.
+    options = (
+        ("--cut", "", "Cutset file"),
+        ("--src-langdir", "", "name of the source lang-dir"),
+        ("--tgt-langdir", None, "name of the target lang-dir"),
+    )
+    for flag, default, help_text in options:
+        cli.add_argument(flag, type=str, default=default, help=help_text)
+
+    return cli
+
+
+def main():
+    """Write source (and, if --tgt-langdir is set, target) transcript_words.txt."""
+    from contextlib import ExitStack
+
+    args = get_parser().parse_args()
+    logging.info("Reading the cuts")
+    cuts = CutSet.from_file(args.cut)
+    langdirs = [Path(args.src_langdir)]
+    if args.tgt_langdir is not None:
+        logging.info("Target dir is not None")
+        langdirs.append(Path(args.tgt_langdir))
+    with ExitStack() as stack:  # closes every opened file, even on error
+        outs = []
+        for langdir in langdirs:
+            langdir.mkdir(parents=True, exist_ok=True)
+            path = langdir / "transcript_words.txt"
+            # Explicit UTF-8 so the output does not depend on the locale.
+            outs.append(stack.enter_context(open(path, "w", encoding="utf-8")))
+        for c in cuts:
+            sup = c.supervisions[0]
+            outs[0].write(sup.text + "\n")
+            if len(outs) > 1:  # translation only when a target lang-dir exists
+                outs[1].write(sup.custom["translated_text"]["eng"] + "\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/iwslt22_ta/ST/zipformer/profile.py b/egs/iwslt22_ta/ST/zipformer/profile.py
@@ -1 +1 @@
-../../../librispeech/ASR/zipformer/profile.py
+../../ASR/zipformer/profile.py
diff --git a/egs/multi_conv_zh_es_ta/ST/hent_srt/export.py b/egs/multi_conv_zh_es_ta/ST/hent_srt/export.py
@@ -168,7 +168,7 @@
     ln -s pretrained.pt epoch-9999.pt
 
     cd /path/to/egs/multi_conv_zh_es_ta/ST
-    
+
     ./hent_srt/decode.py \
         --epoch 9999 --avg 1 --use-averaged-model 0 \
         --beam-size 20 \
@@ -208,7 +208,7 @@
     ln -s pretrained.pt epoch-9999.pt
 
     cd /path/to/egs/multi_conv_zh_es_ta/ST
-        
+
     ./hent_srt/decode.py \
         --epoch 9999 --avg 1 --use-averaged-model 0 \
         --causal 1 \
@@ -240,7 +240,7 @@
         --st-blank-penalty 2 \
         --chunk-size 64 \
         --left-context-frames 128 \
-        --use-hat False --max-sym-per-frame 20 
+        --use-hat False --max-sym-per-frame 20
 
 Note: If you don't want to train a model from scratch, we have
 provided one for you. You can get it at
@@ -389,7 +389,7 @@ def forward(
             features: (N, T, C)
             feature_lengths: (N,)
         """
-        encoder_out, encoder_out_lens, st_encoder_out, st_encoder_out_lens = model.forward_encoder(feature, feature_lengths)
+        encoder_out, encoder_out_lens, st_encoder_out, st_encoder_out_lens = self.model.forward_encoder(features, feature_lengths)
         encoder_out = encoder_out.permute(1, 0, 2)  # (T, N, C) ->(N, T, C)
         st_encoder_out = st_encoder_out.permute(1, 0, 2)
         return encoder_out, encoder_out_lens, st_encoder_out, st_encoder_out_lens
@@ -596,7 +596,7 @@ def main():
                     filename_start=filename_start,
                     filename_end=filename_end,
                     device=device,
-                    
+
                 ), strict=False
             )
 

diff --git a/egs/multi_conv_zh_es_ta/ST/zipformer_multijoiner_st/train.py b/egs/multi_conv_zh_es_ta/ST/zipformer_multijoiner_st/train.py
@@ -27,7 +27,7 @@
 
 export CUDA_VISIBLE_DEVICES="0,1,2,3"
 
-# For non-streaming model training:  
+# For non-streaming model training:
 
 ./zipformer_hat_st/train.py \
   --base-lr 0.045 \
@@ -50,7 +50,7 @@
   --warm-step 10000 \
   --lr-epochs 6 \
   --use-hat False
-  
+
 # With Cr-CTC
 ./zipformer_hat_st/train.py \
   --base-lr 0.045 \
@@ -989,7 +989,7 @@ def load_checkpoint_if_available(
         return None
 
     assert filename.is_file(), f"{filename} does not exist!"
-    
+
     saved_params = load_checkpoint(
         filename,
         model=model,
@@ -1094,7 +1094,7 @@ def compute_loss(
      spec_augment:
         The SpecAugment instance used only when use_cr_ctc is True.
     """
-    
+
     device = model.device if isinstance(model, DDP) else next(model.parameters()).device
     feature = batch["inputs"]
     # at entry, feature is (N, T, C)
@@ -1116,13 +1116,13 @@ def compute_loss(
     y = sp.encode(texts, out_type=int)
     y = k2.RaggedTensor(y)
     if params.st_scale != 1:
-        alpha_st = params.st_scale 
-        alpha_asr = 1-params.st_scale 
+        alpha_st = params.st_scale
+        alpha_asr = 1-params.st_scale
     else:
         alpha_st, alpha_asr = 1, 1
     use_asr_cr_ctc, use_st_cr_ctc  = params.use_asr_cr_ctc, params.use_st_cr_ctc
     use_spec_aug = (use_asr_cr_ctc or use_st_cr_ctc) and is_training
-    
+
     if use_spec_aug:
         supervision_intervals = batch["supervisions"]
         supervision_segments = torch.stack(
@@ -1218,7 +1218,7 @@ def compute_loss(
         info["pruned_loss"] = pruned_loss.detach().cpu().item()
         if params.use_st_joiner:
             info["st_simple_loss"] = st_simple_loss.detach().cpu().item()
-            info["st_pruned_loss"] = st_pruned_loss.detach().cpu().item()     
+            info["st_pruned_loss"] = st_pruned_loss.detach().cpu().item()
     if params.use_ctc:
         info["ctc_loss"] = ctc_loss.detach().cpu().item()
         if params.use_asr_cr_ctc:
@@ -1573,7 +1573,7 @@ def run(rank, world_size, args):
     )
 
     scheduler = Eden(optimizer, params.lr_batches, params.lr_epochs)
-    
+
     # if checkpoints and "optimizer" in checkpoints:
     #     logging.info("Loading optimizer state dict")
     #     optimizer.load_state_dict(checkpoints["optimizer"])
@@ -1608,7 +1608,7 @@ def remove_short_and_long_utt(c: Cut):
         # You should use ../local/display_manifest_statistics.py to get
         # an utterance duration distribution for your dataset to select
         # the threshold
-        if c.duration =< 0.1 or c.duration >= 30.0:
+        if c.duration <= 0.1 or c.duration >= 30.0:
             # logging.warning(
             #     f"Exclude cut with ID {c.id} from training. Duration: {c.duration}"
             # )
@@ -1646,7 +1646,7 @@ def remove_short_and_long_utt(c: Cut):
             #         f"Number of tokens: {len(st_tokens)}"
             #     )
                 return False
-            
+
         if params.use_asr_cr_ctc:
             T = ((c.num_frames - 7) // 2 + 1) // 2
             tokens = sp.encode(c.supervisions[0].text, out_type=str)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		/exp/ahussein/tmp/icefall/egs/iwslt22_ta/ST/local/prepare_transcripts.py
		../../ST/local/prepare_transcripts.py
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		../../../librispeech/ASR/zipformer/profile.py
		../../ASR/zipformer/profile.py