Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 40 additions & 11 deletions egs/gigaspeech/ASR/local/compute_fbank_gigaspeech.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
from pathlib import Path

import torch
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig

from icefall.utils import str2bool

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
Expand All @@ -30,7 +33,22 @@
torch.set_num_interop_threads(1)


def compute_fbank_gigaspeech():
def get_args():
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument(
"--on-the-fly",
type=str2bool,
default=True,
help="When True, do not compute and store fbank features; only "
"produce the trimmed cut manifests so that features are extracted "
"on-the-fly during training.",
)
return parser.parse_args()


def compute_fbank_gigaspeech(on_the_fly: bool = True):
in_out_dir = Path("data/fbank")

# number of workers in dataloader
Expand All @@ -51,7 +69,11 @@ def compute_fbank_gigaspeech():
device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

# on-the-fly mode does not need the extractor (and kaldifeat may be absent)
extractor = None
if not on_the_fly:
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))

logging.info(f"device: {device}")

Expand All @@ -66,15 +88,21 @@ def compute_fbank_gigaspeech():
logging.info(f"Loading {raw_cuts_path}")
cut_set = CutSet.from_file(raw_cuts_path)

logging.info("Computing features")
if on_the_fly:
logging.info(
"on-the-fly is enabled - skipping feature extraction, "
"only saving the trimmed cut manifest"
)
else:
logging.info("Computing features")
cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{in_out_dir}/gigaspeech_feats_{partition}",
num_workers=num_workers,
batch_duration=batch_duration,
overwrite=True,
)

cut_set = cut_set.compute_and_store_features_batch(
extractor=extractor,
storage_path=f"{in_out_dir}/gigaspeech_feats_{partition}",
num_workers=num_workers,
batch_duration=batch_duration,
overwrite=True,
)
cut_set = cut_set.trim_to_supervisions(
keep_overlapping=False, min_duration=None
)
Expand All @@ -88,7 +116,8 @@ def main():
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)

compute_fbank_gigaspeech()
args = get_args()
compute_fbank_gigaspeech(on_the_fly=args.on_the_fly)


if __name__ == "__main__":
Expand Down
48 changes: 47 additions & 1 deletion egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,14 @@
import argparse
import logging
import os
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

import torch
from lhotse import CutSet, KaldifeatFbank, KaldifeatFbankConfig

from icefall.utils import str2bool

# Torch's multithreaded behavior needs to be disabled or
# it wastes a lot of CPU and slow things down.
# Do this outside of main() in case it needs to take effect
Expand Down Expand Up @@ -71,9 +74,36 @@ def get_args():
default=-1,
help="Stop processing pieces until this number (exclusive).",
)

parser.add_argument(
"--on-the-fly",
type=str2bool,
default=True,
help="When True, do not compute and store fbank features; only "
"produce the trimmed cut manifests so that features are extracted "
"on-the-fly during training.",
)
return parser.parse_args()


def trim_split(idx: str, output_dir: Path) -> None:
"""Trim one raw split to supervisions and save it (no feature extraction)."""
cuts_path = output_dir / f"gigaspeech_cuts_XL.{idx}.jsonl.gz"
if cuts_path.is_file():
logging.info(f"{cuts_path} exists - skipping")
return

raw_cuts_path = output_dir / f"gigaspeech_cuts_XL_raw.{idx}.jsonl.gz"
if not raw_cuts_path.is_file():
logging.info(f"{raw_cuts_path} does not exist - skipping it")
return

cut_set = CutSet.from_file(raw_cuts_path)
cut_set = cut_set.trim_to_supervisions(keep_overlapping=False, min_duration=None)
cut_set.to_file(cuts_path)
logging.info(f"Saved to {cuts_path}")


def compute_fbank_gigaspeech_splits(args):
num_splits = args.num_splits
output_dir = "data/fbank/gigaspeech_XL_split"
Expand All @@ -87,13 +117,29 @@ def compute_fbank_gigaspeech_splits(args):

stop = min(stop, num_splits)

num_digits = 8 # num_digits is fixed by lhotse split-lazy

if args.on_the_fly:
# on-the-fly does not compute features, so the per-split work is pure
# CPU and independent -- fan it out across processes instead of the
# serial loop the GPU path needs.
logging.info(
f"on-the-fly is enabled - trimming splits in parallel "
f"with {args.num_workers} workers"
)
idxs = [f"{i}".zfill(num_digits) for i in range(start, stop)]
with ProcessPoolExecutor(max_workers=args.num_workers) as ex:
futures = [ex.submit(trim_split, idx, output_dir) for idx in idxs]
for f in futures:
f.result()
Comment on lines +122 to +134

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🩺 Stability & Availability | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Verify callsites and defaults around --num-workers in this recipe.
rg -n --type=py --type=sh 'compute_fbank_gigaspeech_splits\.py|--num-workers' egs/gigaspeech/ASR

# Verify stdlib behavior for invalid ProcessPoolExecutor worker counts.
python - <<'PY'
from concurrent.futures import ProcessPoolExecutor
for n in (0, -1):
    try:
        ProcessPoolExecutor(max_workers=n)
    except Exception as e:
        print(f"max_workers={n}: {type(e).__name__}: {e}")
PY

Repository: k2-fsa/icefall

Length of output: 873


🏁 Script executed:

#!/bin/bash
set -euo pipefail

# Read the argument parser definition around line 44
sed -n '30,70p' egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py

# Read the full context around lines 122-134 to check for validation
sed -n '100,150p' egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py

Repository: k2-fsa/icefall

Length of output: 3072


Validate worker count before creating ProcessPoolExecutor.

--num-workers accepts any integer, but ProcessPoolExecutor(max_workers=...) fails for <= 0, so the new on-the-fly path can crash immediately on validly parsed input. The code currently has no validation before calling ProcessPoolExecutor(max_workers=args.num_workers) at lines 131-133.

💡 Proposed fix
     if args.on_the_fly:
+        if args.num_workers <= 0:
+            raise ValueError(
+                "--num-workers must be >= 1 when --on-the-fly is true"
+            )
         # on-the-fly does not compute features, so the per-split work is pure
         # CPU and independent -- fan it out across processes instead of the
         # serial loop the GPU path needs.
         logging.info(
             f"on-the-fly is enabled - trimming splits in parallel "
             f"with {args.num_workers} workers"
         )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@egs/gigaspeech/ASR/local/compute_fbank_gigaspeech_splits.py` around lines 122
- 134, The ProcessPoolExecutor constructor fails when max_workers is less than
or equal to 0, but args.num_workers is not validated before being passed to
ProcessPoolExecutor(max_workers=args.num_workers) in the on-the-fly path. Add
validation to ensure args.num_workers is greater than 0 before the
ProcessPoolExecutor instantiation, and raise an appropriate error or exit if the
validation fails. This validation should occur after the logging statement but
before creating the ProcessPoolExecutor with the worker count.

return

device = torch.device("cpu")
if torch.cuda.is_available():
device = torch.device("cuda", 0)
extractor = KaldifeatFbank(KaldifeatFbankConfig(device=device))
logging.info(f"device: {device}")

num_digits = 8 # num_digits is fixed by lhotse split-lazy
for i in range(start, stop):
idx = f"{i}".zfill(num_digits)
logging.info(f"Processing {idx}/{num_splits}")
Expand Down
38 changes: 23 additions & 15 deletions egs/gigaspeech/ASR/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ stop_stage=8
start=0
stop=-1 # -1 means until the end

# If true, skip storing GigaSpeech fbank features; only produce the (trimmed)
# cut manifests for on-the-fly feature extraction during training.
on_the_fly=true

# If false (default), skip all musan steps (download, manifest, fbank).
use_musan=false

# Note: This script just prepares the minimal requirements needed by a
# transducer training with bpe units.
#
Expand Down Expand Up @@ -138,7 +145,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then
#
# ln -svf /path/to/musan $dl_dir/
#
if [ ! -d $dl_dir/musan ]; then
if [ $use_musan == true ] && [ ! -d $dl_dir/musan ]; then

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

if command -v shellcheck >/dev/null 2>&1; then
  shellcheck -f gcc egs/gigaspeech/ASR/prepare.sh | rg -n 'SC2086|prepare.sh'
else
  # Fallback quick scan if shellcheck isn't installed in the environment.
  rg -n '\$use_musan|\$dl_dir/musan|--on-the-fly \$on_the_fly|--stop \$stop' egs/gigaspeech/ASR/prepare.sh
fi

Repository: k2-fsa/icefall

Length of output: 4213


🏁 Script executed:

#!/bin/bash
# First, let's look at the specific line ranges mentioned in the review
sed -n '145,155p' egs/gigaspeech/ASR/prepare.sh
sed -n '163,175p' egs/gigaspeech/ASR/prepare.sh
sed -n '182,190p' egs/gigaspeech/ASR/prepare.sh
sed -n '200,215p' egs/gigaspeech/ASR/prepare.sh

Repository: k2-fsa/icefall

Length of output: 1760


Quote variable expansions in shell conditionals and arguments.

Several lines use unquoted expansions ($use_musan, $dl_dir/musan, $on_the_fly, $stop, etc.), which risk word splitting/globbing and brittle condition parsing. ShellCheck confirms SC2086 violations on all affected lines.

💡 Proposed fix
-  if [ $use_musan == true ] && [ ! -d $dl_dir/musan ]; then
+  if [[ "$use_musan" == true ]] && [[ ! -d "$dl_dir/musan" ]]; then

-if [ $use_musan == true ] && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+if [[ "$use_musan" == true ]] && [[ "$stage" -le 2 ]] && [[ "$stop_stage" -ge 2 ]]; then

-    lhotse prepare musan $dl_dir/musan data/manifests
+    lhotse prepare musan "$dl_dir/musan" data/manifests

-  python3 ./local/compute_fbank_gigaspeech.py --on-the-fly $on_the_fly
+  python3 ./local/compute_fbank_gigaspeech.py --on-the-fly "$on_the_fly"

-    --stop $stop \
-    --on-the-fly $on_the_fly
+    --stop "$stop" \
+    --on-the-fly "$on_the_fly"

-if [ $use_musan == true ] && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+if [[ "$use_musan" == true ]] && [[ "$stage" -le 7 ]] && [[ "$stop_stage" -ge 7 ]]; then

Affects lines 148–149, 163–171, 204, 206–207, and 210.

🧰 Tools
🪛 Shellcheck (0.11.0)

[info] 148-148: Double quote to prevent globbing and word splitting.

(SC2086)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@egs/gigaspeech/ASR/prepare.sh` at line 148, Quote all variable expansions
throughout the prepare.sh script to prevent word splitting and globbing issues.
Specifically, wrap variables like $use_musan, $dl_dir, $on_the_fly, and $stop in
double quotes when used in conditionals and as arguments. Update the affected
lines (148-149, 163-171, 204, 206-207, and 210) by changing patterns like [
$use_musan == true ] to [ "$use_musan" == true ] and $dl_dir/musan to
"$dl_dir"/musan to resolve all ShellCheck SC2086 violations.

Source: Linters/SAST tools

lhotse download musan $dl_dir
fi
Comment on lines +148 to 150

This comment was marked as resolved.

fi
Expand All @@ -156,25 +163,25 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
$dl_dir/GigaSpeech data/manifests
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
# We assume that you have downloaded the musan corpus
# to $dl_dir/musan
mkdir -p data/manifests
lhotse prepare musan $dl_dir/musan data/manifests
if [ $use_musan == true ] && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
log "Stage 2: Prepare musan manifest"
# We assume that you have downloaded the musan corpus
# to $dl_dir/musan
mkdir -p data/manifests
lhotse prepare musan $dl_dir/musan data/manifests
fi
Comment on lines +166 to 172

This comment was marked as resolved.


if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
log "State 3: Preprocess GigaSpeech manifest"
log "Stage 3: Preprocess GigaSpeech manifest"
if [ ! -f data/fbank/.preprocess_complete ]; then
python3 ./local/preprocess_gigaspeech.py
touch data/fbank/.preprocess_complete
fi
fi

if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
log "Stage 4: Compute features for DEV, TEST, L, M, S, and XS subsets of GigaSpeech."
python3 ./local/compute_fbank_gigaspeech.py
log "Stage 4: Compute features for DEV and TEST subsets of GigaSpeech."
python3 ./local/compute_fbank_gigaspeech.py --on-the-fly $on_the_fly

This comment was marked as resolved.

fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
Expand All @@ -196,13 +203,14 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then
--batch-duration 600 \
--num-splits $num_splits \
--start $start \
--stop $stop
--stop $stop \
--on-the-fly $on_the_fly

This comment was marked as resolved.

fi

if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Compute fbank for musan"
mkdir -p data/fbank
./local/compute_fbank_musan.py
if [ $use_musan == true ] && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
log "Stage 7: Compute fbank for musan"
mkdir -p data/fbank
./local/compute_fbank_musan.py
fi

if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then
Expand Down
25 changes: 20 additions & 5 deletions egs/gigaspeech/ASR/zipformer/asr_datamodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
group.add_argument(
"--on-the-fly-feats",
type=str2bool,
default=False,
default=True,
help="When enabled, use on-the-fly cut mixing and feature "
"extraction. Will drop existing precomputed feature manifests "
"if available.",
Expand Down Expand Up @@ -164,11 +164,19 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
group.add_argument(
"--num-workers",
type=int,
default=2,
default=8,
help="The number of training dataloader workers that "
"collect the batches.",
)

group.add_argument(
"--prefetch-factor",
type=int,
default=8,
help="Number of batches each worker prefetches in advance. "
"Ignored when --num-workers is 0.",
)
Comment on lines +172 to +178

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win

Validate --prefetch-factor as a positive integer.

Line 173 introduces a user-controlled value that can be set to 0 or negative and then trigger DataLoader construction errors when --num-workers > 0. Add CLI-side validation to fail fast with a clear message.

Proposed fix
+        def positive_int(v: str) -> int:
+            iv = int(v)
+            if iv <= 0:
+                raise argparse.ArgumentTypeError("must be > 0")
+            return iv
...
         group.add_argument(
             "--prefetch-factor",
-            type=int,
+            type=positive_int,
             default=8,
             help="Number of batches each worker prefetches in advance. "
             "Ignored when --num-workers is 0.",
         )
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@egs/gigaspeech/ASR/zipformer/asr_datamodule.py` around lines 172 - 178, The
`--prefetch-factor` argument in the add_argument call lacks validation and can
accept 0 or negative values, which causes DataLoader construction errors at
runtime when `--num-workers > 0`. Add a custom type converter function to the
`type` parameter of the `--prefetch-factor` add_argument call that validates the
input is a positive integer (greater than 0), ensuring the validation fails fast
with a clear error message when an invalid value is provided.


group.add_argument(
"--enable-spec-aug",
type=str2bool,
Expand All @@ -189,7 +197,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser):
group.add_argument(
"--enable-musan",
type=str2bool,
default=True,
default=False,
help="When enabled, select noise from MUSAN and mix it"
"with training dataset. ",
)
Expand Down Expand Up @@ -343,8 +351,12 @@ def train_dataloaders(
sampler=train_sampler,
batch_size=None,
num_workers=self.args.num_workers,
persistent_workers=False,
persistent_workers=self.args.num_workers > 0,
worker_init_fn=worker_init_fn,
pin_memory=True,
prefetch_factor=self.args.prefetch_factor
if self.args.num_workers > 0
else None,
)

return train_dl
Expand Down Expand Up @@ -389,8 +401,11 @@ def valid_dataloaders(
validate,
sampler=valid_sampler,
batch_size=None,
num_workers=2,
num_workers=self.args.num_workers,
persistent_workers=False,
prefetch_factor=self.args.prefetch_factor
if self.args.num_workers > 0
else None,
)

return valid_dl
Expand Down
Loading
Loading