-
Notifications
You must be signed in to change notification settings - Fork 414
Modern gigaspeech recipe #2090
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Modern gigaspeech recipe #2090
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,13 @@ stop_stage=8 | |
| start=0 | ||
| stop=-1 # -1 means until the end | ||
|
|
||
| # If true, skip storing GigaSpeech fbank features; only produce the (trimmed) | ||
| # cut manifests for on-the-fly feature extraction during training. | ||
| on_the_fly=true | ||
|
|
||
| # If false (default), skip all musan steps (download, manifest, fbank). | ||
| use_musan=false | ||
|
|
||
| # Note: This script just prepares the minimal requirements needed by a | ||
| # transducer training with bpe units. | ||
| # | ||
|
|
@@ -138,7 +145,7 @@ if [ $stage -le 0 ] && [ $stop_stage -ge 0 ]; then | |
| # | ||
| # ln -svf /path/to/musan $dl_dir/ | ||
| # | ||
| if [ ! -d $dl_dir/musan ]; then | ||
| if [ $use_musan == true ] && [ ! -d $dl_dir/musan ]; then | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🎯 Functional Correctness | 🟡 Minor
🧩 Analysis chain
🏁 Script executed:
#!/bin/bash
set -euo pipefail
if command -v shellcheck >/dev/null 2>&1; then
shellcheck -f gcc egs/gigaspeech/ASR/prepare.sh | rg -n 'SC2086|prepare.sh'
else
# Fallback quick scan if shellcheck isn't installed in the environment.
rg -n '\$use_musan|\$dl_dir/musan|--on-the-fly \$on_the_fly|--stop \$stop' egs/gigaspeech/ASR/prepare.sh
fi
Repository: k2-fsa/icefall
Length of output: 4213
🏁 Script executed:
#!/bin/bash
# First, let's look at the specific line ranges mentioned in the review
sed -n '145,155p' egs/gigaspeech/ASR/prepare.sh
sed -n '163,175p' egs/gigaspeech/ASR/prepare.sh
sed -n '182,190p' egs/gigaspeech/ASR/prepare.sh
sed -n '200,215p' egs/gigaspeech/ASR/prepare.sh
Repository: k2-fsa/icefall
Length of output: 1760
Quote variable expansions in shell conditionals and arguments.
Several lines use unquoted expansions (e.g. `$use_musan`, `$dl_dir/musan`, `$on_the_fly`, `$stop`), which are subject to word splitting and globbing.
💡 Proposed fix
- if [ $use_musan == true ] && [ ! -d $dl_dir/musan ]; then
+ if [[ "$use_musan" == true ]] && [[ ! -d "$dl_dir/musan" ]]; then
-if [ $use_musan == true ] && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
+if [[ "$use_musan" == true ]] && [[ "$stage" -le 2 ]] && [[ "$stop_stage" -ge 2 ]]; then
- lhotse prepare musan $dl_dir/musan data/manifests
+ lhotse prepare musan "$dl_dir/musan" data/manifests
- python3 ./local/compute_fbank_gigaspeech.py --on-the-fly $on_the_fly
+ python3 ./local/compute_fbank_gigaspeech.py --on-the-fly "$on_the_fly"
- --stop $stop \
- --on-the-fly $on_the_fly
+ --stop "$stop" \
+ --on-the-fly "$on_the_fly"
-if [ $use_musan == true ] && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then
+if [[ "$use_musan" == true ]] && [[ "$stage" -le 7 ]] && [[ "$stop_stage" -ge 7 ]]; then
Affects lines 148–149, 163–171, 204, 206–207, and 210.
🧰 Tools
🪛 Shellcheck (0.11.0)
[info] 148-148: Double quote to prevent globbing and word splitting. (SC2086)
🤖 Prompt for AI Agents
Source: Linters/SAST tools |
||
| lhotse download musan $dl_dir | ||
| fi | ||
|
Comment on lines
+148
to
150
This comment was marked as resolved.
Sorry, something went wrong. |
||
| fi | ||
|
|
@@ -156,25 +163,25 @@ if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then | |
| $dl_dir/GigaSpeech data/manifests | ||
| fi | ||
|
|
||
| if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then | ||
| log "Stage 2: Prepare musan manifest" | ||
| # We assume that you have downloaded the musan corpus | ||
| # to $dl_dir/musan | ||
| mkdir -p data/manifests | ||
| lhotse prepare musan $dl_dir/musan data/manifests | ||
| if [ $use_musan == true ] && [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then | ||
| log "Stage 2: Prepare musan manifest" | ||
| # We assume that you have downloaded the musan corpus | ||
| # to $dl_dir/musan | ||
| mkdir -p data/manifests | ||
| lhotse prepare musan $dl_dir/musan data/manifests | ||
| fi | ||
|
Comment on lines
+166
to
172
This comment was marked as resolved.
Sorry, something went wrong. |
||
|
|
||
| if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then | ||
| log "State 3: Preprocess GigaSpeech manifest" | ||
| log "Stage 3: Preprocess GigaSpeech manifest" | ||
| if [ ! -f data/fbank/.preprocess_complete ]; then | ||
| python3 ./local/preprocess_gigaspeech.py | ||
| touch data/fbank/.preprocess_complete | ||
| fi | ||
| fi | ||
|
|
||
| if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then | ||
| log "Stage 4: Compute features for DEV, TEST, L, M, S, and XS subsets of GigaSpeech." | ||
| python3 ./local/compute_fbank_gigaspeech.py | ||
| log "Stage 4: Compute features for DEV and TEST subsets of GigaSpeech." | ||
| python3 ./local/compute_fbank_gigaspeech.py --on-the-fly $on_the_fly | ||
This comment was marked as resolved.
Sorry, something went wrong. |
||
| fi | ||
|
|
||
| if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then | ||
|
|
@@ -196,13 +203,14 @@ if [ $stage -le 6 ] && [ $stop_stage -ge 6 ]; then | |
| --batch-duration 600 \ | ||
| --num-splits $num_splits \ | ||
| --start $start \ | ||
| --stop $stop | ||
| --stop $stop \ | ||
| --on-the-fly $on_the_fly | ||
This comment was marked as resolved.
Sorry, something went wrong. |
||
| fi | ||
|
|
||
| if [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then | ||
| log "Stage 7: Compute fbank for musan" | ||
| mkdir -p data/fbank | ||
| ./local/compute_fbank_musan.py | ||
| if [ $use_musan == true ] && [ $stage -le 7 ] && [ $stop_stage -ge 7 ]; then | ||
| log "Stage 7: Compute fbank for musan" | ||
| mkdir -p data/fbank | ||
| ./local/compute_fbank_musan.py | ||
| fi | ||
|
|
||
| if [ $stage -le 8 ] && [ $stop_stage -ge 8 ]; then | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -134,7 +134,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser): | |
| group.add_argument( | ||
| "--on-the-fly-feats", | ||
| type=str2bool, | ||
| default=False, | ||
| default=True, | ||
| help="When enabled, use on-the-fly cut mixing and feature " | ||
| "extraction. Will drop existing precomputed feature manifests " | ||
| "if available.", | ||
|
|
@@ -164,11 +164,19 @@ def add_arguments(cls, parser: argparse.ArgumentParser): | |
| group.add_argument( | ||
| "--num-workers", | ||
| type=int, | ||
| default=2, | ||
| default=8, | ||
| help="The number of training dataloader workers that " | ||
| "collect the batches.", | ||
| ) | ||
|
|
||
| group.add_argument( | ||
| "--prefetch-factor", | ||
| type=int, | ||
| default=8, | ||
| help="Number of batches each worker prefetches in advance. " | ||
| "Ignored when --num-workers is 0.", | ||
| ) | ||
|
Comment on lines
+172
to
+178
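There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🎯 Functional Correctness | 🟡 Minor | ⚡ Quick win
Validate `--prefetch-factor` as a positive integer.
Line 173 introduces a user-controlled value that can be set to zero or a negative number; rejecting such values at argument-parsing time (as the proposed `positive_int` type does) gives a clear, early error.
Proposed fix
+ def positive_int(v: str) -> int: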
+ iv = int(v)
+ if iv <= 0:
+ raise argparse.ArgumentTypeError("must be > 0")
+ return iv
...
group.add_argument(
"--prefetch-factor",
- type=int,
+ type=positive_int,
default=8,
help="Number of batches each worker prefetches in advance. "
"Ignored when --num-workers is 0.",
)
🤖 Prompt for AI Agents |
||
|
|
||
| group.add_argument( | ||
| "--enable-spec-aug", | ||
| type=str2bool, | ||
|
|
@@ -189,7 +197,7 @@ def add_arguments(cls, parser: argparse.ArgumentParser): | |
| group.add_argument( | ||
| "--enable-musan", | ||
| type=str2bool, | ||
| default=True, | ||
| default=False, | ||
| help="When enabled, select noise from MUSAN and mix it" | ||
| "with training dataset. ", | ||
| ) | ||
|
|
@@ -343,8 +351,12 @@ def train_dataloaders( | |
| sampler=train_sampler, | ||
| batch_size=None, | ||
| num_workers=self.args.num_workers, | ||
| persistent_workers=False, | ||
| persistent_workers=self.args.num_workers > 0, | ||
| worker_init_fn=worker_init_fn, | ||
| pin_memory=True, | ||
| prefetch_factor=self.args.prefetch_factor | ||
| if self.args.num_workers > 0 | ||
| else None, | ||
| ) | ||
|
|
||
| return train_dl | ||
|
|
@@ -389,8 +401,11 @@ def valid_dataloaders( | |
| validate, | ||
| sampler=valid_sampler, | ||
| batch_size=None, | ||
| num_workers=2, | ||
| num_workers=self.args.num_workers, | ||
| persistent_workers=False, | ||
| prefetch_factor=self.args.prefetch_factor | ||
| if self.args.num_workers > 0 | ||
| else None, | ||
| ) | ||
|
|
||
| return valid_dl | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🩺 Stability & Availability | 🟠 Major
🧩 Analysis chain
🏁 Script executed:
Repository: k2-fsa/icefall
Length of output: 873
🏁 Script executed:
Repository: k2-fsa/icefall
Length of output: 3072
Validate worker count before creating `ProcessPoolExecutor`.
`--num-workers` accepts any integer, but `ProcessPoolExecutor(max_workers=...)` fails for values `<= 0`, so the new on-the-fly path can crash immediately on validly parsed input. The code currently has no validation before calling `ProcessPoolExecutor(max_workers=args.num_workers)` at lines 131-133.
💡 Proposed fix
🤖 Prompt for AI Agents