From ae89665a1aeb40842001967c04634960966c7a7c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=A4=E3=83=B3=E3=83=A4=E3=83=B3?= Date: Sun, 6 Dec 2020 22:17:58 +0800 Subject: [PATCH 1/5] Missing hyphen in --config option `queue.pl -config ...` should read `queue.pl --config ...` (line 248 of src/doc/queue.dox) --- src/doc/queue.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/doc/queue.dox b/src/doc/queue.dox index 3da2cf4b263..5192d50d778 100644 --- a/src/doc/queue.dox +++ b/src/doc/queue.dox @@ -245,7 +245,7 @@ option allow_k20=true option allow_k20=false -l 'hostname=!g01*&!g02*&!b06*' \endverbatim We then set the relevant $cmd variable to the value -queue.pl -config conf/no_k20.conf --allow-k20 false. +queue.pl --config conf/no_k20.conf --allow-k20 false. Note that a simpler way to have done this would have been to simply edit the command line in the config file to read \verbatim From 3373c6a53ce33596cb177c92085a8b217dc39080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=A4=E3=83=B3=E3=83=A4=E3=83=B3?= Date: Sun, 23 Jun 2024 15:28:32 +0800 Subject: [PATCH 2/5] fix bug in compute-gop.cc Consecutive identical phonemes are merged in compute-gop [Problem Statement] In computer-assisted pronunciation training, we use time-aligned information to compute pronunciation features such as goodness of pronunciation (GOP). We want each phoneme to be processed separately to obtain its own features or scores. However, the original implementation (compute-gop.cc, line 163) detected the start of the next phoneme by checking whether the phoneme ID changed between frames. As a result, when a word contains consecutive identical phonemes, for example: SOUNDNESS S AH1 D AH0 N N AH0 S the two N phonemes are merged and only a single N result is produced. [Solution] Add phoneme boundary information so that each phoneme is scored separately even when adjacent phonemes are identical.
--- src/bin/compute-gop.cc | 53 ++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 15 deletions(-) diff --git a/src/bin/compute-gop.cc b/src/bin/compute-gop.cc index a6db0fc0c9e..4f6180a47ba 100644 --- a/src/bin/compute-gop.cc +++ b/src/bin/compute-gop.cc @@ -1,6 +1,7 @@ // bin/compute-gop.cc // Copyright 2019 Junbo Zhang +// 2024 Jiun-Ting Li (National Taiwan Normal University) // See ../../COPYING for clarification regarding multiple authors // @@ -107,7 +108,9 @@ int main(int argc, char *argv[]) { const char *usage = "Compute Goodness Of Pronunciation (GOP) from a matrix of " "probabilities (e.g. from nnet3-compute).\n" - "Usage: compute-gop [options] " + "Usage: compute-gop [options] " + " " + " " " " "[]\n" "e.g.:\n" @@ -130,16 +133,17 @@ int main(int argc, char *argv[]) { po.Read(argc, argv); - if (po.NumArgs() != 4 && po.NumArgs() != 5) { + if (po.NumArgs() != 6) { po.PrintUsage(); exit(1); } std::string model_filename = po.GetArg(1), - alignments_rspecifier = po.GetArg(2), - prob_rspecifier = po.GetArg(3), - gop_wspecifier = po.GetArg(4), - feat_wspecifier = po.GetArg(5); + transition_alignments_rspecifier = po.GetArg(2), + phoneme_alignments_rspecifier = po.GetArg(3), + prob_rspecifier = po.GetArg(4), + gop_wspecifier = po.GetArg(5), + feat_wspecifier = po.GetArg(6); TransitionModel trans_model; { @@ -174,7 +178,8 @@ int main(int argc, char *argv[]) { } } - RandomAccessInt32VectorReader alignment_reader(alignments_rspecifier); + RandomAccessInt32VectorReader phoneme_alignments_reader(phoneme_alignments_rspecifier); + RandomAccessInt32VectorReader transition_alignments_reader(transition_alignments_rspecifier); SequentialBaseFloatMatrixReader prob_reader(prob_rspecifier); PosteriorWriter gop_writer(gop_wspecifier); BaseFloatVectorWriter feat_writer(feat_wspecifier); @@ -182,25 +187,41 @@ int main(int argc, char *argv[]) { int32 num_done = 0; for (; !prob_reader.Done(); prob_reader.Next()) { std::string key = prob_reader.Key(); - if 
(!alignment_reader.HasKey(key)) { - KALDI_WARN << "No alignment for utterance " << key; + if (!phoneme_alignments_reader.HasKey(key)) { + KALDI_WARN << "No phoneme alignment for utterance " << key; continue; } - auto alignment = alignment_reader.Value(key); + if (!transition_alignments_reader.HasKey(key)) { + KALDI_WARN << "No transition alignment for utterance " << key; + continue; + } + auto phoneme_alignment = phoneme_alignments_reader.Value(key); + auto transition_alignment = transition_alignments_reader.Value(key); Matrix &probs = prob_reader.Value(); if (log_applied) probs.ApplyExp(); + std::vector > split; + SplitToPhones(trans_model, transition_alignment, &split); + + std::vector phone_boundary; + for (int32 i = 0; i < split.size(); i++) { + for (int32 j = 0; j < split[i].size(); j++) { + phone_boundary.push_back(static_cast(i)); + } + } + Matrix lpps; ComputeLpps(probs, pdf2phones, &lpps); - int32 frame_num = alignment.size(); - if (alignment.size() != probs.NumRows()) { + int32 frame_num = phoneme_alignment.size(); + if (phoneme_alignment.size() != probs.NumRows()) { KALDI_WARN << "The frame numbers of alignment and prob are not equal."; if (frame_num > probs.NumRows()) frame_num = probs.NumRows(); } KALDI_ASSERT(frame_num > 0); - int32 cur_phone_id = alignment[0]; + int32 cur_phone_id = phoneme_alignment[0]; + int32 cur_phone_pos = phone_boundary[0]; int32 duration = 0; Vector phone_level_feat(1 + phone_num * 2); // [phone LPPs LPRs] SubVector lpp_part(phone_level_feat, 1, phone_num); @@ -220,8 +241,9 @@ int main(int argc, char *argv[]) { lpp_part.AddVec(1, frame_level_lpp); duration++; - int32 next_phone_id = (i < frame_num - 1) ? alignment[i + 1]: -1; - if (next_phone_id != cur_phone_id) { + int32 next_phone_id = (i < frame_num - 1) ? phoneme_alignment[i + 1]: -1; + int32 next_phone_pos = (i < frame_num - 1) ? phone_boundary[i + 1]: -1; + if (next_phone_pos != cur_phone_pos) { int32 phone_id = phone_map.empty() ? 
cur_phone_id : phone_map[cur_phone_id]; // The current phone's feature have been ready @@ -248,6 +270,7 @@ int main(int argc, char *argv[]) { duration = 0; } cur_phone_id = next_phone_id; + cur_phone_pos = next_phone_pos; } // Write GOPs and the GOP-based features From 7c1c5e6eb99b0b8e115cccd243d943ba8b7356e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=A4=E3=83=B3=E3=83=A4=E3=83=B3?= Date: Sun, 23 Jun 2024 15:30:35 +0800 Subject: [PATCH 3/5] Update run.sh due to the changes in compute-gop.cc --- egs/gop_speechocean762/s5/run.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/egs/gop_speechocean762/s5/run.sh b/egs/gop_speechocean762/s5/run.sh index cf081a18133..989d247736f 100755 --- a/egs/gop_speechocean762/s5/run.sh +++ b/egs/gop_speechocean762/s5/run.sh @@ -2,6 +2,7 @@ # Copyright 2019 Junbo Zhang # 2020-2021 Xiaomi Corporation (Author: Junbo Zhang, Yongqing Wang) +# 2024 Jiun-Ting Li (National Taiwan Normal University) # Apache 2.0 # This script shows how to calculate Goodness of Pronunciation (GOP) and @@ -175,6 +176,7 @@ if [ $stage -le 12 ]; then compute-gop --phone-map=data/lang_nosp/phone-to-pure-phone.int \ --skip-phones-string=0:1:2 \ $model/final.mdl \ + "ark,t:gunzip -c exp/ali_$part/ali.JOB.gz|" \ "ark,t:gunzip -c exp/ali_$part/ali-phone.JOB.gz|" \ "ark:exp/probs_$part/output.JOB.ark" \ "ark,scp:exp/gop_$part/gop.JOB.ark,exp/gop_$part/gop.JOB.scp" \ From 768a23eb43e1ca841e33326c872344c658d157c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=A4=E3=83=B3=E3=83=A4=E3=83=B3?= Date: Mon, 24 Jun 2024 10:31:19 +0800 Subject: [PATCH 4/5] Correct the last commit of compute-gop.cc Following the advice of code reviewer @csukuangfj, I updated the code to address all the identified issues and follow best practices.
--- src/bin/compute-gop.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bin/compute-gop.cc b/src/bin/compute-gop.cc index 4f6180a47ba..08847579f85 100644 --- a/src/bin/compute-gop.cc +++ b/src/bin/compute-gop.cc @@ -112,9 +112,10 @@ int main(int argc, char *argv[]) { " " " " " " - "[]\n" + "\n" "e.g.:\n" - " nnet3-compute [args] | compute-gop 1.mdl ark:ali-phone.1 ark:-" + " nnet3-compute [args] | compute-gop 1.mdl ark:ali.1 ark:ali-phone.1 " + " ark:output.1.ark " " ark:gop.1 ark:phone-feat.1\n"; ParseOptions po(usage); @@ -206,7 +207,7 @@ int main(int argc, char *argv[]) { std::vector phone_boundary; for (int32 i = 0; i < split.size(); i++) { for (int32 j = 0; j < split[i].size(); j++) { - phone_boundary.push_back(static_cast(i)); + phone_boundary.push_back(i); } } From 0e61da99cae607ded8624d8717268234b618cff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=A4=E3=83=B3=E3=83=A4=E3=83=B3?= Date: Sat, 29 Jun 2024 13:00:32 +0800 Subject: [PATCH 5/5] Update README.md Remove the deprecated document link at the request of @jimbozhang. --- egs/gop_speechocean762/README.md | 5 ----- 1 file changed, 5 deletions(-) diff --git a/egs/gop_speechocean762/README.md b/egs/gop_speechocean762/README.md index 77b520eadee..1c39f2f1cc6 100644 --- a/egs/gop_speechocean762/README.md +++ b/egs/gop_speechocean762/README.md @@ -1,8 +1,3 @@ -There is a copy of this document on Google Docs, which renders the equations better: -[link](https://docs.google.com/document/d/1pie-PU6u2NZZC_FzocBGGm6mpfBJMiCft9UoG0uA1kA/edit?usp=sharing) - -* * * - # GOP on Kaldi The Goodness of Pronunciation (GOP) is a variation of the posterior probability, for phone level pronunciation scoring.