Skip to content

Commit 87a3e2d

Browse files
authored
Merge pull request #284 from cmusphinx/cmn_api
Add a proper API for cepstral mean normalization
2 parents eb6ce7b + 903fda3 commit 87a3e2d

11 files changed

Lines changed: 320 additions & 128 deletions

File tree

cython/_pocketsphinx.pxd

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,8 @@ cdef extern from "pocketsphinx.h":
341341
int ps_free(ps_decoder_t *ps)
342342
int ps_reinit(ps_decoder_t *ps, cmd_ln_t *config)
343343
int ps_reinit_feat(ps_decoder_t *ps, cmd_ln_t *config)
344+
const char *ps_get_cmn(ps_decoder_t *ps, int update)
345+
int ps_set_cmn(ps_decoder_t *ps, const char *cmn)
344346
logmath_t *ps_get_logmath(ps_decoder_t *ps)
345347
int ps_start_stream(ps_decoder_t *ps)
346348
int ps_get_in_speech(ps_decoder_t *ps)

cython/_pocketsphinx.pyx

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,27 @@ cdef class Decoder:
803803
if ps_reinit_feat(self.ps, cconfig) < 0:
804804
raise RuntimeError("Failed to reinitialize feature extraction")
805805

806+
def get_cmn(self, update=False):
807+
"""Get current cepstral mean.
808+
809+
Args:
810+
update(boolean): Update the mean based on current utterance.
811+
Returns:
812+
str: Cepstral mean as a comma-separated list of numbers.
813+
"""
814+
cdef const char *cmn = ps_get_cmn(self.ps, update)
815+
return cmn.decode("utf-8")
816+
817+
def set_cmn(self, cmn):
818+
"""Set current cepstral mean.
819+
820+
Args:
821+
cmn(str): Cepstral mean as a comma-separated list of numbers.
822+
"""
823+
cdef int rv = ps_set_cmn(self.ps, cmn.encode("utf-8"))
824+
if rv != 0:
825+
raise ValueError("Invalid CMN string")
826+
806827
def start_stream(self):
807828
"""Reset noise statistics.
808829
@@ -1535,6 +1556,39 @@ cdef class Vad:
15351556

15361557
cdef class Endpointer:
15371558
"""Simple endpointer using voice activity detection.
1559+
1560+
Args:
1561+
window(float): Length in seconds of window for decision.
1562+
ratio(float): Fraction of window that must be speech or
1563+
non-speech to make a transition.
1564+
mode(int): Aggressiveness of voice activity detection (0-3)
1565+
sample_rate(int): Sampling rate of input, default is 16000.
1566+
Rates other than 8000, 16000, 32000, 48000
1567+
are only approximately supported, see note
1568+
in `frame_length`. Outlandish sampling
1569+
rates like 3924 and 115200 will raise a
1570+
`ValueError`.
1571+
frame_length(float): Desired input frame length in seconds,
1572+
default is 0.03. The *actual* frame
1573+
length may be different if an
1574+
approximately supported sampling rate is
1575+
requested. You must *always* use the
1576+
`frame_bytes` and `frame_length`
1577+
attributes to determine the input size.
1578+
1579+
Attributes:
1580+
sample_rate(int): Sampling rate of input (default is 16000)
1581+
frame_bytes(int): Number of bytes in a frame accepted by `process`.
1582+
frame_length(float): Length of a frame (*may be different from
1583+
the one requested in the constructor*!)
1584+
in_speech(boolean): Are we currently in a speech region?
1585+
speech_start(float): Start of previous speech segment.
1586+
speech_end(float): End of previous speech segment.
1587+
1588+
Raises:
1589+
ValueError: Invalid input parameter. Also raised if the ratio
1590+
makes it impossible to do endpointing (i.e. it
1591+
is more than N-1 or less than 1 frame).
15381592
"""
15391593
cdef ps_endpointer_t *_ep
15401594
DEFAULT_WINDOW = PS_ENDPOINTER_DEFAULT_WINDOW
@@ -1597,6 +1651,19 @@ cdef class Endpointer:
15971651
return (<const unsigned char *>&outframe[0])[:n_samples * 2]
15981652

15991653
def end_stream(self, frame):
1654+
"""Read a final frame of data and return speech if any.
1655+
1656+
Args:
1657+
frame(bytes): Buffer containing speech data (16-bit signed
1658+
integers). Must be of length `frame_bytes`
1659+
(in bytes) *or less*.
1660+
Returns:
1661+
(bytes) Remaining speech data (could be more than one frame),
1662+
or None if none detected.
1663+
Raises:
1664+
IndexError: `frame` is of invalid size.
1665+
ValueError: Other internal VAD error.
1666+
"""
16001667
cdef const unsigned char[:] cframe = frame
16011668
cdef Py_ssize_t n_samples = len(cframe) // 2
16021669
cdef const short *outbuf
@@ -1612,7 +1679,15 @@ cdef class Endpointer:
16121679
return (<const unsigned char *>&outbuf[0])[:out_n_samples * 2]
16131680

16141681
def set_loglevel(level):
1682+
"""Set internal log level of PocketSphinx.
1683+
1684+
Args:
1685+
level(str): one of "DEBUG", "INFO", "ERROR", "FATAL".
1686+
Raises:
1687+
ValueError: Invalid log level string.
1688+
"""
16151689
cdef const char *prev_level
16161690
prev_level = err_set_loglevel_str(level.encode('utf-8'))
16171691
if prev_level == NULL:
16181692
raise ValueError("Invalid log level %s" % level)
1693+

cython/test/continuous_test.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,30 +14,30 @@ def test_continuous(self):
1414
config.set_string("-hmm", os.path.join(MODELDIR, "en-us/en-us"))
1515
config.set_string("-lm", os.path.join(MODELDIR, "en-us/en-us.lm.bin"))
1616
config.set_string("-dict", os.path.join(MODELDIR, "en-us/cmudict-en-us.dict"))
17-
config.set_string(
18-
"-cmninit",
19-
"41.00,-5.29,-0.12,5.09,2.48,-4.07,-1.37,-1.78,-5.08,-2.05,-6.45,-1.42,1.17",
17+
prev_cmn = (
18+
"41,-5.29,-0.12,5.09,2.48,-4.07,-1.37,-1.78,-5.08,-2.05,-6.45,-1.42,1.17"
2019
)
20+
config.set_string("-cmninit", prev_cmn)
2121
decoder = Decoder(config)
22+
self.assertEqual(prev_cmn, decoder.get_cmn(False))
2223

2324
with open(os.path.join(DATADIR, "goforward.raw"), "rb") as stream:
24-
in_speech_bf = False
2525
decoder.start_utt()
2626
while True:
2727
buf = stream.read(1024)
2828
if buf:
2929
decoder.process_raw(buf, False, False)
30-
if decoder.get_in_speech() != in_speech_bf:
31-
in_speech_bf = decoder.get_in_speech()
32-
if not in_speech_bf:
33-
decoder.end_utt()
34-
print('Result:', decoder.hyp().hypstr)
35-
decoder.start_utt()
30+
cmn = decoder.get_cmn(True)
31+
self.assertNotEqual(prev_cmn, cmn)
32+
prev_cmn = cmn
3633
else:
3734
break
3835
decoder.end_utt()
3936
print("Result:", decoder.hyp().hypstr)
4037
self.assertEqual("go forward ten meters", decoder.hyp().hypstr)
38+
cmn = decoder.get_cmn(False)
39+
self.assertNotEqual(prev_cmn, cmn)
40+
prev_cmn = cmn
4141

4242

4343
if __name__ == "__main__":

include/pocketsphinx.h

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,13 @@ int ps_reinit(ps_decoder_t *ps, cmd_ln_t *config);
139139
*
140140
* This function allows you to switch the feature computation
141141
* parameters without otherwise affecting the decoder configuration.
142-
* For example, if you change the sample rate or the frame rate, the
143-
* cepstral mean, or the VTLN warping factor, and do not need to
144-
* reconfigure the rest of the decoder.
142+
* For example, if you change the sample rate or the frame rate, and
143+
* do not want to reconfigure the rest of the decoder.
145144
*
146145
* Note that if your code has modified any internal parameters in the
147146
* \ref acmod_t, these will be overridden by values from the config.
147+
* Likewise if you have set a custom cepstral mean with ps_set_cmn(),
148+
* it will be overridden.
148149
*
149150
* @note The decoder retains ownership of the pointer `config`, so you
150151
* should free it when no longer used.
@@ -158,6 +159,40 @@ int ps_reinit(ps_decoder_t *ps, cmd_ln_t *config);
158159
POCKETSPHINX_EXPORT
159160
int ps_reinit_feat(ps_decoder_t *ps, cmd_ln_t *config);
160161

162+
/**
163+
* Get the current cepstral mean as a string.
164+
*
165+
* This is the string representation of the current cepstral mean,
166+
* which represents the acoustic channel conditions in live
167+
* recognition. This can be used to initialize the decoder with the
168+
* `-cmninit` flag.
169+
*
170+
* @param ps Decoder
171+
* @param update Update the cepstral mean using data processed so far.
172+
* @return String representation of cepstral mean, as
173+
* `-ceplen` comma-separated numbers. This pointer is owned
174+
* by the decoder and only valid until the next call to
175+
* ps_get_cmn(), ps_set_cmn() or ps_end_utt().
176+
*/
177+
POCKETSPHINX_EXPORT
178+
const char *ps_get_cmn(ps_decoder_t *ps, int update);
179+
180+
/**
181+
* Set the current cepstral mean from a string.
182+
*
183+
* This does the same thing as setting `-cmninit` and running
184+
* `ps_reinit_feat()` but is more efficient, and can also be
185+
* done in the middle of an utterance if you like.
186+
*
187+
* @param ps Decoder
188+
* @param cmn String representation of cepstral mean, as
189+
* up to `-ceplen` comma-separated numbers (any
190+
* missing values will be zero-filled).
191+
* @return 0 for success or -1 for invalid input.
192+
*/
193+
POCKETSPHINX_EXPORT
194+
int ps_set_cmn(ps_decoder_t *ps, const char *cmn);
195+
161196
/**
162197
* Returns the argument definitions used in ps_init().
163198
*

include/sphinxbase/cmn.h

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,8 @@ typedef struct {
131131
mfcc_t *sum; /**< Accumulated cepstra for computing mean */
132132
int32 nframe; /**< Number of frames */
133133
int32 veclen; /**< Length of cepstral vector */
134+
char *repr; /**< String representation of current means */
135+
int refcount;
134136
} cmn_t;
135137

136138
SPHINXBASE_EXPORT
@@ -169,14 +171,36 @@ SPHINXBASE_EXPORT
169171
void cmn_live_update(cmn_t *cmn);
170172

171173
/**
172-
* Set the live mean.
174+
* Set live mean from a vector of length cmn->veclen
175+
*/
176+
void cmn_live_set(cmn_t *cmn, mfcc_t const * vec);
177+
178+
/**
179+
* Get the string representation of the live mean.
180+
*/
181+
#define cmn_repr(cmn) (cmn)->repr
182+
183+
/**
184+
* Update the string representation.
185+
*/
186+
const char *cmn_update_repr(cmn_t *cmn);
187+
188+
/**
189+
* Set the live mean from a string.
173190
*/
174191
SPHINXBASE_EXPORT
175-
void cmn_live_set(cmn_t *cmn, mfcc_t const *vec);
192+
int cmn_set_repr(cmn_t *cmn, char const *repr);
176193

177-
/* RAH, free previously allocated memory */
194+
/**
195+
* Retain a CMN.
196+
*/
197+
cmn_t *cmn_retain(cmn_t *cmn);
198+
199+
/**
200+
* Release a CMN, possibly freeing it.
201+
*/
178202
SPHINXBASE_EXPORT
179-
void cmn_free (cmn_t *cmn);
203+
int cmn_free (cmn_t *cmn);
180204

181205
#ifdef __cplusplus
182206
}

src/acmod.c

Lines changed: 6 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,6 @@
6565
#include "ms_mgau.h"
6666

6767
static int32 acmod_process_mfcbuf(acmod_t *acmod);
68-
static const char *acmod_update_cmninit(acmod_t *acmod);
6968

7069
static int
7170
acmod_init_am(acmod_t *acmod)
@@ -196,23 +195,8 @@ acmod_reinit_feat(acmod_t *acmod, fe_t *fe, feat_t *fcb)
196195
if (fcb->cmn_struct
197196
&& cmd_ln_exists_r(acmod->config, "-cmninit")
198197
&& cmd_ln_str_r(acmod->config, "-cmninit")) {
199-
char *c, *cc, *vallist;
200-
int32 nvals;
201-
202-
vallist = ckd_salloc(cmd_ln_str_r(acmod->config, "-cmninit"));
203-
c = vallist;
204-
nvals = 0;
205-
while (nvals < fcb->cmn_struct->veclen
206-
&& (cc = strchr(c, ',')) != NULL) {
207-
*cc = '\0';
208-
fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c));
209-
c = cc + 1;
210-
++nvals;
211-
}
212-
if (nvals < fcb->cmn_struct->veclen && *c != '\0') {
213-
fcb->cmn_struct->cmn_mean[nvals] = FLOAT2MFCC(atof_c(c));
214-
}
215-
ckd_free(vallist);
198+
E_INFO("Setting initial CMN to %s\n", cmd_ln_str_r(acmod->config, "-cmninit"));
199+
cmn_set_repr(fcb->cmn_struct, cmd_ln_str_r(acmod->config, "-cmninit"));
216200
}
217201
}
218202
if (acmod_feat_mismatch(acmod, fcb)) {
@@ -462,7 +446,11 @@ acmod_end_utt(acmod_t *acmod)
462446
/* Process whatever's left, and any leadout. */
463447
if (nfr)
464448
nfr = acmod_process_mfcbuf(acmod);
449+
else /* Make sure to update CMN! */
450+
feat_update_stats(acmod->fcb);
465451
}
452+
else /* Make sure to update CMN! */
453+
feat_update_stats(acmod->fcb);
466454
if (acmod->mfcfh) {
467455
int32 outlen, rv;
468456
outlen = (ftell(acmod->mfcfh) - 4) / 4;
@@ -484,45 +472,9 @@ acmod_end_utt(acmod_t *acmod)
484472
acmod->senfh = NULL;
485473
}
486474

487-
acmod_update_cmninit(acmod);
488-
489475
return nfr;
490476
}
491477

492-
static const char *
493-
acmod_update_cmninit(acmod_t *acmod)
494-
{
495-
char *cmninit, *ptr;
496-
cmn_t *cmn;
497-
int i, len;
498-
499-
if (acmod->fcb == NULL)
500-
return NULL;
501-
if ((cmn = acmod->fcb->cmn_struct) == NULL)
502-
return NULL;
503-
len = 0;
504-
for (i = 0; i < cmn->veclen; ++i) {
505-
int nbytes = snprintf(NULL, 0, "%g,", cmn->cmn_mean[i]);
506-
if (nbytes <= 0) {
507-
E_ERROR_SYSTEM("Failed to format %g for cmninit", cmn->cmn_mean[i]);
508-
return NULL;
509-
}
510-
len += nbytes;
511-
}
512-
len++;
513-
ptr = cmninit = ckd_malloc(len);
514-
if (ptr == NULL) {
515-
E_ERROR_SYSTEM("Failed to allocate %d bytes for cmninit", len);
516-
return NULL;
517-
}
518-
for (i = 0; i < cmn->veclen; ++i)
519-
ptr += snprintf(ptr, cmninit + len - ptr, "%g,", cmn->cmn_mean[i]);
520-
*--ptr = '\0';
521-
cmd_ln_set_str_r(acmod->config, "-cmninit", cmninit);
522-
ckd_free(cmninit);
523-
return cmd_ln_str_r(acmod->config, "-cmninit");
524-
}
525-
526478
static int
527479
acmod_log_mfc(acmod_t *acmod,
528480
mfcc_t **cep, int n_frames)

0 commit comments

Comments
 (0)