SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 #include <stdio.h> 00038 #include <stdlib.h> 00039 #include <string.h> 00040 #include <time.h> 00041 #include <assert.h> 00042 00043 #ifdef HAVE_CONFIG_H 00044 #include <config.h> 00045 #endif 00046 00047 #ifdef HAVE_SNDFILE_H 00048 #include <sndfile.h> 00049 #endif 00050 00051 #include <sphinxbase/fe.h> 00052 #include <sphinxbase/strfuncs.h> 00053 #include <sphinxbase/pio.h> 00054 #include <sphinxbase/filename.h> 00055 #include <sphinxbase/cmd_ln.h> 00056 #include <sphinxbase/err.h> 00057 #include <sphinxbase/ckd_alloc.h> 00058 #include <sphinxbase/byteorder.h> 00059 #include <sphinxbase/hash_table.h> 00060 00061 #include "sphinx_wave2feat.h" 00062 #include "cmd_ln_defn.h" 00063 00064 typedef struct audio_type_s { 00065 char const *name; 00066 int (*detect)(sphinx_wave2feat_t *wtf); 00067 int (*decode)(sphinx_wave2feat_t *wtf); 00068 } audio_type_t; 00069 00070 typedef struct output_type_s { 00071 char const *name; 00072 int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat); 00073 int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr); 00074 } output_type_t; 00075 00076 struct sphinx_wave2feat_s { 00077 int refcount; 00078 cmd_ln_t *config; 00079 fe_t *fe; 00080 char *infile; 00081 char *outfile; 00082 FILE *infh; 00083 FILE *outfh; 00084 short *audio; 00085 mfcc_t **feat; 00086 int blocksize; 00087 int featsize; 00088 int veclen; 00089 int in_veclen; 00090 int byteswap; 00091 #ifdef HAVE_SNDFILE_H 00092 SNDFILE *insfh; 00093 #endif 00094 output_type_t const *ot; 00095 }; 00096 00098 typedef struct RIFFHeader{ 00099 char rifftag[4]; /* "RIFF" string */ 00100 int32 TotalLength; /* Total length */ 00101 char wavefmttag[8]; /* "WAVEfmt " string (note space after 't') */ 00102 int32 RemainingLength; /* Remaining length */ 00103 int16 data_format; /* data format tag, 1 = PCM */ 00104 int16 numchannels; /* Number of channels in file */ 00105 int32 SamplingFreq; /* Sampling frequency */ 00106 int32 BytesPerSec; /* Average bytes/sec */ 00107 int16 BlockAlign; /* Block align */ 00108 int16 BitsPerSample; /* 8 or 16 bit */ 00109 char datatag[4]; /* "data" string */ 00110 int32 datalength; /* Raw data length */ 00111 } MSWAV_hdr; 00112 00118 static int 00119 detect_riff(sphinx_wave2feat_t *wtf) 00120 { 00121 FILE *fh; 00122 MSWAV_hdr hdr; 00123 00124 if ((fh = fopen(wtf->infile, "rb")) == NULL) { 00125 E_ERROR_SYSTEM("Failed to open %s", wtf->infile); 00126 return -1; 00127 } 00128 if (fread(&hdr, sizeof(hdr), 1, fh) != 1) { 00129 E_ERROR_SYSTEM("Failed to read RIFF header"); 00130 fclose(fh); 00131 return -1; 00132 } 00133 /* Make sure it is actually a RIFF file. */ 00134 if (0 != memcmp(hdr.rifftag, "RIFF", 4)) { 00135 fclose(fh); 00136 return FALSE; 00137 } 00138 00139 /* Get relevant information. */ 00140 cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels); 00141 cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq); 00142 wtf->infh = fh; 00143 00144 return TRUE; 00145 } 00146 00147 static int 00148 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian) 00149 { 00150 char nist[7]; 00151 lineiter_t *li; 00152 FILE *fh; 00153 00154 if ((fh = fopen(infile, "rb")) == NULL) { 00155 E_ERROR_SYSTEM("Failed to open %s", infile); 00156 return -1; 00157 } 00158 if (fread(&nist, 1, 7, fh) != 7) { 00159 E_ERROR_SYSTEM("Failed to read NIST header"); 00160 fclose(fh); 00161 return -1; 00162 } 00163 /* Is this actually a NIST file? */ 00164 if (0 != strncmp(nist, "NIST_1A", 7)) { 00165 fclose(fh); 00166 return FALSE; 00167 } 00168 /* Rewind, parse lines. */ 00169 fseek(fh, 0, SEEK_SET); 00170 for (li = lineiter_start(fh); li; li = lineiter_next(li)) { 00171 char **words; 00172 int nword; 00173 00174 string_trim(li->buf, STRING_BOTH); 00175 if (strlen(li->buf) == 0) { 00176 lineiter_free(li); 00177 break; 00178 } 00179 nword = str2words(li->buf, NULL, 0); 00180 if (nword != 3) 00181 continue; 00182 words = ckd_calloc(nword, sizeof(*words)); 00183 str2words(li->buf, words, nword); 00184 if (0 == strcmp(words[0], "sample_rate")) { 00185 cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2])); 00186 } 00187 if (0 == strcmp(words[0], "channel_count")) { 00188 cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2])); 00189 } 00190 if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) { 00191 cmd_ln_set_str_r(wtf->config, "-input_endian", 00192 (0 == strcmp(words[2], "10")) ? "big" : "little"); 00193 } 00194 ckd_free(words); 00195 } 00196 00197 fseek(fh, 1024, SEEK_SET); 00198 if (out_fh) 00199 *out_fh = fh; 00200 else 00201 fclose(fh); 00202 return TRUE; 00203 } 00204 00205 #ifdef HAVE_POPEN 00206 static int 00207 detect_sph2pipe(sphinx_wave2feat_t *wtf) 00208 { 00209 FILE *fh; 00210 char *cmdline; 00211 int rv; 00212 00213 /* Determine if it's NIST file and get parameters. */ 00214 if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE) 00215 return rv; 00216 00217 /* Now popen it with sph2pipe. */ 00218 cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL); 00219 if ((fh = popen(cmdline, "r")) == NULL) { 00220 E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile); 00221 ckd_free(cmdline); 00222 return -1; 00223 } 00224 00225 wtf->infh = fh; 00226 return TRUE; 00227 } 00228 #else /* !HAVE_POPEN */ 00229 static int 00230 detect_sph2pipe(sphinx_wave2feat_t *wtf) 00231 { 00232 E_ERROR("popen() not available, cannot run sph2pipe\n"); 00233 return -1; 00234 } 00235 #endif /* !HAVE_POPEN */ 00236 00242 static int 00243 detect_nist(sphinx_wave2feat_t *wtf) 00244 { 00245 FILE *fh; 00246 int rv; 00247 00248 if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE) 00249 return rv; 00250 wtf->infh = fh; 00251 00252 return TRUE; 00253 } 00254 00255 00262 static int 00263 detect_raw(sphinx_wave2feat_t *wtf) 00264 { 00265 FILE *fh; 00266 00267 if ((fh = fopen(wtf->infile, "rb")) == NULL) { 00268 E_ERROR_SYSTEM("Failed to open %s", wtf->infile); 00269 return -1; 00270 } 00271 wtf->infh = fh; 00272 return TRUE; 00273 } 00274 00281 static int 00282 detect_sphinx_mfc(sphinx_wave2feat_t *wtf) 00283 { 00284 FILE *fh; 00285 int32 len; 00286 long flen; 00287 00288 if ((fh = fopen(wtf->infile, "rb")) == NULL) { 00289 E_ERROR_SYSTEM("Failed to open %s", wtf->infile); 00290 return -1; 00291 } 00292 if (fread(&len, 4, 1, fh) != 1) { 00293 E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile); 00294 fclose(fh); 00295 return -1; 00296 } 00297 fseek(fh, 0, SEEK_END); 00298 flen = ftell(fh); 00299 00300 /* figure out whether to byteswap */ 00301 flen = (flen / 4) - 1; 00302 if (flen != len) { 00303 /* First make sure this is an endianness problem, otherwise fail. */ 00304 SWAP_INT32(&len); 00305 if (flen != len) { 00306 SWAP_INT32(&len); 00307 E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n", 00308 len, flen); 00309 return -1; 00310 } 00311 /* Set the input endianness to the opposite of the machine endianness... */ 00312 cmd_ln_set_str_r(wtf->config, "-input_endian", 00313 (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian")) 00314 ? "little" : "big")); 00315 } 00316 00317 fseek(fh, 4, SEEK_SET); 00318 wtf->infh = fh; 00319 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) { 00320 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt"); 00321 } 00322 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00323 wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep"); 00324 wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt"); 00325 } 00326 else { 00327 /* Should not happen. */ 00328 E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n"); 00329 assert(FALSE); 00330 } 00331 00332 return TRUE; 00333 } 00334 00335 int 00336 mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan) 00337 { 00338 int i, j; 00339 00340 if (whichchan > 0) { 00341 for (i = whichchan - 1; i < nsamp; i += nchans) 00342 buf[i/nchans] = buf[i]; 00343 } 00344 else { 00345 for (i = 0; i < nsamp; i += nchans) { 00346 float64 tmp = 0.0; 00347 for (j = 0; j < nchans && i + j < nsamp; ++j) { 00348 tmp += buf[i + j]; 00349 } 00350 buf[i/nchans] = (int16)(tmp / nchans); 00351 } 00352 } 00353 return i/nchans; 00354 } 00355 00356 #ifdef HAVE_SNDFILE_H 00357 00362 static int 00363 detect_sndfile(sphinx_wave2feat_t *wtf) 00364 { 00365 SNDFILE *sf; 00366 SF_INFO sfinfo; 00367 00368 memset(&sfinfo, 0, sizeof(sfinfo)); 00369 /* We let other detectors catch I/O errors, since there is 00370 no way to tell them from format errors when opening :( */ 00371 if ((sf = sf_open(wtf->infile, SFM_READ, &sfinfo)) == NULL) { 00372 return FALSE; 00373 } 00374 /* Get relevant information. */ 00375 cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels); 00376 cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate); 00377 wtf->insfh = sf; 00378 wtf->infh = NULL; 00379 00380 return TRUE; 00381 } 00382 00387 static int 00388 decode_sndfile(sphinx_wave2feat_t *wtf) 00389 { 00390 size_t nsamp; 00391 int32 nfr, nchans, whichchan; 00392 int nfloat, n; 00393 00394 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00395 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan"); 00396 fe_start_utt(wtf->fe); 00397 nfloat = 0; 00398 while ((nsamp = sf_read_short(wtf->insfh, 00399 wtf->audio, 00400 wtf->blocksize)) != 0) { 00401 int16 const *inspeech; 00402 size_t nvec; 00403 00404 /* Mix or pick channels. */ 00405 if (nchans > 1) 00406 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan); 00407 00408 inspeech = wtf->audio; 00409 nvec = wtf->featsize; 00410 /* Consume all samples. */ 00411 while (nsamp) { 00412 nfr = nvec; 00413 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr); 00414 if (nfr) { 00415 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00416 return -1; 00417 nfloat += n; 00418 } 00419 } 00420 inspeech = wtf->audio; 00421 } 00422 /* Now process any leftover audio frames. */ 00423 fe_end_utt(wtf->fe, wtf->feat[0], &nfr); 00424 if (nfr) { 00425 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00426 return -1; 00427 nfloat += n; 00428 } 00429 00430 sf_close(wtf->insfh); 00431 wtf->insfh = NULL; 00432 return nfloat; 00433 } 00434 #endif /* HAVE_SNDFILE_H */ 00435 00440 static int 00441 decode_pcm(sphinx_wave2feat_t *wtf) 00442 { 00443 size_t nsamp; 00444 int32 nfr, nchans, whichchan; 00445 int nfloat, n; 00446 00447 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00448 whichchan = cmd_ln_int32_r(wtf->config, "-whichchan"); 00449 fe_start_utt(wtf->fe); 00450 nfloat = 0; 00451 while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) { 00452 size_t nvec; 00453 int16 const *inspeech; 00454 00455 /* Byteswap stuff here if necessary. */ 00456 if (wtf->byteswap) { 00457 for (n = 0; n < nsamp; ++n) 00458 SWAP_INT16(wtf->audio + n); 00459 } 00460 00461 /* Mix or pick channels. */ 00462 if (nchans > 1) 00463 nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan); 00464 00465 inspeech = wtf->audio; 00466 nvec = wtf->featsize; 00467 /* Consume all samples. */ 00468 while (nsamp) { 00469 nfr = nvec; 00470 fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr); 00471 if (nfr) { 00472 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00473 return -1; 00474 nfloat += n; 00475 } 00476 } 00477 inspeech = wtf->audio; 00478 } 00479 /* Now process any leftover audio frames. */ 00480 fe_end_utt(wtf->fe, wtf->feat[0], &nfr); 00481 if (nfr) { 00482 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00483 return -1; 00484 nfloat += n; 00485 } 00486 00487 if (fclose(wtf->infh) == EOF) 00488 E_ERROR_SYSTEM("Failed to close input file"); 00489 wtf->infh = NULL; 00490 return nfloat; 00491 } 00492 00497 static int 00498 decode_sphinx_mfc(sphinx_wave2feat_t *wtf) 00499 { 00500 int nfloat = 0, n; 00501 int featsize = wtf->featsize; 00502 00503 /* If the input vector length is less than the output length, we 00504 * need to do this one frame at a time, because there's empty 00505 * space at the end of each vector in wtf->feat. */ 00506 if (wtf->in_veclen < wtf->veclen) 00507 featsize = 1; 00508 while ((n = fread(wtf->feat[0], sizeof(**wtf->feat), 00509 featsize * wtf->in_veclen, wtf->infh)) != 0) { 00510 int i, nfr = n / wtf->in_veclen; 00511 if (n % wtf->in_veclen) { 00512 E_ERROR("Size of file %d not a multiple of veclen %d\n", 00513 n, wtf->in_veclen); 00514 return -1; 00515 } 00516 /* Byteswap stuff here if necessary. */ 00517 if (wtf->byteswap) { 00518 for (i = 0; i < n; ++i) 00519 SWAP_FLOAT32(wtf->feat[0] + i); 00520 } 00521 fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr); 00522 for (i = 0; i < nfr; ++i) { 00523 if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) { 00524 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy")) 00525 fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]); 00526 else 00527 fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]); 00528 } 00529 else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00530 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]); 00531 } 00532 } 00533 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0) 00534 return -1; 00535 nfloat += n; 00536 } 00537 00538 if (fclose(wtf->infh) == EOF) 00539 E_ERROR_SYSTEM("Failed to close input file"); 00540 wtf->infh = NULL; 00541 return nfloat; 00542 } 00543 00544 static const audio_type_t types[] = { 00545 #ifdef HAVE_SNDFILE_H 00546 { "-sndfile", &detect_sndfile, &decode_sndfile }, 00547 #endif 00548 { "-mswav", &detect_riff, &decode_pcm }, 00549 { "-nist", &detect_nist, &decode_pcm }, 00550 { "-raw", &detect_raw, &decode_pcm }, 00551 { "-sph2pipe", &detect_sph2pipe, &decode_pcm } 00552 }; 00553 static const int ntypes = sizeof(types)/sizeof(types[0]); 00554 static const audio_type_t mfcc_type = { 00555 "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc 00556 }; 00557 00563 static int 00564 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat) 00565 { 00566 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) { 00567 E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile); 00568 return -1; 00569 } 00570 return 0; 00571 } 00572 00578 static int 00579 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00580 { 00581 int i, nfloat = 0; 00582 00583 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00584 for (i = 0; i < nfr; ++i) { 00585 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) { 00586 E_ERROR_SYSTEM("Writing %d values to %s failed", 00587 wtf->veclen, wtf->outfile); 00588 return -1; 00589 } 00590 nfloat += wtf->veclen; 00591 } 00592 return nfloat; 00593 } 00594 00595 typedef enum htk_feature_kind_e { 00596 WAVEFORM = 0, /* PCM audio (rarely used) */ 00597 LPC = 1, /* LPC filter coefficients */ 00598 LPCREFC = 2, /* LPC reflection coefficients */ 00599 LPCEPSTRA = 3, /* LPC-based cepstral coefficients */ 00600 LPCDELCEP = 4, /* LPCC plus deltas */ 00601 IREFC = 5, /* 16-bit integer LPC reflection coefficients */ 00602 MFCC = 6, /* MFCCs */ 00603 FBANK = 7, /* Log mel spectrum */ 00604 MELSPEC = 8, /* Linear mel spectrum */ 00605 USER = 9, /* User defined */ 00606 DISCRETE = 10, /* Vector quantized data */ 00607 PLP = 11 /* PLP coefficients */ 00608 } htk_feature_kind_t; 00609 00610 typedef enum htk_feature_flag_e { 00611 _E = 0000100, /* has energy */ 00612 _N = 0000200, /* absolute energy supressed */ 00613 _D = 0000400, /* has delta coefficients */ 00614 _A = 0001000, /* has acceleration (delta-delta) coefficients */ 00615 _C = 0002000, /* is compressed */ 00616 _Z = 0004000, /* has zero mean static coefficients (i.e. CMN) */ 00617 _K = 0010000, /* has CRC checksum */ 00618 _O = 0020000, /* has 0th cepstral coefficient */ 00619 _V = 0040000, /* has VQ data */ 00620 _T = 0100000 /* has third differential coefficients */ 00621 } htk_feature_flag_t; 00622 00626 static int 00627 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat) 00628 { 00629 int32 samp_period; 00630 int16 samp_size; 00631 int16 param_kind; 00632 int swap = FALSE; 00633 00634 /* HTK files are big-endian. */ 00635 if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian"))) 00636 swap = TRUE; 00637 /* Same file size thing as in Sphinx files (I think) */ 00638 if (swap) SWAP_INT32(&nfloat); 00639 if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) 00640 return -1; 00641 /* Sample period in 100ns units. */ 00642 samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate")); 00643 if (swap) SWAP_INT32(&samp_period); 00644 if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1) 00645 return -1; 00646 /* Sample size - veclen * sizeof each sample. */ 00647 samp_size = wtf->veclen * 4; 00648 if (swap) SWAP_INT16(&samp_size); 00649 if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1) 00650 return -1; 00651 /* Format and flags. */ 00652 if (cmd_ln_boolean_r(wtf->config, "-logspec") 00653 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) 00654 param_kind = FBANK; /* log mel-filter bank outputs */ 00655 else 00656 param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */ 00657 if (swap) SWAP_INT16(¶m_kind); 00658 if (fwrite(¶m_kind, 2, 1, wtf->outfh) != 1) 00659 return -1; 00660 00661 return 0; 00662 } 00663 00667 static int 00668 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00669 { 00670 int i, j, swap, htk_reorder, nfloat = 0; 00671 00672 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00673 /* This is possibly inefficient, but probably not a big deal. */ 00674 swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian"))); 00675 htk_reorder = (0 == strcmp("htk", wtf->ot->name) 00676 && !(cmd_ln_boolean_r(wtf->config, "-logspec") 00677 || cmd_ln_boolean_r(wtf->config, "-cep2spec"))); 00678 for (i = 0; i < nfr; ++i) { 00679 if (htk_reorder) { 00680 mfcc_t c0 = frames[i][0]; 00681 memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4); 00682 frames[i][wtf->veclen - 1] = c0; 00683 } 00684 if (swap) 00685 for (j = 0; j < wtf->veclen; ++j) 00686 SWAP_FLOAT32(frames[i] + j); 00687 if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) { 00688 E_ERROR_SYSTEM("Writing %d values to %s failed", 00689 wtf->veclen, wtf->outfile); 00690 return -1; 00691 } 00692 nfloat += wtf->veclen; 00693 } 00694 return nfloat; 00695 } 00696 00700 static int 00701 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr) 00702 { 00703 int i, j, nfloat = 0; 00704 00705 fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr); 00706 for (i = 0; i < nfr; ++i) { 00707 for (j = 0; j < wtf->veclen; ++j) { 00708 fprintf(wtf->outfh, "%.5g", frames[i][j]); 00709 if (j == wtf->veclen - 1) 00710 fprintf(wtf->outfh, "\n"); 00711 else 00712 fprintf(wtf->outfh, " "); 00713 } 00714 nfloat += wtf->veclen; 00715 } 00716 return nfloat; 00717 } 00718 00719 static const output_type_t outtypes[] = { 00720 { "sphinx", &output_header_sphinx, &output_frames_sphinx }, 00721 { "htk", &output_header_htk, &output_frames_htk }, 00722 { "text", NULL, &output_frames_text } 00723 }; 00724 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]); 00725 00726 sphinx_wave2feat_t * 00727 sphinx_wave2feat_init(cmd_ln_t *config) 00728 { 00729 sphinx_wave2feat_t *wtf; 00730 int i; 00731 00732 wtf = ckd_calloc(1, sizeof(*wtf)); 00733 wtf->refcount = 1; 00734 wtf->config = cmd_ln_retain(config); 00735 wtf->fe = fe_init_auto_r(wtf->config); 00736 wtf->ot = outtypes; /* Default (sphinx) type. */ 00737 for (i = 0; i < nouttypes; ++i) { 00738 output_type_t const *otype = &outtypes[i]; 00739 if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) { 00740 wtf->ot = otype; 00741 break; 00742 } 00743 } 00744 if (i == nouttypes) { 00745 E_ERROR("Unknown output type: '%s'\n", 00746 cmd_ln_str_r(config, "-ofmt")); 00747 sphinx_wave2feat_free(wtf); 00748 return NULL; 00749 } 00750 00751 return wtf; 00752 } 00753 00754 int 00755 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf) 00756 { 00757 if (wtf == NULL) 00758 return 0; 00759 if (--wtf->refcount > 0) 00760 return wtf->refcount; 00761 00762 if (wtf->audio) 00763 ckd_free(wtf->audio); 00764 if (wtf->feat) 00765 ckd_free_2d(wtf->feat); 00766 if (wtf->infile) 00767 ckd_free(wtf->infile); 00768 if (wtf->outfile) 00769 ckd_free(wtf->outfile); 00770 if (wtf->infh) { 00771 if (fclose(wtf->infh) == EOF) 00772 E_ERROR_SYSTEM("Failed to close input file"); 00773 } 00774 if (wtf->outfh) { 00775 if (fclose(wtf->outfh) == EOF) 00776 E_ERROR_SYSTEM("Failed to close output file"); 00777 } 00778 cmd_ln_free_r(wtf->config); 00779 fe_free(wtf->fe); 00780 ckd_free(wtf); 00781 00782 return 0; 00783 } 00784 00785 sphinx_wave2feat_t * 00786 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf) 00787 { 00788 ++wtf->refcount; 00789 return wtf; 00790 } 00791 00792 static audio_type_t const * 00793 detect_audio_type(sphinx_wave2feat_t *wtf) 00794 { 00795 audio_type_t const *atype; 00796 int i; 00797 00798 /* Special case audio type for Sphinx MFCC inputs. */ 00799 if (cmd_ln_boolean_r(wtf->config, "-spec2cep") 00800 || cmd_ln_boolean_r(wtf->config, "-cep2spec")) { 00801 int rv = mfcc_type.detect(wtf); 00802 if (rv == -1) 00803 goto error_out; 00804 return &mfcc_type; 00805 } 00806 00807 /* Try to use the type of infile given on the command line. */ 00808 for (i = 0; i < ntypes; ++i) { 00809 int rv; 00810 atype = &types[i]; 00811 if (cmd_ln_boolean_r(wtf->config, atype->name)) { 00812 rv = (*atype->detect)(wtf); 00813 if (rv == -1) 00814 goto error_out; 00815 else if (rv == TRUE) 00816 break; 00817 } 00818 } 00819 if (i == ntypes) { 00820 /* Detect file type of infile and get parameters. */ 00821 for (i = 0; i < ntypes; ++i) { 00822 int rv; 00823 atype = &types[i]; 00824 rv = (*atype->detect)(wtf); 00825 if (rv == -1) 00826 goto error_out; 00827 else if (rv == TRUE) 00828 break; 00829 } 00830 if (i == ntypes) 00831 goto error_out; 00832 } 00833 return atype; 00834 error_out: 00835 if (wtf->infh) 00836 fclose(wtf->infh); 00837 wtf->infh = NULL; 00838 return NULL; 00839 } 00840 00841 int 00842 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf, 00843 char const *infile, char const *outfile) 00844 { 00845 int nchans, minfft, nfft, nfloat, veclen; 00846 audio_type_t const *atype; 00847 int fshift, fsize; 00848 00849 if (cmd_ln_boolean_r(wtf->config, "-verbose")) 00850 E_INFO("Converting %s to %s\n", infile, outfile); 00851 00852 wtf->infile = ckd_salloc(infile); 00853 00854 /* Detect input file type. */ 00855 if ((atype = detect_audio_type(wtf)) == NULL) 00856 return -1; 00857 00858 /* Determine whether to byteswap input. */ 00859 wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"), 00860 cmd_ln_str_r(wtf->config, "-input_endian")); 00861 00862 /* Make sure the FFT size is sufficiently large. */ 00863 minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate") 00864 * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5); 00865 for (nfft = 1; nfft < minfft; nfft <<= 1) 00866 ; 00867 if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) { 00868 E_WARN("Value of -nfft = %d is too small, increasing to %d\n", 00869 cmd_ln_int32_r(wtf->config, "-nfft"), nfft); 00870 cmd_ln_set_int32_r(wtf->config, "-nfft", nfft); 00871 fe_free(wtf->fe); 00872 wtf->fe = fe_init_auto_r(wtf->config); 00873 } 00874 00875 /* Get the output frame size (if not already set). */ 00876 if (wtf->veclen == 0) 00877 wtf->veclen = fe_get_output_size(wtf->fe); 00878 00879 /* Set up the input and output buffers. */ 00880 fe_get_input_size(wtf->fe, &fshift, &fsize); 00881 /* Want to get at least a whole frame plus shift in here. Also we 00882 will either pick or mix multiple channels so we need to read 00883 them all at once. */ 00884 nchans = cmd_ln_int32_r(wtf->config, "-nchans"); 00885 wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans; 00886 if (wtf->blocksize < (fsize + fshift) * nchans) { 00887 E_INFO("Block size of %d too small, increasing to %d\n", 00888 wtf->blocksize, 00889 (fsize + fshift) * nchans); 00890 wtf->blocksize = (fsize + fshift) * nchans; 00891 } 00892 wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio)); 00893 wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift; 00894 00895 /* Use the maximum of the input and output frame sizes to allocate this. */ 00896 veclen = wtf->veclen; 00897 if (wtf->in_veclen > veclen) veclen = wtf->in_veclen; 00898 00899 wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat)); 00900 00901 /* Let's go! */ 00902 if ((wtf->outfh = fopen(outfile, "wb")) == NULL) { 00903 E_ERROR_SYSTEM("Failed to open %s for writing", outfile); 00904 return -1; 00905 } 00906 /* Write an empty header, which we'll fill in later. */ 00907 if (wtf->ot->output_header && 00908 (*wtf->ot->output_header)(wtf, 0) < 0) { 00909 E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile); 00910 goto error_out; 00911 } 00912 wtf->outfile = ckd_salloc(outfile); 00913 00914 if ((nfloat = (*atype->decode)(wtf)) < 0) { 00915 E_ERROR("Failed to convert"); 00916 goto error_out; 00917 } 00918 00919 if (wtf->ot->output_header) { 00920 if (fseek(wtf->outfh, 0, SEEK_SET) < 0) { 00921 E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile); 00922 goto error_out; 00923 } 00924 if ((*wtf->ot->output_header)(wtf, nfloat) < 0) { 00925 E_ERROR_SYSTEM("Failed to write header to %s\n", outfile); 00926 goto error_out; 00927 } 00928 } 00929 00930 00931 if (wtf->audio) 00932 ckd_free(wtf->audio); 00933 if (wtf->feat) 00934 ckd_free_2d(wtf->feat); 00935 if (wtf->infile) 00936 ckd_free(wtf->infile); 00937 if (wtf->outfile) 00938 ckd_free(wtf->outfile); 00939 00940 wtf->audio = NULL; 00941 wtf->infile = NULL; 00942 wtf->feat = NULL; 00943 wtf->outfile = NULL; 00944 00945 if (wtf->outfh) 00946 if (fclose(wtf->outfh) == EOF) 00947 E_ERROR_SYSTEM("Failed to close output file"); 00948 wtf->outfh = NULL; 00949 00950 return 0; 00951 00952 error_out: 00953 00954 if (wtf->audio) 00955 ckd_free(wtf->audio); 00956 if (wtf->feat) 00957 ckd_free_2d(wtf->feat); 00958 if (wtf->infile) 00959 ckd_free(wtf->infile); 00960 if (wtf->outfile) 00961 ckd_free(wtf->outfile); 00962 00963 wtf->audio = NULL; 00964 wtf->infile = NULL; 00965 wtf->feat = NULL; 00966 wtf->outfile = NULL; 00967 00968 if (wtf->outfh) 00969 if (fclose(wtf->outfh) == EOF) 00970 E_ERROR_SYSTEM("Failed to close output file"); 00971 wtf->outfh = NULL; 00972 00973 return -1; 00974 } 00975 00976 void 00977 build_filenames(cmd_ln_t *config, char const *basename, 00978 char **out_infile, char **out_outfile) 00979 { 00980 char const *di, *do_, *ei, *eo; 00981 00982 di = cmd_ln_str_r(config, "-di"); 00983 do_ = cmd_ln_str_r(config, "-do"); 00984 ei = cmd_ln_str_r(config, "-ei"); 00985 eo = cmd_ln_str_r(config, "-eo"); 00986 00987 *out_infile = string_join(di ? di : "", 00988 di ? "/" : "", 00989 basename, 00990 ei ? "." : "", 00991 ei ? ei : "", 00992 NULL); 00993 *out_outfile = string_join(do_ ? do_ : "", 00994 do_ ? "/" : "", 00995 basename, 00996 eo ? "." : "", 00997 eo ? eo : "", 00998 NULL); 00999 /* Build output directory structure if possible/requested (it is 01000 * by default). */ 01001 if (cmd_ln_boolean_r(config, "-build_outdirs")) { 01002 char *dirname = ckd_salloc(*out_outfile); 01003 path2dirname(*out_outfile, dirname); 01004 build_directory(dirname); 01005 ckd_free(dirname); 01006 } 01007 } 01008 01009 static int 01010 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile) 01011 { 01012 hash_table_t *files; 01013 hash_iter_t *itor; 01014 lineiter_t *li; 01015 FILE *ctlfh; 01016 int nskip, runlen, npart, rv = 0; 01017 01018 if ((ctlfh = fopen(ctlfile, "r")) == NULL) { 01019 E_ERROR_SYSTEM("Failed to open control file %s", ctlfile); 01020 return -1; 01021 } 01022 nskip = cmd_ln_int32_r(wtf->config, "-nskip"); 01023 runlen = cmd_ln_int32_r(wtf->config, "-runlen"); 01024 if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) { 01025 /* Count lines in the file. */ 01026 int partlen, part, nlines = 0; 01027 part = cmd_ln_int32_r(wtf->config, "-part"); 01028 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) 01029 ++nlines; 01030 fseek(ctlfh, 0, SEEK_SET); 01031 partlen = nlines / npart; 01032 nskip = partlen * (part - 1); 01033 if (part == npart) 01034 runlen = -1; 01035 else 01036 runlen = partlen; 01037 } 01038 if (runlen != -1){ 01039 E_INFO("Processing %d utterances at position %d\n", runlen, nskip); 01040 files = hash_table_new(runlen, HASH_CASE_YES); 01041 } 01042 else { 01043 E_INFO("Processing all remaining utterances at position %d\n", nskip); 01044 files = hash_table_new(1000, HASH_CASE_YES); 01045 } 01046 for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) { 01047 char *c, *infile, *outfile; 01048 01049 if (nskip-- > 0) 01050 continue; 01051 if (runlen == 0) { 01052 lineiter_free(li); 01053 break; 01054 } 01055 --runlen; 01056 01057 string_trim(li->buf, STRING_BOTH); 01058 /* Extract the file ID from the control line. */ 01059 if ((c = strchr(li->buf, ' ')) != NULL) 01060 *c = '\0'; 01061 if (strlen(li->buf) == 0) { 01062 E_WARN("Empty line %d in control file, skipping\n", li->lineno); 01063 continue; 01064 } 01065 build_filenames(wtf->config, li->buf, &infile, &outfile); 01066 if (hash_table_lookup(files, infile, NULL) == 0) 01067 continue; 01068 rv = sphinx_wave2feat_convert_file(wtf, infile, outfile); 01069 hash_table_enter(files, infile, outfile); 01070 if (rv != 0) { 01071 lineiter_free(li); 01072 break; 01073 } 01074 } 01075 for (itor = hash_table_iter(files); itor; 01076 itor = hash_table_iter_next(itor)) { 01077 ckd_free((void *)hash_entry_key(itor->ent)); 01078 ckd_free(hash_entry_val(itor->ent)); 01079 } 01080 hash_table_free(files); 01081 01082 if (fclose(ctlfh) == EOF) 01083 E_ERROR_SYSTEM("Failed to close control file"); 01084 return rv; 01085 } 01086 01087 int 01088 main(int argc, char *argv[]) 01089 { 01090 sphinx_wave2feat_t *wtf; 01091 cmd_ln_t *config; 01092 int rv; 01093 01094 /* Initialize config. */ 01095 if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) 01096 return 2; 01097 01098 /* Parse an argument file if there's one in there. */ 01099 if (cmd_ln_str_r(config, "-argfile")) 01100 config = cmd_ln_parse_file_r(config, defn, 01101 cmd_ln_str_r(config, "-argfile"), FALSE); 01102 if (config == NULL) { 01103 E_ERROR("Command line parsing failed\n"); 01104 return 1; 01105 } 01106 if ((wtf = sphinx_wave2feat_init(config)) == NULL) { 01107 E_ERROR("Failed to initialize wave2feat object\n"); 01108 return 1; 01109 } 01110 01111 /* If there's a control file run through it, otherwise we will do 01112 * a single file (which is what run_control_file will do 01113 * internally too) */ 01114 if (cmd_ln_str_r(config, "-c")) 01115 rv = run_control_file(wtf, cmd_ln_str_r(config, "-c")); 01116 else 01117 rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"), 01118 cmd_ln_str_r(config, "-o")); 01119 01120 sphinx_wave2feat_free(wtf); 01121 cmd_ln_free_r(config); 01122 return rv; 01123 }