SphinxBase 0.6

src/sphinx_fe/sphinx_fe.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1996-2004 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 #include <stdio.h>
00038 #include <stdlib.h>
00039 #include <string.h>
00040 #include <time.h>
00041 #include <assert.h>
00042 
00043 #ifdef HAVE_CONFIG_H
00044 #include <config.h>
00045 #endif
00046 
00047 #ifdef HAVE_SNDFILE_H
00048 #include <sndfile.h>
00049 #endif
00050 
00051 #include <sphinxbase/fe.h>
00052 #include <sphinxbase/strfuncs.h>
00053 #include <sphinxbase/pio.h>
00054 #include <sphinxbase/filename.h>
00055 #include <sphinxbase/cmd_ln.h>
00056 #include <sphinxbase/err.h>
00057 #include <sphinxbase/ckd_alloc.h>
00058 #include <sphinxbase/byteorder.h>
00059 #include <sphinxbase/hash_table.h>
00060 
00061 #include "sphinx_wave2feat.h"
00062 #include "cmd_ln_defn.h"
00063 
00064 typedef struct audio_type_s {
00065     char const *name;
00066     int (*detect)(sphinx_wave2feat_t *wtf);
00067     int (*decode)(sphinx_wave2feat_t *wtf);
00068 } audio_type_t;
00069 
00070 typedef struct output_type_s {
00071     char const *name;
00072     int (*output_header)(sphinx_wave2feat_t *wtf, int nfloat);
00073     int (*output_frames)(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr);
00074 } output_type_t;
00075 
00076 struct sphinx_wave2feat_s {
00077     int refcount;     
00078     cmd_ln_t *config; 
00079     fe_t *fe;         
00080     char *infile;     
00081     char *outfile;    
00082     FILE *infh;       
00083     FILE *outfh;      
00084     short *audio;     
00085     mfcc_t **feat;    
00086     int blocksize;    
00087     int featsize;     
00088     int veclen;       
00089     int in_veclen;    
00090     int byteswap;     
00091 #ifdef HAVE_SNDFILE_H
00092     SNDFILE *insfh;   
00093 #endif
00094     output_type_t const *ot;
00095 };
00096 
00098 typedef struct RIFFHeader{
00099     char rifftag[4];      /* "RIFF" string */
00100     int32 TotalLength;      /* Total length */
00101     char wavefmttag[8];   /* "WAVEfmt " string (note space after 't') */
00102     int32 RemainingLength;  /* Remaining length */
00103     int16 data_format;    /* data format tag, 1 = PCM */
00104     int16 numchannels;    /* Number of channels in file */
00105     int32 SamplingFreq;     /* Sampling frequency */
00106     int32 BytesPerSec;      /* Average bytes/sec */
00107     int16 BlockAlign;     /* Block align */
00108     int16 BitsPerSample;  /* 8 or 16 bit */
00109     char datatag[4];      /* "data" string */
00110     int32 datalength;       /* Raw data length */
00111 } MSWAV_hdr;
00112 
00118 static int
00119 detect_riff(sphinx_wave2feat_t *wtf)
00120 {
00121     FILE *fh;
00122     MSWAV_hdr hdr;
00123 
00124     if ((fh = fopen(wtf->infile, "rb")) == NULL) {
00125         E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
00126         return -1;
00127     }
00128     if (fread(&hdr, sizeof(hdr), 1, fh) != 1) {
00129         E_ERROR_SYSTEM("Failed to read RIFF header");
00130         fclose(fh);
00131         return -1;
00132     }
00133     /* Make sure it is actually a RIFF file. */
00134     if (0 != memcmp(hdr.rifftag, "RIFF", 4)) {
00135         fclose(fh);
00136         return FALSE;
00137     }
00138 
00139     /* Get relevant information. */
00140     cmd_ln_set_int32_r(wtf->config, "-nchans", hdr.numchannels);
00141     cmd_ln_set_float32_r(wtf->config, "-samprate", hdr.SamplingFreq);
00142     wtf->infh = fh;
00143 
00144     return TRUE;
00145 }
00146 
00147 static int
00148 open_nist_file(sphinx_wave2feat_t *wtf, char const *infile, FILE **out_fh, int detect_endian)
00149 {
00150     char nist[7];
00151     lineiter_t *li;
00152     FILE *fh;
00153 
00154     if ((fh = fopen(infile, "rb")) == NULL) {
00155         E_ERROR_SYSTEM("Failed to open %s", infile);
00156         return -1;
00157     }
00158     if (fread(&nist, 1, 7, fh) != 7) {
00159         E_ERROR_SYSTEM("Failed to read NIST header");
00160         fclose(fh);
00161         return -1;
00162     }
00163     /* Is this actually a NIST file? */
00164     if (0 != strncmp(nist, "NIST_1A", 7)) {
00165         fclose(fh);
00166         return FALSE;
00167     }
00168     /* Rewind, parse lines. */
00169     fseek(fh, 0, SEEK_SET);
00170     for (li = lineiter_start(fh); li; li = lineiter_next(li)) {
00171         char **words;
00172         int nword;
00173 
00174         string_trim(li->buf, STRING_BOTH);
00175         if (strlen(li->buf) == 0) {
00176             lineiter_free(li);
00177             break;
00178         }
00179         nword = str2words(li->buf, NULL, 0);
00180         if (nword != 3)
00181             continue;
00182         words = ckd_calloc(nword, sizeof(*words));
00183         str2words(li->buf, words, nword);
00184         if (0 == strcmp(words[0], "sample_rate")) {
00185             cmd_ln_set_float32_r(wtf->config, "-samprate", atof_c(words[2]));
00186         }
00187         if (0 == strcmp(words[0], "channel_count")) {
00188             cmd_ln_set_int32_r(wtf->config, "-nchans", atoi(words[2]));
00189         }
00190         if (detect_endian && 0 == strcmp(words[0], "sample_byte_format")) {
00191             cmd_ln_set_str_r(wtf->config, "-input_endian",
00192                              (0 == strcmp(words[2], "10")) ? "big" : "little");
00193         }
00194         ckd_free(words);
00195     }
00196 
00197     fseek(fh, 1024, SEEK_SET);
00198     if (out_fh)
00199         *out_fh = fh;
00200     else
00201         fclose(fh);
00202     return TRUE;
00203 }
00204 
00205 #ifdef HAVE_POPEN
00206 static int
00207 detect_sph2pipe(sphinx_wave2feat_t *wtf)
00208 {
00209     FILE *fh;
00210     char *cmdline;
00211     int rv;
00212 
00213     /* Determine if it's NIST file and get parameters. */
00214     if ((rv = open_nist_file(wtf, wtf->infile, NULL, FALSE)) != TRUE)
00215         return rv;
00216 
00217     /* Now popen it with sph2pipe. */
00218     cmdline = string_join("sph2pipe -f raw '", wtf->infile, "'", NULL);
00219     if ((fh = popen(cmdline, "r")) == NULL) {
00220         E_ERROR_SYSTEM("Failed to popen(\"sph2pipe -f raw '%s'\")", wtf->infile);
00221         ckd_free(cmdline);
00222         return -1;
00223     }
00224 
00225     wtf->infh = fh;
00226     return TRUE;
00227 }
00228 #else /* !HAVE_POPEN */
00229 static int
00230 detect_sph2pipe(sphinx_wave2feat_t *wtf)
00231 {
00232     E_ERROR("popen() not available, cannot run sph2pipe\n");
00233     return -1;
00234 }
00235 #endif /* !HAVE_POPEN */
00236 
00242 static int
00243 detect_nist(sphinx_wave2feat_t *wtf)
00244 {
00245     FILE *fh;
00246     int rv;
00247 
00248     if ((rv = open_nist_file(wtf, wtf->infile, &fh, TRUE)) != TRUE)
00249         return rv;
00250     wtf->infh = fh;
00251 
00252     return TRUE;
00253 }
00254 
00255 
00262 static int
00263 detect_raw(sphinx_wave2feat_t *wtf)
00264 {
00265     FILE *fh;
00266 
00267     if ((fh = fopen(wtf->infile, "rb")) == NULL) {
00268         E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
00269         return -1;
00270     }
00271     wtf->infh = fh;
00272     return TRUE;
00273 }
00274 
00281 static int
00282 detect_sphinx_mfc(sphinx_wave2feat_t *wtf)
00283 {
00284     FILE *fh;
00285     int32 len;
00286     long flen;
00287 
00288     if ((fh = fopen(wtf->infile, "rb")) == NULL) {
00289         E_ERROR_SYSTEM("Failed to open %s", wtf->infile);
00290         return -1;
00291     }
00292     if (fread(&len, 4, 1, fh) != 1) {
00293         E_ERROR_SYSTEM("Failed to read header from %s\n", wtf->infile);
00294         fclose(fh);
00295         return -1;
00296     }
00297     fseek(fh, 0, SEEK_END);
00298     flen = ftell(fh);
00299 
00300     /* figure out whether to byteswap */
00301     flen = (flen / 4) - 1;
00302     if (flen != len) {
00303         /* First make sure this is an endianness problem, otherwise fail. */
00304         SWAP_INT32(&len);
00305         if (flen != len) {
00306             SWAP_INT32(&len);
00307             E_ERROR("Mismatch in header/file lengths: 0x%08x vs 0x%08x\n",
00308                     len, flen);
00309             return -1;
00310         }
00311         /* Set the input endianness to the opposite of the machine endianness... */
00312         cmd_ln_set_str_r(wtf->config, "-input_endian",
00313                          (0 == strcmp("big", cmd_ln_str_r(wtf->config, "-mach_endian"))
00314                           ? "little" : "big"));
00315     }
00316     
00317     fseek(fh, 4, SEEK_SET);
00318     wtf->infh = fh;
00319     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00320         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00321     }
00322     else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00323         wtf->in_veclen = cmd_ln_int32_r(wtf->config, "-ncep");
00324         wtf->veclen = cmd_ln_int32_r(wtf->config, "-nfilt");
00325     }
00326     else {
00327         /* Should not happen. */
00328         E_ERROR("Sphinx MFCC file reading requested but -spec2cep/-cep2spec not given\n");
00329         assert(FALSE);
00330     }
00331             
00332     return TRUE;
00333 }
00334 
/**
 * Reduce interleaved multi-channel audio to a single channel, in place.
 *
 * @param buf       Interleaved samples; the result overwrites the start
 *                  of this buffer.
 * @param nsamp     Total number of samples in buf (all channels).
 * @param nchans    Number of interleaved channels.
 * @param whichchan 1-based channel to keep, or a value <= 0 to average
 *                  all channels (the sum is always divided by nchans,
 *                  even for a trailing partial group).
 * @return Number of single-channel samples now at the start of buf.
 */
int
mixnpick_channels(int16 *buf, int32 nsamp, int32 nchans, int32 whichchan)
{
    int pos;

    if (whichchan > 0) {
        /* Keep only the selected channel of every sample group. */
        pos = whichchan - 1;
        while (pos < nsamp) {
            buf[pos / nchans] = buf[pos];
            pos += nchans;
        }
        return pos / nchans;
    }

    /* Average the channels of every sample group. */
    pos = 0;
    while (pos < nsamp) {
        float64 sum = 0.0;
        int ch;

        for (ch = 0; ch < nchans; ++ch) {
            if (pos + ch >= nsamp)
                break;
            sum += buf[pos + ch];
        }
        buf[pos / nchans] = (int16)(sum / nchans);
        pos += nchans;
    }
    return pos / nchans;
}
00355 
00356 #ifdef HAVE_SNDFILE_H
00357 
00362 static int
00363 detect_sndfile(sphinx_wave2feat_t *wtf)
00364 {
00365     SNDFILE *sf;
00366     SF_INFO sfinfo;
00367 
00368     memset(&sfinfo, 0, sizeof(sfinfo));
00369     /* We let other detectors catch I/O errors, since there is
00370        no way to tell them from format errors when opening :( */
00371     if ((sf = sf_open(wtf->infile, SFM_READ, &sfinfo)) == NULL) {
00372         return FALSE;
00373     }
00374     /* Get relevant information. */
00375     cmd_ln_set_int32_r(wtf->config, "-nchans", sfinfo.channels);
00376     cmd_ln_set_float32_r(wtf->config, "-samprate", sfinfo.samplerate);
00377     wtf->insfh = sf;
00378     wtf->infh = NULL;
00379 
00380     return TRUE;
00381 }
00382 
00387 static int
00388 decode_sndfile(sphinx_wave2feat_t *wtf)
00389 {
00390     size_t nsamp;
00391     int32 nfr, nchans, whichchan;
00392     int nfloat, n;
00393 
00394     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00395     whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
00396     fe_start_utt(wtf->fe);
00397     nfloat = 0;
00398     while ((nsamp = sf_read_short(wtf->insfh,
00399                                   wtf->audio,
00400                                   wtf->blocksize)) != 0) {
00401         int16 const *inspeech;
00402         size_t nvec;
00403 
00404         /* Mix or pick channels. */
00405         if (nchans > 1)
00406             nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
00407 
00408         inspeech = wtf->audio;
00409         nvec = wtf->featsize;
00410         /* Consume all samples. */
00411         while (nsamp) {
00412             nfr = nvec;
00413             fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
00414             if (nfr) {
00415                 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00416                     return -1;
00417                 nfloat += n;
00418             }
00419         }
00420         inspeech = wtf->audio;
00421     }
00422     /* Now process any leftover audio frames. */
00423     fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
00424     if (nfr) {
00425         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00426             return -1;
00427         nfloat += n;
00428     }
00429 
00430     sf_close(wtf->insfh);
00431     wtf->insfh = NULL;
00432     return nfloat;
00433 }
00434 #endif /* HAVE_SNDFILE_H */
00435 
00440 static int
00441 decode_pcm(sphinx_wave2feat_t *wtf)
00442 {
00443     size_t nsamp;
00444     int32 nfr, nchans, whichchan;
00445     int nfloat, n;
00446 
00447     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00448     whichchan = cmd_ln_int32_r(wtf->config, "-whichchan");
00449     fe_start_utt(wtf->fe);
00450     nfloat = 0;
00451     while ((nsamp = fread(wtf->audio, 2, wtf->blocksize, wtf->infh)) != 0) {
00452         size_t nvec;
00453         int16 const *inspeech;
00454 
00455         /* Byteswap stuff here if necessary. */
00456         if (wtf->byteswap) {
00457             for (n = 0; n < nsamp; ++n)
00458                 SWAP_INT16(wtf->audio + n);
00459         }
00460 
00461         /* Mix or pick channels. */
00462         if (nchans > 1)
00463             nsamp = mixnpick_channels(wtf->audio, nsamp, nchans, whichchan);
00464             
00465         inspeech = wtf->audio;
00466         nvec = wtf->featsize;
00467         /* Consume all samples. */
00468         while (nsamp) {
00469             nfr = nvec;
00470             fe_process_frames(wtf->fe, &inspeech, &nsamp, wtf->feat, &nfr);
00471             if (nfr) {
00472                 if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00473                     return -1;
00474                 nfloat += n;
00475             }
00476         }
00477         inspeech = wtf->audio;
00478     }
00479     /* Now process any leftover audio frames. */
00480     fe_end_utt(wtf->fe, wtf->feat[0], &nfr);
00481     if (nfr) {
00482         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00483             return -1;
00484         nfloat += n;
00485     }
00486 
00487     if (fclose(wtf->infh) == EOF)
00488         E_ERROR_SYSTEM("Failed to close input file");
00489     wtf->infh = NULL;
00490     return nfloat;
00491 }
00492 
00497 static int
00498 decode_sphinx_mfc(sphinx_wave2feat_t *wtf)
00499 {
00500     int nfloat = 0, n;
00501     int featsize = wtf->featsize;
00502 
00503     /* If the input vector length is less than the output length, we
00504      * need to do this one frame at a time, because there's empty
00505      * space at the end of each vector in wtf->feat. */
00506     if (wtf->in_veclen < wtf->veclen)
00507         featsize = 1;
00508     while ((n = fread(wtf->feat[0], sizeof(**wtf->feat),
00509                       featsize * wtf->in_veclen, wtf->infh)) != 0) {
00510         int i, nfr = n / wtf->in_veclen;
00511         if (n % wtf->in_veclen) {
00512             E_ERROR("Size of file %d not a multiple of veclen %d\n",
00513                     n, wtf->in_veclen);
00514             return -1;
00515         }
00516         /* Byteswap stuff here if necessary. */
00517         if (wtf->byteswap) {
00518             for (i = 0; i < n; ++i)
00519                 SWAP_FLOAT32(wtf->feat[0] + i);
00520         }
00521         fe_float_to_mfcc(wtf->fe, (float32 **)wtf->feat, wtf->feat, nfr);
00522         for (i = 0; i < nfr; ++i) {
00523             if (cmd_ln_boolean_r(wtf->config, "-spec2cep")) {
00524                 if (0 == strcmp(cmd_ln_str_r(wtf->config, "-transform"), "legacy"))
00525                     fe_logspec_to_mfcc(wtf->fe, wtf->feat[i], wtf->feat[i]);
00526                 else
00527                     fe_logspec_dct2(wtf->fe, wtf->feat[i], wtf->feat[i]);
00528             }
00529             else if (cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00530                 fe_mfcc_dct3(wtf->fe, wtf->feat[i], wtf->feat[i]);
00531             }
00532         }
00533         if ((n = (*wtf->ot->output_frames)(wtf, wtf->feat, nfr)) < 0)
00534             return -1;
00535         nfloat += n;
00536     }
00537 
00538     if (fclose(wtf->infh) == EOF)
00539         E_ERROR_SYSTEM("Failed to close input file");
00540     wtf->infh = NULL;
00541     return nfloat;
00542 }
00543 
00544 static const audio_type_t types[] = {
00545 #ifdef HAVE_SNDFILE_H
00546     { "-sndfile", &detect_sndfile, &decode_sndfile },
00547 #endif
00548     { "-mswav", &detect_riff, &decode_pcm },
00549     { "-nist", &detect_nist, &decode_pcm },
00550     { "-raw", &detect_raw, &decode_pcm },
00551     { "-sph2pipe", &detect_sph2pipe, &decode_pcm }
00552 };
00553 static const int ntypes = sizeof(types)/sizeof(types[0]);
00554 static const audio_type_t mfcc_type = {
00555     "sphinx_mfc", &detect_sphinx_mfc, &decode_sphinx_mfc
00556 };
00557 
00563 static int
00564 output_header_sphinx(sphinx_wave2feat_t *wtf, int32 nfloat)
00565 {
00566     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1) {
00567         E_ERROR_SYSTEM("Failed to write to %s", wtf->outfile);
00568         return -1;
00569     }
00570     return 0;
00571 }
00572 
00578 static int
00579 output_frames_sphinx(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00580 {
00581     int i, nfloat = 0;
00582 
00583     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00584     for (i = 0; i < nfr; ++i) {
00585         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00586             E_ERROR_SYSTEM("Writing %d values to %s failed",
00587                            wtf->veclen, wtf->outfile);
00588             return -1;
00589         }
00590         nfloat += wtf->veclen;
00591     }
00592     return nfloat;
00593 }
00594 
/**
 * HTK parameter kind codes (base kind, without qualifier flags).
 */
typedef enum htk_feature_kind_e {
    /* PCM audio (rarely used) */
    WAVEFORM = 0,
    /* LPC filter coefficients */
    LPC = 1,
    /* LPC reflection coefficients */
    LPCREFC = 2,
    /* LPC-based cepstral coefficients */
    LPCEPSTRA = 3,
    /* LPCC plus deltas */
    LPCDELCEP = 4,
    /* 16-bit integer LPC reflection coefficients */
    IREFC = 5,
    /* MFCCs */
    MFCC = 6,
    /* Log mel spectrum */
    FBANK = 7,
    /* Linear mel spectrum */
    MELSPEC = 8,
    /* User defined */
    USER = 9,
    /* Vector quantized data */
    DISCRETE = 10,
    /* PLP coefficients */
    PLP = 11
} htk_feature_kind_t;
00609 
/**
 * HTK parameter kind qualifier flags, OR-ed with a base kind code.
 * The octal value of each flag is given alongside its meaning.
 */
typedef enum htk_feature_flag_e {
    _E = 1 << 6,  /* 0000100: has energy */
    _N = 1 << 7,  /* 0000200: absolute energy suppressed */
    _D = 1 << 8,  /* 0000400: has delta coefficients */
    _A = 1 << 9,  /* 0001000: has acceleration (delta-delta) coefficients */
    _C = 1 << 10, /* 0002000: is compressed */
    _Z = 1 << 11, /* 0004000: has zero mean static coefficients (i.e. CMN) */
    _K = 1 << 12, /* 0010000: has CRC checksum */
    _O = 1 << 13, /* 0020000: has 0th cepstral coefficient */
    _V = 1 << 14, /* 0040000: has VQ data */
    _T = 1 << 15  /* 0100000: has third differential coefficients */
} htk_feature_flag_t;
00622 
00626 static int
00627 output_header_htk(sphinx_wave2feat_t *wtf, int32 nfloat)
00628 {
00629     int32 samp_period;
00630     int16 samp_size;
00631     int16 param_kind;
00632     int swap = FALSE;
00633 
00634     /* HTK files are big-endian. */
00635     if (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")))
00636         swap = TRUE;
00637     /* Same file size thing as in Sphinx files (I think) */
00638     if (swap) SWAP_INT32(&nfloat);
00639     if (fwrite(&nfloat, 4, 1, wtf->outfh) != 1)
00640         return -1;
00641     /* Sample period in 100ns units. */
00642     samp_period = (int32)(1e+7 / cmd_ln_float32_r(wtf->config, "-frate"));
00643     if (swap) SWAP_INT32(&samp_period);
00644     if (fwrite(&samp_period, 4, 1, wtf->outfh) != 1)
00645         return -1;
00646     /* Sample size - veclen * sizeof each sample. */
00647     samp_size = wtf->veclen * 4;
00648     if (swap) SWAP_INT16(&samp_size);
00649     if (fwrite(&samp_size, 2, 1, wtf->outfh) != 1)
00650         return -1;
00651     /* Format and flags. */
00652     if (cmd_ln_boolean_r(wtf->config, "-logspec")
00653         || cmd_ln_boolean_r(wtf->config, "-cep2spec"))
00654         param_kind = FBANK; /* log mel-filter bank outputs */
00655     else
00656         param_kind = MFCC | _O; /* MFCC + CEP0 (note reordering...) */
00657     if (swap) SWAP_INT16(&param_kind);
00658     if (fwrite(&param_kind, 2, 1, wtf->outfh) != 1)
00659         return -1;
00660 
00661     return 0;
00662 }
00663 
00667 static int
00668 output_frames_htk(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00669 {
00670     int i, j, swap, htk_reorder, nfloat = 0;
00671 
00672     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00673     /* This is possibly inefficient, but probably not a big deal. */
00674     swap = (0 == strcmp("little", cmd_ln_str_r(wtf->config, "-mach_endian")));
00675     htk_reorder = (0 == strcmp("htk", wtf->ot->name)
00676                    && !(cmd_ln_boolean_r(wtf->config, "-logspec")
00677                         || cmd_ln_boolean_r(wtf->config, "-cep2spec")));
00678     for (i = 0; i < nfr; ++i) {
00679         if (htk_reorder) {
00680             mfcc_t c0 = frames[i][0];
00681             memmove(frames[i] + 1, frames[i], (wtf->veclen - 1) * 4);
00682             frames[i][wtf->veclen - 1] = c0;
00683         }
00684         if (swap)
00685             for (j = 0; j < wtf->veclen; ++j)
00686                 SWAP_FLOAT32(frames[i] + j);
00687         if (fwrite(frames[i], sizeof(float32), wtf->veclen, wtf->outfh) != wtf->veclen) {
00688             E_ERROR_SYSTEM("Writing %d values to %s failed",
00689                            wtf->veclen, wtf->outfile);
00690             return -1;
00691         }
00692         nfloat += wtf->veclen;
00693     }
00694     return nfloat;
00695 }
00696 
00700 static int
00701 output_frames_text(sphinx_wave2feat_t *wtf, mfcc_t **frames, int nfr)
00702 {
00703     int i, j, nfloat = 0;
00704 
00705     fe_mfcc_to_float(wtf->fe, frames, (float32 **)frames, nfr);
00706     for (i = 0; i < nfr; ++i) {
00707         for (j = 0; j < wtf->veclen; ++j) {
00708             fprintf(wtf->outfh, "%.5g", frames[i][j]);
00709             if (j == wtf->veclen - 1)
00710                 fprintf(wtf->outfh, "\n");
00711             else
00712                 fprintf(wtf->outfh, " ");
00713         }
00714         nfloat += wtf->veclen;
00715     }
00716     return nfloat;
00717 }
00718 
00719 static const output_type_t outtypes[] = {
00720     { "sphinx", &output_header_sphinx, &output_frames_sphinx },
00721     { "htk", &output_header_htk, &output_frames_htk },
00722     { "text", NULL, &output_frames_text }
00723 };
00724 static const int nouttypes = sizeof(outtypes)/sizeof(outtypes[0]);
00725 
00726 sphinx_wave2feat_t *
00727 sphinx_wave2feat_init(cmd_ln_t *config)
00728 {
00729     sphinx_wave2feat_t *wtf;
00730     int i;
00731 
00732     wtf = ckd_calloc(1, sizeof(*wtf));
00733     wtf->refcount = 1;
00734     wtf->config = cmd_ln_retain(config);
00735     wtf->fe = fe_init_auto_r(wtf->config);
00736     wtf->ot = outtypes; /* Default (sphinx) type. */
00737     for (i = 0; i < nouttypes; ++i) {
00738         output_type_t const *otype = &outtypes[i];
00739         if (0 == strcmp(cmd_ln_str_r(config, "-ofmt"), otype->name)) {
00740             wtf->ot = otype;
00741             break;
00742         }
00743     }
00744     if (i == nouttypes) {
00745         E_ERROR("Unknown output type: '%s'\n",
00746                 cmd_ln_str_r(config, "-ofmt"));
00747         sphinx_wave2feat_free(wtf);
00748         return NULL;
00749     }
00750 
00751     return wtf;
00752 }
00753 
00754 int
00755 sphinx_wave2feat_free(sphinx_wave2feat_t *wtf)
00756 {
00757     if (wtf == NULL)
00758         return 0;
00759     if (--wtf->refcount > 0)
00760         return wtf->refcount;
00761 
00762     if (wtf->audio)
00763         ckd_free(wtf->audio);
00764     if (wtf->feat)
00765         ckd_free_2d(wtf->feat);
00766     if (wtf->infile)
00767         ckd_free(wtf->infile);
00768     if (wtf->outfile)
00769         ckd_free(wtf->outfile);
00770     if (wtf->infh) {
00771         if (fclose(wtf->infh) == EOF)
00772             E_ERROR_SYSTEM("Failed to close input file");
00773     }
00774     if (wtf->outfh) {
00775         if (fclose(wtf->outfh) == EOF)
00776             E_ERROR_SYSTEM("Failed to close output file");
00777     }
00778     cmd_ln_free_r(wtf->config);
00779     fe_free(wtf->fe);
00780     ckd_free(wtf);
00781 
00782     return 0;
00783 }
00784 
00785 sphinx_wave2feat_t *
00786 sphinx_wave2feat_retain(sphinx_wave2feat_t *wtf)
00787 {
00788     ++wtf->refcount;
00789     return wtf;
00790 }
00791 
00792 static audio_type_t const *
00793 detect_audio_type(sphinx_wave2feat_t *wtf)
00794 {
00795     audio_type_t const *atype;
00796     int i;
00797 
00798     /* Special case audio type for Sphinx MFCC inputs. */
00799     if (cmd_ln_boolean_r(wtf->config, "-spec2cep")
00800         || cmd_ln_boolean_r(wtf->config, "-cep2spec")) {
00801         int rv = mfcc_type.detect(wtf);
00802         if (rv == -1)
00803             goto error_out;
00804         return &mfcc_type;
00805     }
00806 
00807     /* Try to use the type of infile given on the command line. */
00808     for (i = 0; i < ntypes; ++i) {
00809         int rv;
00810         atype = &types[i];
00811         if (cmd_ln_boolean_r(wtf->config, atype->name)) {
00812             rv = (*atype->detect)(wtf);
00813             if (rv == -1)
00814                 goto error_out;
00815             else if (rv == TRUE)
00816                 break;
00817         }
00818     }
00819     if (i == ntypes) {
00820         /* Detect file type of infile and get parameters. */
00821         for (i = 0; i < ntypes; ++i) {
00822             int rv;
00823             atype = &types[i];
00824             rv = (*atype->detect)(wtf);
00825             if (rv == -1)
00826                 goto error_out;
00827             else if (rv == TRUE)
00828                 break;
00829         }
00830         if (i == ntypes)
00831             goto error_out;
00832     }
00833     return atype;
00834  error_out:
00835     if (wtf->infh)
00836         fclose(wtf->infh);
00837     wtf->infh = NULL;
00838     return NULL;
00839 }
00840 
00841 int
00842 sphinx_wave2feat_convert_file(sphinx_wave2feat_t *wtf,
00843                               char const *infile, char const *outfile)
00844 {
00845     int nchans, minfft, nfft, nfloat, veclen;
00846     audio_type_t const *atype;
00847     int fshift, fsize;
00848 
00849     if (cmd_ln_boolean_r(wtf->config, "-verbose"))
00850         E_INFO("Converting %s to %s\n", infile, outfile);
00851 
00852     wtf->infile = ckd_salloc(infile);
00853 
00854     /* Detect input file type. */
00855     if ((atype = detect_audio_type(wtf)) == NULL)
00856         return -1;
00857 
00858     /* Determine whether to byteswap input. */
00859     wtf->byteswap = strcmp(cmd_ln_str_r(wtf->config, "-mach_endian"),
00860                            cmd_ln_str_r(wtf->config, "-input_endian"));
00861 
00862     /* Make sure the FFT size is sufficiently large. */
00863     minfft = (int)(cmd_ln_float32_r(wtf->config, "-samprate")
00864                    * cmd_ln_float32_r(wtf->config, "-wlen") + 0.5);
00865     for (nfft = 1; nfft < minfft; nfft <<= 1)
00866         ;
00867     if (nfft > cmd_ln_int32_r(wtf->config, "-nfft")) {
00868         E_WARN("Value of -nfft = %d is too small, increasing to %d\n",
00869                cmd_ln_int32_r(wtf->config, "-nfft"), nfft);
00870         cmd_ln_set_int32_r(wtf->config, "-nfft", nfft);
00871         fe_free(wtf->fe);
00872         wtf->fe = fe_init_auto_r(wtf->config);
00873     }
00874 
00875     /* Get the output frame size (if not already set). */
00876     if (wtf->veclen == 0)
00877         wtf->veclen = fe_get_output_size(wtf->fe);
00878 
00879     /* Set up the input and output buffers. */
00880     fe_get_input_size(wtf->fe, &fshift, &fsize);
00881     /* Want to get at least a whole frame plus shift in here.  Also we
00882        will either pick or mix multiple channels so we need to read
00883        them all at once. */
00884     nchans = cmd_ln_int32_r(wtf->config, "-nchans");
00885     wtf->blocksize = cmd_ln_int32_r(wtf->config, "-blocksize") * nchans;
00886     if (wtf->blocksize < (fsize + fshift) * nchans) {
00887         E_INFO("Block size of %d too small, increasing to %d\n",
00888                wtf->blocksize,
00889                (fsize + fshift) * nchans);
00890         wtf->blocksize = (fsize + fshift) * nchans;
00891     }
00892     wtf->audio = ckd_calloc(wtf->blocksize, sizeof(*wtf->audio));
00893     wtf->featsize = (wtf->blocksize / nchans - fsize) / fshift;
00894 
00895     /* Use the maximum of the input and output frame sizes to allocate this. */
00896     veclen = wtf->veclen;
00897     if (wtf->in_veclen > veclen) veclen = wtf->in_veclen;
00898     
00899     wtf->feat = ckd_calloc_2d(wtf->featsize, veclen, sizeof(**wtf->feat));
00900 
00901     /* Let's go! */
00902     if ((wtf->outfh = fopen(outfile, "wb")) == NULL) {
00903         E_ERROR_SYSTEM("Failed to open %s for writing", outfile);
00904         return -1;
00905     }
00906     /* Write an empty header, which we'll fill in later. */
00907     if (wtf->ot->output_header &&
00908         (*wtf->ot->output_header)(wtf, 0) < 0) {
00909         E_ERROR_SYSTEM("Failed to write empty header to %s\n", outfile);
00910         goto error_out;
00911     }
00912     wtf->outfile = ckd_salloc(outfile);
00913 
00914     if ((nfloat = (*atype->decode)(wtf)) < 0) {
00915         E_ERROR("Failed to convert");
00916         goto error_out;
00917     }
00918 
00919     if (wtf->ot->output_header) {
00920         if (fseek(wtf->outfh, 0, SEEK_SET) < 0) {
00921             E_ERROR_SYSTEM("Failed to seek to beginning of %s\n", outfile);
00922             goto error_out;
00923         }
00924         if ((*wtf->ot->output_header)(wtf, nfloat) < 0) {
00925             E_ERROR_SYSTEM("Failed to write header to %s\n", outfile);
00926             goto error_out;
00927         }
00928     }
00929     
00930 
00931     if (wtf->audio)
00932         ckd_free(wtf->audio);
00933     if (wtf->feat)
00934         ckd_free_2d(wtf->feat);
00935     if (wtf->infile)
00936         ckd_free(wtf->infile);
00937     if (wtf->outfile)
00938         ckd_free(wtf->outfile);
00939 
00940     wtf->audio = NULL;
00941     wtf->infile = NULL;
00942     wtf->feat = NULL;
00943     wtf->outfile = NULL;
00944 
00945     if (wtf->outfh)
00946         if (fclose(wtf->outfh) == EOF)
00947             E_ERROR_SYSTEM("Failed to close output file");
00948     wtf->outfh = NULL;
00949 
00950     return 0;
00951 
00952 error_out:
00953 
00954     if (wtf->audio)
00955         ckd_free(wtf->audio);
00956     if (wtf->feat)
00957         ckd_free_2d(wtf->feat);
00958     if (wtf->infile)
00959         ckd_free(wtf->infile);
00960     if (wtf->outfile)
00961         ckd_free(wtf->outfile);
00962 
00963     wtf->audio = NULL;
00964     wtf->infile = NULL;
00965     wtf->feat = NULL;
00966     wtf->outfile = NULL;
00967 
00968     if (wtf->outfh)
00969         if (fclose(wtf->outfh) == EOF)
00970             E_ERROR_SYSTEM("Failed to close output file");
00971     wtf->outfh = NULL;
00972 
00973     return -1;
00974 }
00975 
00976 void
00977 build_filenames(cmd_ln_t *config, char const *basename,
00978                 char **out_infile, char **out_outfile)
00979 {
00980     char const *di, *do_, *ei, *eo;
00981 
00982     di = cmd_ln_str_r(config, "-di");
00983     do_ = cmd_ln_str_r(config, "-do");
00984     ei = cmd_ln_str_r(config, "-ei");
00985     eo = cmd_ln_str_r(config, "-eo");
00986 
00987     *out_infile = string_join(di ? di : "",
00988                               di ? "/" : "",
00989                               basename,
00990                               ei ? "." : "",
00991                               ei ? ei : "",
00992                               NULL);
00993     *out_outfile = string_join(do_ ? do_ : "",
00994                                do_ ? "/" : "",
00995                                basename,
00996                                eo ? "." : "",
00997                                eo ? eo : "",
00998                               NULL);
00999     /* Build output directory structure if possible/requested (it is
01000      * by default). */
01001     if (cmd_ln_boolean_r(config, "-build_outdirs")) {
01002         char *dirname = ckd_salloc(*out_outfile);
01003         path2dirname(*out_outfile, dirname);
01004         build_directory(dirname);
01005         ckd_free(dirname);
01006     }
01007 }
01008 
01009 static int
01010 run_control_file(sphinx_wave2feat_t *wtf, char const *ctlfile)
01011 {
01012     hash_table_t *files;
01013     hash_iter_t *itor;
01014     lineiter_t *li;
01015     FILE *ctlfh;
01016     int nskip, runlen, npart, rv = 0;
01017 
01018     if ((ctlfh = fopen(ctlfile, "r")) == NULL) {
01019         E_ERROR_SYSTEM("Failed to open control file %s", ctlfile);
01020         return -1;
01021     }
01022     nskip = cmd_ln_int32_r(wtf->config, "-nskip");
01023     runlen = cmd_ln_int32_r(wtf->config, "-runlen");
01024     if ((npart = cmd_ln_int32_r(wtf->config, "-npart"))) {
01025         /* Count lines in the file. */
01026         int partlen, part, nlines = 0;
01027         part = cmd_ln_int32_r(wtf->config, "-part");
01028         for (li = lineiter_start(ctlfh); li; li = lineiter_next(li))
01029             ++nlines;
01030         fseek(ctlfh, 0, SEEK_SET);
01031         partlen = nlines / npart;
01032         nskip = partlen * (part - 1);
01033         if (part == npart)
01034             runlen = -1;
01035         else
01036             runlen = partlen;
01037     }
01038     if (runlen != -1){
01039         E_INFO("Processing %d utterances at position %d\n", runlen, nskip);
01040         files = hash_table_new(runlen, HASH_CASE_YES);
01041     }
01042     else {
01043         E_INFO("Processing all remaining utterances at position %d\n", nskip);
01044         files = hash_table_new(1000, HASH_CASE_YES);
01045     }
01046     for (li = lineiter_start(ctlfh); li; li = lineiter_next(li)) {
01047         char *c, *infile, *outfile;
01048 
01049         if (nskip-- > 0)
01050             continue;
01051         if (runlen == 0) {
01052             lineiter_free(li);
01053             break;
01054         }
01055         --runlen;
01056 
01057         string_trim(li->buf, STRING_BOTH);
01058         /* Extract the file ID from the control line. */
01059         if ((c = strchr(li->buf, ' ')) != NULL)
01060             *c = '\0';
01061         if (strlen(li->buf) == 0) {
01062             E_WARN("Empty line %d in control file, skipping\n", li->lineno);
01063             continue;
01064         }
01065         build_filenames(wtf->config, li->buf, &infile, &outfile);
01066         if (hash_table_lookup(files, infile, NULL) == 0)
01067             continue;
01068         rv = sphinx_wave2feat_convert_file(wtf, infile, outfile);
01069         hash_table_enter(files, infile, outfile);
01070         if (rv != 0) {
01071             lineiter_free(li);
01072             break;
01073         }
01074     }
01075     for (itor = hash_table_iter(files); itor;
01076          itor = hash_table_iter_next(itor)) {
01077         ckd_free((void *)hash_entry_key(itor->ent));
01078         ckd_free(hash_entry_val(itor->ent));
01079     }
01080     hash_table_free(files);
01081 
01082     if (fclose(ctlfh) == EOF)
01083         E_ERROR_SYSTEM("Failed to close control file");
01084     return rv;
01085 }
01086 
01087 int
01088 main(int argc, char *argv[])
01089 {
01090     sphinx_wave2feat_t *wtf;
01091     cmd_ln_t *config;
01092     int rv;
01093 
01094     /* Initialize config. */
01095     if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
01096         return 2;
01097 
01098     /* Parse an argument file if there's one in there. */
01099     if (cmd_ln_str_r(config, "-argfile"))
01100         config = cmd_ln_parse_file_r(config, defn,
01101                                      cmd_ln_str_r(config, "-argfile"), FALSE);
01102     if (config == NULL) {
01103         E_ERROR("Command line parsing failed\n");
01104         return 1;
01105     }
01106     if ((wtf = sphinx_wave2feat_init(config)) == NULL) {
01107         E_ERROR("Failed to initialize wave2feat object\n");
01108         return 1;
01109     }
01110 
01111     /* If there's a control file run through it, otherwise we will do
01112      * a single file (which is what run_control_file will do
01113      * internally too) */
01114     if (cmd_ln_str_r(config, "-c"))
01115         rv = run_control_file(wtf, cmd_ln_str_r(config, "-c"));
01116     else
01117         rv = sphinx_wave2feat_convert_file(wtf, cmd_ln_str_r(config, "-i"),
01118                                            cmd_ln_str_r(config, "-o"));
01119 
01120     sphinx_wave2feat_free(wtf);
01121     cmd_ln_free_r(config);
01122     return rv;
01123 }