SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1996-2004 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 00038 /* 00039 * fe.h 00040 * 00041 * $Log: fe.h,v $ 00042 * Revision 1.11 2005/02/05 02:15:02 egouvea 00043 * Removed fe_process(), never used 00044 * 00045 * Revision 1.10 2004/12/10 16:48:55 rkm 00046 * Added continuous density acoustic model handling 00047 * 00048 * 00049 */ 00050 00051 #if defined(WIN32) && !defined(GNUWINCE) 00052 #define srand48(x) srand(x) 00053 #define lrand48() rand() 00054 #endif 00055 00056 #ifndef _NEW_FE_H_ 00057 #define _NEW_FE_H_ 00058 00059 /* Win32/WinCE DLL gunk */ 00060 #include <sphinxbase/sphinxbase_export.h> 00061 00062 #include <sphinxbase/cmd_ln.h> 00063 #include <sphinxbase/fixpoint.h> 00064 00065 #ifdef __cplusplus 00066 extern "C" { 00067 #endif 00068 #if 0 00069 /* Fool Emacs. */ 00070 } 00071 #endif 00072 00073 #ifdef WORDS_BIGENDIAN 00074 #define NATIVE_ENDIAN "big" 00075 #else 00076 #define NATIVE_ENDIAN "little" 00077 #endif 00078 00080 #define DEFAULT_SAMPLING_RATE 16000 00081 00082 #define DEFAULT_FRAME_RATE 100 00083 00085 #define DEFAULT_FRAME_SHIFT 160 00086 00087 #define DEFAULT_WINDOW_LENGTH 0.025625 00088 00089 #define DEFAULT_FFT_SIZE 512 00090 00091 #define DEFAULT_NUM_CEPSTRA 13 00092 00093 #define DEFAULT_NUM_FILTERS 40 00094 00095 #define DEFAULT_LOWER_FILT_FREQ 133.33334 00096 00097 #define DEFAULT_UPPER_FILT_FREQ 6855.4976 00098 00099 #define DEFAULT_PRE_EMPHASIS_ALPHA 0.97 00100 00101 #define DEFAULT_WARP_TYPE "inverse_linear" 00102 00103 #define SEED -1 00104 00105 #define waveform_to_cepstral_command_line_macro() \ 00106 { "-logspec", \ 00107 ARG_BOOLEAN, \ 00108 "no", \ 00109 "Write out logspectral files instead of cepstra" }, \ 00110 \ 00111 { "-smoothspec", \ 00112 ARG_BOOLEAN, \ 00113 "no", \ 00114 "Write out cepstral-smoothed logspectral files" }, \ 00115 \ 00116 { "-transform", \ 00117 ARG_STRING, \ 00118 "legacy", \ 00119 "Which type of transform to use to calculate cepstra (legacy, dct, or htk)" }, \ 00120 \ 00121 { "-alpha", \ 00122 ARG_FLOAT32, \ 00123 ARG_STRINGIFY(DEFAULT_PRE_EMPHASIS_ALPHA), \ 00124 "Preemphasis parameter" }, \ 00125 \ 00126 { "-samprate", \ 00127 ARG_FLOAT32, \ 00128 ARG_STRINGIFY(DEFAULT_SAMPLING_RATE), \ 00129 "Sampling rate" }, \ 00130 \ 00131 { "-frate", \ 00132 ARG_INT32, \ 00133 ARG_STRINGIFY(DEFAULT_FRAME_RATE), \ 00134 "Frame rate" }, \ 00135 \ 00136 { "-wlen", \ 00137 ARG_FLOAT32, \ 00138 ARG_STRINGIFY(DEFAULT_WINDOW_LENGTH), \ 00139 "Hamming window length" }, \ 00140 \ 00141 { "-nfft", \ 00142 ARG_INT32, \ 00143 ARG_STRINGIFY(DEFAULT_FFT_SIZE), \ 00144 "Size of FFT" }, \ 00145 \ 00146 { "-nfilt", \ 00147 ARG_INT32, \ 00148 ARG_STRINGIFY(DEFAULT_NUM_FILTERS), \ 00149 "Number of filter banks" }, \ 00150 \ 00151 { "-lowerf", \ 00152 ARG_FLOAT32, \ 00153 ARG_STRINGIFY(DEFAULT_LOWER_FILT_FREQ), \ 00154 "Lower edge of filters" }, \ 00155 \ 00156 { "-upperf", \ 00157 ARG_FLOAT32, \ 00158 ARG_STRINGIFY(DEFAULT_UPPER_FILT_FREQ), \ 00159 "Upper edge of filters" }, \ 00160 \ 00161 { "-unit_area", \ 00162 ARG_BOOLEAN, \ 00163 "yes", \ 00164 "Normalize mel filters to unit area" }, \ 00165 \ 00166 { "-round_filters", \ 00167 ARG_BOOLEAN, \ 00168 "yes", \ 00169 "Round mel filter frequencies to DFT points" }, \ 00170 \ 00171 { "-ncep", \ 00172 ARG_INT32, \ 00173 ARG_STRINGIFY(DEFAULT_NUM_CEPSTRA), \ 00174 "Number of cep coefficients" }, \ 00175 \ 00176 { "-doublebw", \ 00177 ARG_BOOLEAN, \ 00178 "no", \ 00179 "Use double bandwidth filters (same center freq)" }, \ 00180 \ 00181 { "-lifter", \ 00182 ARG_INT32, \ 00183 "0", \ 00184 "Length of sin-curve for liftering, or 0 for no liftering." }, \ 00185 \ 00186 { "-input_endian", \ 00187 ARG_STRING, \ 00188 NATIVE_ENDIAN, \ 00189 "Endianness of input data, big or little, ignored if NIST or MS Wav" }, \ 00190 \ 00191 { "-warp_type", \ 00192 ARG_STRING, \ 00193 DEFAULT_WARP_TYPE, \ 00194 "Warping function type (or shape)" }, \ 00195 \ 00196 { "-warp_params", \ 00197 ARG_STRING, \ 00198 NULL, \ 00199 "Parameters defining the warping function" }, \ 00200 \ 00201 { "-dither", \ 00202 ARG_BOOLEAN, \ 00203 "no", \ 00204 "Add 1/2-bit noise" }, \ 00205 \ 00206 { "-seed", \ 00207 ARG_INT32, \ 00208 ARG_STRINGIFY(SEED), \ 00209 "Seed for random number generator; if less than zero, pick our own" }, \ 00210 \ 00211 { "-remove_dc", \ 00212 ARG_BOOLEAN, \ 00213 "no", \ 00214 "Remove DC offset from each frame" }, \ 00215 \ 00216 { "-verbose", \ 00217 ARG_BOOLEAN, \ 00218 "no", \ 00219 "Show input filenames" } \ 00220 00221 00222 #ifdef FIXED_POINT 00223 00224 typedef fixed32 mfcc_t; 00225 00227 #define FLOAT2MFCC(x) FLOAT2FIX(x) 00228 00229 #define MFCC2FLOAT(x) FIX2FLOAT(x) 00230 00231 #define MFCCMUL(a,b) FIXMUL(a,b) 00232 #define MFCCLN(x,in,out) FIXLN_ANY(x,in,out) 00233 #else /* !FIXED_POINT */ 00234 00236 typedef float32 mfcc_t; 00238 #define FLOAT2MFCC(x) (x) 00239 00240 #define MFCC2FLOAT(x) (x) 00241 00242 #define MFCCMUL(a,b) ((a)*(b)) 00243 #define MFCCLN(x,in,out) log(x) 00244 #endif /* !FIXED_POINT */ 00245 00249 typedef struct fe_s fe_t; 00250 00254 enum fe_error_e { 00255 FE_SUCCESS = 0, 00256 FE_OUTPUT_FILE_SUCCESS = 0, 00257 FE_CONTROL_FILE_ERROR = -1, 00258 FE_START_ERROR = -2, 00259 FE_UNKNOWN_SINGLE_OR_BATCH = -3, 00260 FE_INPUT_FILE_OPEN_ERROR = -4, 00261 FE_INPUT_FILE_READ_ERROR = -5, 00262 FE_MEM_ALLOC_ERROR = -6, 00263 FE_OUTPUT_FILE_WRITE_ERROR = -7, 00264 FE_OUTPUT_FILE_OPEN_ERROR = -8, 00265 FE_ZERO_ENERGY_ERROR = -9, 00266 FE_INVALID_PARAM_ERROR = -10 00267 }; 00268 00276 SPHINXBASE_EXPORT 00277 fe_t* fe_init_auto(void); 00278 00286 SPHINXBASE_EXPORT 00287 arg_t const *fe_get_args(void); 00288 00299 SPHINXBASE_EXPORT 00300 fe_t *fe_init_auto_r(cmd_ln_t *config); 00301 00309 SPHINXBASE_EXPORT 00310 const cmd_ln_t *fe_get_config(fe_t *fe); 00311 00316 SPHINXBASE_EXPORT 00317 int fe_start_utt(fe_t *fe); 00318 00331 SPHINXBASE_EXPORT 00332 int fe_get_output_size(fe_t *fe); 00333 00346 SPHINXBASE_EXPORT 00347 void fe_get_input_size(fe_t *fe, int *out_frame_shift, 00348 int *out_frame_size); 00349 00364 SPHINXBASE_EXPORT 00365 int fe_end_utt(fe_t *fe, mfcc_t *out_cepvector, int32 *out_nframes); 00366 00372 SPHINXBASE_EXPORT 00373 fe_t *fe_retain(fe_t *fe); 00374 00382 SPHINXBASE_EXPORT 00383 int fe_free(fe_t *fe); 00384 00393 SPHINXBASE_EXPORT 00394 int fe_process_frame(fe_t *fe, int16 const *spch, 00395 int32 nsamps, mfcc_t *out_cep); 00396 00444 SPHINXBASE_EXPORT 00445 int fe_process_frames(fe_t *fe, 00446 int16 const **inout_spch, 00447 size_t *inout_nsamps, 00448 mfcc_t **buf_cep, 00449 int32 *inout_nframes); 00450 00466 SPHINXBASE_EXPORT 00467 int fe_process_utt(fe_t *fe, 00468 int16 const *spch, 00469 size_t nsamps, 00470 mfcc_t ***cep_block, 00471 int32 *nframes 00472 ); 00473 00477 SPHINXBASE_EXPORT 00478 void fe_free_2d(void *arr); 00479 00483 SPHINXBASE_EXPORT 00484 int fe_mfcc_to_float(fe_t *fe, 00485 mfcc_t **input, 00486 float32 **output, 00487 int32 nframes); 00488 00492 SPHINXBASE_EXPORT 00493 int fe_float_to_mfcc(fe_t *fe, 00494 float32 **input, 00495 mfcc_t **output, 00496 int32 nframes); 00497 00521 SPHINXBASE_EXPORT 00522 int fe_logspec_to_mfcc(fe_t *fe, 00523 const mfcc_t *fr_spec, 00524 mfcc_t *fr_cep 00525 ); 00526 00535 SPHINXBASE_EXPORT 00536 int fe_logspec_dct2(fe_t *fe, 00537 const mfcc_t *fr_spec, 00538 mfcc_t *fr_cep 00539 ); 00540 00549 SPHINXBASE_EXPORT 00550 int fe_mfcc_dct3(fe_t *fe, 00551 const mfcc_t *fr_cep, 00552 mfcc_t *fr_spec 00553 ); 00554 00555 #ifdef __cplusplus 00556 } 00557 #endif 00558 00559 00560 #endif