SphinxBase 0.6

src/sphinx_lmtools/sphinx_lm_convert.c

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 2009 Carnegie Mellon University.  All rights 
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00041 #include <sphinxbase/logmath.h>
00042 #include <sphinxbase/ngram_model.h>
00043 #include <sphinxbase/cmd_ln.h>
00044 #include <sphinxbase/ckd_alloc.h>
00045 #include <sphinxbase/err.h>
00046 #include <sphinxbase/pio.h>
00047 #include <sphinxbase/strfuncs.h>
00048 
00049 #include <stdio.h>
00050 #include <string.h>
00051 #include <math.h>
00052 
00053 static const arg_t defn[] = {
00054   { "-help",
00055     ARG_BOOLEAN,
00056     "no",
00057     "Shows the usage of the tool"},
00058 
00059   { "-logbase",
00060     ARG_FLOAT64,
00061     "1.0001",
00062     "Base in which all log-likelihoods calculated" },
00063 
00064   { "-i",
00065     REQARG_STRING,
00066     NULL,
00067     "Input language model file (required)"},
00068 
00069   { "-o",
00070     REQARG_STRING,
00071     NULL,
00072     "Output language model file (required)"},
00073 
00074   { "-ifmt",
00075     ARG_STRING,
00076     NULL,
00077     "Input language model format (will guess if not specified)"},
00078 
00079   { "-ofmt",
00080     ARG_STRING,
00081     NULL,
00082     "Output language model file (will guess if not specified)"},
00083 
00084   { "-ienc",
00085     ARG_STRING,
00086     NULL,
00087     "Input language model text encoding (no conversion done if not specified)"},
00088 
00089   { "-oenc",
00090     ARG_STRING,
00091     "utf8",
00092     "Output language model text encoding"},
00093 
00094   { "-case",
00095     ARG_STRING,
00096     NULL,
00097     "Ether 'lower' or 'upper' - case fold to lower/upper case (NOT UNICODE AWARE)" },
00098 
00099   { "-mmap",
00100     ARG_BOOLEAN,
00101     "no",
00102     "Use memory-mapped I/O for reading binary LM files"},
00103 
00104   { "-debug",
00105     ARG_INT32,
00106     NULL,
00107     "Verbosity level for debugging messages"
00108   },
00109 
00110   { NULL, 0, NULL, NULL }
00111 };
00112 
/*
 * Print a short usage summary through the logging macros and terminate
 * the process with exit status 0.  This function does not return.
 *
 * pgm: program name to show in the usage line (main() passes argv[0]).
 */
static void
usagemsg(char *pgm)
{
    /* First line goes through E_INFO so it carries the usual log header. */
    E_INFO("Usage: %s -i <input.lm> \\\n", pgm);

    /* Remaining lines are continuations emitted in a single call. */
    E_INFOCONT("\t[-ifmt txt] [-ofmt dmp]\n"
               "\t-o <output.lm.DMP>\n");

    exit(0);
}
00122 
00123 
00124 int
00125 main(int argc, char *argv[])
00126 {
00127         cmd_ln_t *config;
00128         ngram_model_t *lm = NULL;
00129         logmath_t *lmath;
00130         int itype, otype;
00131         char const *kase;
00132 
00133         if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL)
00134                 return 1;
00135                 
00136         if (cmd_ln_boolean_r(config, "-help")) {
00137             usagemsg(argv[0]);
00138         }
00139 
00140         err_set_debug_level(cmd_ln_int32_r(config, "-debug"));
00141 
00142         /* Create log math object. */
00143         if ((lmath = logmath_init
00144              (cmd_ln_float64_r(config, "-logbase"), 0, 0)) == NULL) {
00145                 E_FATAL("Failed to initialize log math\n");
00146         }
00147         
00148         if (cmd_ln_str_r(config, "-i") == NULL || cmd_ln_str_r(config, "-i") == NULL) {
00149             E_ERROR("Please specify both input and output models\n");
00150             goto error_out;
00151         }
00152             
00153         
00154         /* Load the input language model. */
00155         if (cmd_ln_str_r(config, "-ifmt")) {
00156             if ((itype = ngram_str_to_type(cmd_ln_str_r(config, "-ifmt")))
00157                 == NGRAM_INVALID) {
00158                 E_ERROR("Invalid input type %s\n", cmd_ln_str_r(config, "-ifmt"));
00159                 goto error_out;
00160             }
00161             lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00162                                   itype, lmath);
00163         }
00164         else {
00165             lm = ngram_model_read(config, cmd_ln_str_r(config, "-i"),
00166                                   NGRAM_AUTO, lmath);
00167         }
00168 
00169         if (lm == NULL) {
00170             E_FATAL("Failed to read the model from the file '%s'", cmd_ln_str_r(config, "-i"));
00171         }
00172 
00173         /* Guess or set the output language model type. */
00174         if (cmd_ln_str_r(config, "-ofmt")) {
00175             if ((otype = ngram_str_to_type(cmd_ln_str_r(config, "-ofmt")))
00176                 == NGRAM_INVALID) {
00177                 E_ERROR("Invalid output type %s\n", cmd_ln_str_r(config, "-ofmt"));
00178                 goto error_out;
00179             }
00180         }
00181         else {
00182             otype = ngram_file_name_to_type(cmd_ln_str_r(config, "-o"));
00183         }
00184 
00185         /* Recode the language model if desired. */
00186         if (cmd_ln_str_r(config, "-ienc")) {
00187             if (ngram_model_recode(lm, cmd_ln_str_r(config, "-ienc"),
00188                                    cmd_ln_str_r(config, "-oenc")) != 0) {
00189                 E_ERROR("Failed to recode language model from %s to %s\n",
00190                         cmd_ln_str_r(config, "-ienc"),
00191                         cmd_ln_str_r(config, "-oenc"));
00192                 goto error_out;
00193             }
00194         }
00195 
00196         /* Case fold if requested. */
00197         if ((kase = cmd_ln_str_r(config, "-case"))) {
00198             if (0 == strcmp(kase, "lower")) {
00199                 ngram_model_casefold(lm, NGRAM_LOWER);
00200             }
00201             else if (0 == strcmp(kase, "upper")) {
00202                 ngram_model_casefold(lm, NGRAM_UPPER);
00203             }
00204             else {
00205                 E_ERROR("Unknown value for -case: %s\n", kase);
00206                 goto error_out;
00207             }
00208         }
00209 
00210         /* Write the output language model. */
00211         if (ngram_model_write(lm, cmd_ln_str_r(config, "-o"), otype) != 0) {
00212             E_ERROR("Failed to write language model in format %s to %s\n",
00213                     ngram_type_to_str(otype), cmd_ln_str_r(config, "-o"));
00214             goto error_out;
00215         }
00216 
00217         /* That's all folks! */
00218         ngram_model_free(lm);
00219         return 0;
00220 
00221 error_out:
00222         ngram_model_free(lm);
00223         return 1;
00224 }