SphinxBase 0.6

src/libsphinxbase/lm/ngram_model_arpa.c

/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/**
 * \file ngram_model_arpa.c ARPA format language models
 *
 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu>
 */

#include <string.h>
#include <limits.h>
#include <assert.h>

#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/err.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/listelem_alloc.h"
#include "sphinxbase/strfuncs.h"

#include "ngram_model_arpa.h"

static ngram_funcs_t ngram_model_arpa_funcs;

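/*
 * The lm3g structure stores trigram pointers in a compressed form:
 * each bigram keeps only a 16-bit trigram offset relative to the base
 * of its "segment" of BG_SEG_SZ (= 1 << LOG_BG_SEG_SZ) consecutive
 * bigrams, and tseg_base[] holds the absolute index of the first
 * trigram of each segment.  The macros below recover absolute
 * indices: FIRST_BG(m,u) is the first bigram of unigram u, and
 * FIRST_TG(m,b) is the first trigram of bigram b.
 */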
#define TSEG_BASE(m,b)          ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ])
#define FIRST_BG(m,u)           ((m)->lm3g.unigrams[u].bigrams)
#define FIRST_TG(m,b)           (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams))

/*
 * Read and return #unigrams, #bigrams, #trigrams as stated in the input file.
 */
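/*
 * The counts header parsed here looks like this (counts are
 * illustrative):
 *
 *   \data\
 *   ngram 1=64000
 *   ngram 2=4350000
 *   ngram 3=9377000
 */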
static int
ReadNgramCounts(lineiter_t **li, int32 * n_ug, int32 * n_bg, int32 * n_tg)
{
    int32 ngram, ngram_cnt;

    /* skip file until past the '\data\' marker */
    while (*li) {
        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\data\\") == 0)
            break;
        *li = lineiter_next(*li);
    }
    if (*li == NULL || strcmp((*li)->buf, "\\data\\") != 0) {
        E_INFO("No \\data\\ mark in LM file\n");
        return -1;
    }

    *n_ug = *n_bg = *n_tg = 0;
    while ((*li = lineiter_next(*li))) {
        if (sscanf((*li)->buf, "ngram %d=%d", &ngram, &ngram_cnt) != 2)
            break;
        switch (ngram) {
        case 1:
            *n_ug = ngram_cnt;
            break;
        case 2:
            *n_bg = ngram_cnt;
            break;
        case 3:
            *n_tg = ngram_cnt;
            break;
        default:
            E_ERROR("Unknown ngram (%d)\n", ngram);
            return -1;
        }
    }
    if (*li == NULL) {
        E_ERROR("EOF while reading ngram counts\n");
        return -1;
    }

    /* Position the iterator at the unigrams header '\1-grams:' */
    while ((*li = lineiter_next(*li))) {
        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\1-grams:") == 0)
            break;
    }
    if (*li == NULL) {
        E_ERROR("Failed to find \\1-grams: mark\n");
        return -1;
    }

    if ((*n_ug <= 0) || (*n_bg < 0) || (*n_tg < 0)) {
        E_ERROR("Bad or missing ngram count\n");
        return -1;
    }
    return 0;
}

/*
 * Read the unigrams from the given file into the LM structure.
 * On entry to this procedure, the iterator is positioned at the
 * header line '\1-grams:'.
 */
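/*
 * Each unigram line is "log10(p) word [log10(backoff)]", for example
 * (values are illustrative):
 *
 *   -1.6990 the -0.4771
 *   -4.3979 zygote
 */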
static int
ReadUnigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 wcnt;
    float p1;

    E_INFO("Reading unigrams\n");

    wcnt = 0;
    while ((*li = lineiter_next(*li))) {
        char *wptr[3], *name;
        float32 bo_wt = 0.0f;
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        if (strcmp((*li)->buf, "\\2-grams:") == 0
            || strcmp((*li)->buf, "\\end\\") == 0)
            break;

        if ((n = str2words((*li)->buf, wptr, 3)) < 2) {
            if ((*li)->buf[0] != '\0')
                E_WARN("Format error; unigram ignored: %s\n", (*li)->buf);
            continue;
        }
        else {
            p1 = (float)atof_c(wptr[0]);
            name = wptr[1];
            if (n == 3)
                bo_wt = (float)atof_c(wptr[2]);
        }

        if (wcnt >= base->n_counts[0]) {
            E_ERROR("Too many unigrams\n");
            return -1;
        }

        /* Associate name with word id */
        base->word_str[wcnt] = ckd_salloc(name);
        if (hash_table_enter(base->wid, base->word_str[wcnt], (void *)(long)wcnt)
            != (void *)(long)wcnt) {
            E_WARN("Duplicate word in language model: %s\n", base->word_str[wcnt]);
        }
        model->lm3g.unigrams[wcnt].prob1.l = logmath_log10_to_log(base->lmath, p1);
        model->lm3g.unigrams[wcnt].bo_wt1.l = logmath_log10_to_log(base->lmath, bo_wt);
        wcnt++;
    }

    if (base->n_counts[0] != wcnt) {
        E_WARN("lm_t.ucount(%d) != #unigrams read(%d)\n",
               base->n_counts[0], wcnt);
        base->n_counts[0] = wcnt;
        base->n_words = wcnt;
    }
    return 0;
}

/*
 * Read the bigrams from the given file into the given model structure.
 */
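/*
 * Each bigram line is "log10(p) w1 w2 [log10(backoff)]", and entries
 * must appear in the unigram order of w1, for example (values are
 * illustrative):
 *
 *   -2.3010 of the -0.3010
 *
 * Note that probabilities and backoff weights are not stored directly
 * in each bigram: sorted_id() pools the distinct values in a shared
 * sorted table, and the bigram records only the table index.
 */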
static int
ReadBigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 w1, w2, prev_w1, bgcount;
    bigram_t *bgptr;

    E_INFO("Reading bigrams\n");

    bgcount = 0;
    bgptr = model->lm3g.bigrams;
    prev_w1 = -1;

    while ((*li = lineiter_next(*li))) {
        float32 p, bo_wt = 0.0f;
        int32 p2, bo_wt2;
        char *wptr[4], *word1, *word2;
        int n;

        string_trim((*li)->buf, STRING_BOTH);
        wptr[3] = NULL;
        if ((n = str2words((*li)->buf, wptr, 4)) < 3) {
            if ((*li)->buf[0] != '\0')
                break;
            continue;
        }
        else {
            p = (float32)atof_c(wptr[0]);
            word1 = wptr[1];
            word2 = wptr[2];
            if (wptr[3])
                bo_wt = (float32)atof_c(wptr[3]);
        }

        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word1, word1, word2);
            continue;
        }
        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping bigram (%s %s)\n",
                    word2, word1, word2);
            continue;
        }

        /* FIXME: Should use logmath_t quantization here. */
        /* HACK!! to quantize probs to 4 decimal digits */
        p = (float32)((int32)(p * 10000)) / 10000;
        bo_wt = (float32)((int32)(bo_wt * 10000)) / 10000;

        p2 = logmath_log10_to_log(base->lmath, p);
        bo_wt2 = logmath_log10_to_log(base->lmath, bo_wt);

        if (bgcount >= base->n_counts[1]) {
            E_ERROR("Too many bigrams\n");
            return -1;
        }

        bgptr->wid = w2;
        bgptr->prob2 = sorted_id(&model->sorted_prob2, &p2);
        if (base->n_counts[2] > 0)
            bgptr->bo_wt2 = sorted_id(&model->sorted_bo_wt2, &bo_wt2);

        if (w1 != prev_w1) {
            if (w1 < prev_w1) {
                E_ERROR("Bigrams not in unigram order\n");
                return -1;
            }

            for (prev_w1++; prev_w1 <= w1; prev_w1++)
                model->lm3g.unigrams[prev_w1].bigrams = bgcount;
            prev_w1 = w1;
        }
        bgcount++;
        bgptr++;

        if ((bgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }
    if (*li == NULL || ((strcmp((*li)->buf, "\\end\\") != 0)
                        && (strcmp((*li)->buf, "\\3-grams:") != 0))) {
        /* Guard against dereferencing a NULL iterator at end of file. */
        E_ERROR("Bad bigram: %s\n", *li ? (*li)->buf : "EOF");
        return -1;
    }

    for (prev_w1++; prev_w1 <= base->n_counts[0]; prev_w1++)
        model->lm3g.unigrams[prev_w1].bigrams = bgcount;

    return 0;
}

/*
 * Read the trigrams from the given file.  Very similar to ReadBigrams.
 */
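/*
 * Each trigram line is "log10(p) w1 w2 w3"; there is no backoff
 * weight, since trigrams are the highest order supported here.
 * Entries must appear in bigram order of <w1,w2>, for example
 * (values are illustrative):
 *
 *   -0.3010 one of the
 */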
static int
ReadTrigrams(lineiter_t **li, ngram_model_arpa_t * model)
{
    ngram_model_t *base = &model->base;
    int32 i, w1, w2, w3, prev_w1, prev_w2, tgcount, prev_bg, bg, endbg;
    int32 seg, prev_seg, prev_seg_lastbg;
    trigram_t *tgptr;
    bigram_t *bgptr;

    E_INFO("Reading trigrams\n");

    tgcount = 0;
    tgptr = model->lm3g.trigrams;
    prev_w1 = -1;
    prev_w2 = -1;
    prev_bg = -1;
    prev_seg = -1;

    while ((*li = lineiter_next(*li))) {
        float32 p;
        int32 p3;
        char *wptr[4], *word1, *word2, *word3;

        string_trim((*li)->buf, STRING_BOTH);
        if (str2words((*li)->buf, wptr, 4) != 4) {
            if ((*li)->buf[0] != '\0')
                break;
            continue;
        }
        else {
            p = (float32)atof_c(wptr[0]);
            word1 = wptr[1];
            word2 = wptr[2];
            word3 = wptr[3];
        }

        if ((w1 = ngram_wid(base, word1)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word1, word1, word2, word3);
            continue;
        }
        if ((w2 = ngram_wid(base, word2)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word2, word1, word2, word3);
            continue;
        }
        if ((w3 = ngram_wid(base, word3)) == NGRAM_INVALID_WID) {
            E_ERROR("Unknown word: %s, skipping trigram (%s %s %s)\n",
                    word3, word1, word2, word3);
            continue;
        }

        /* FIXME: Should use logmath_t quantization here. */
        /* HACK!! to quantize probs to 4 decimal digits */
        p = (float32)((int32)(p * 10000)) / 10000;
        p3 = logmath_log10_to_log(base->lmath, p);

        if (tgcount >= base->n_counts[2]) {
            E_ERROR("Too many trigrams\n");
            return -1;
        }

        tgptr->wid = w3;
        tgptr->prob3 = sorted_id(&model->sorted_prob3, &p3);

        if ((w1 != prev_w1) || (w2 != prev_w2)) {
            /* Trigram for a new bigram; update tg info for all previous bigrams */
            if ((w1 < prev_w1) || ((w1 == prev_w1) && (w2 < prev_w2))) {
                E_ERROR("Trigrams not in bigram order\n");
                return -1;
            }

            bg = (w1 != prev_w1) ? model->lm3g.unigrams[w1].bigrams : prev_bg + 1;
            endbg = model->lm3g.unigrams[w1 + 1].bigrams;
            bgptr = model->lm3g.bigrams + bg;
            for (; (bg < endbg) && (bgptr->wid != w2); bg++, bgptr++);
            if (bg >= endbg) {
                E_ERROR("Missing bigram for trigram: %s\n", (*li)->buf);
                return -1;
            }

            /* bg = bigram entry index for <w1,w2>.  Update tseg_base */
            seg = bg >> LOG_BG_SEG_SZ;
            for (i = prev_seg + 1; i <= seg; i++)
                model->lm3g.tseg_base[i] = tgcount;

            /* Update trigram pointers for all bigrams until bg */
            if (prev_seg < seg) {
                int32 tgoff = 0;

                if (prev_seg >= 0) {
                    tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
                    if (tgoff > 65535) {
                        E_ERROR("Trigram segment size exceeds 65535; language models this large are not supported, use a smaller vocabulary\n");
                        return -1;
                    }
                }

                prev_seg_lastbg = ((prev_seg + 1) << LOG_BG_SEG_SZ) - 1;
                bgptr = model->lm3g.bigrams + prev_bg;
                for (++prev_bg, ++bgptr; prev_bg <= prev_seg_lastbg;
                     prev_bg++, bgptr++)
                    bgptr->trigrams = tgoff;

                for (; prev_bg <= bg; prev_bg++, bgptr++)
                    bgptr->trigrams = 0;
            }
            else {
                int32 tgoff;

                tgoff = tgcount - model->lm3g.tseg_base[prev_seg];
                if (tgoff > 65535) {
                    E_ERROR("Trigram segment size exceeds 65535; language models this large are not supported, use a smaller vocabulary\n");
                    return -1;
                }

                bgptr = model->lm3g.bigrams + prev_bg;
                for (++prev_bg, ++bgptr; prev_bg <= bg; prev_bg++, bgptr++)
                    bgptr->trigrams = tgoff;
            }

            prev_w1 = w1;
            prev_w2 = w2;
            prev_bg = bg;
            prev_seg = seg;
        }

        tgcount++;
        tgptr++;

        if ((tgcount & 0x0000ffff) == 0) {
            E_INFOCONT(".");
        }
    }
    if (*li == NULL || strcmp((*li)->buf, "\\end\\") != 0) {
        /* Guard against dereferencing a NULL iterator at end of file. */
        E_ERROR("Bad trigram: %s\n", *li ? (*li)->buf : "EOF");
        return -1;
    }

    for (prev_bg++; prev_bg <= base->n_counts[1]; prev_bg++) {
        if ((prev_bg & (BG_SEG_SZ - 1)) == 0)
            model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ] = tgcount;
        if ((tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ]) > 65535) {
            E_ERROR("Trigram segment size exceeds 65535; language models this large are not supported, use a smaller vocabulary\n");
            return -1;
        }
        model->lm3g.bigrams[prev_bg].trigrams =
            tgcount - model->lm3g.tseg_base[prev_bg >> LOG_BG_SEG_SZ];
    }
    return 0;
}

static unigram_t *
new_unigram_table(int32 n_ug)
{
    unigram_t *table;
    int32 i;

    table = ckd_calloc(n_ug, sizeof(unigram_t));
    for (i = 0; i < n_ug; i++) {
        table[i].prob1.l = INT_MIN;
        table[i].bo_wt1.l = INT_MIN;
    }
    return table;
}

ngram_model_t *
ngram_model_arpa_read(cmd_ln_t *config,
                      const char *file_name,
                      logmath_t *lmath)
{
    lineiter_t *li;
    FILE *fp;
    int32 is_pipe;
    int32 n_unigram;
    int32 n_bigram;
    int32 n_trigram;
    int32 n;
    ngram_model_arpa_t *model;
    ngram_model_t *base;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s", file_name);
        return NULL;
    }
    li = lineiter_start(fp);

    /* Read #unigrams, #bigrams, #trigrams from file */
    if (ReadNgramCounts(&li, &n_unigram, &n_bigram, &n_trigram) == -1) {
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        return NULL;
    }
    E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram);

    /* Allocate space for LM, including initial OOVs and placeholders; initialize it */
    model = ckd_calloc(1, sizeof(*model));
    base = &model->base;
    if (n_trigram > 0)
        n = 3;
    else if (n_bigram > 0)
        n = 2;
    else
        n = 1;
    /* Initialize base model. */
    ngram_model_init(base, &ngram_model_arpa_funcs, lmath, n, n_unigram);
    base->n_counts[0] = n_unigram;
    base->n_counts[1] = n_bigram;
    base->n_counts[2] = n_trigram;
    base->writable = TRUE;

    /*
     * Allocate one extra unigram and bigram entry: sentinels to terminate
     * followers (bigrams and trigrams, respectively) of previous entry.
     */
    model->lm3g.unigrams = new_unigram_table(n_unigram + 1);
    model->lm3g.bigrams =
        ckd_calloc(n_bigram + 1, sizeof(bigram_t));
    if (n_trigram > 0)
        model->lm3g.trigrams =
            ckd_calloc(n_trigram, sizeof(trigram_t));

    if (n_trigram > 0) {
        model->lm3g.tseg_base =
            ckd_calloc((n_bigram + 1) / BG_SEG_SZ + 1,
                       sizeof(int32));
    }
    if (ReadUnigrams(&li, model) == -1) {
        lineiter_free(li);
        fclose_comp(fp, is_pipe);
        ngram_model_free(base);
        return NULL;
    }
    E_INFO("%8d = #unigrams created\n", base->n_counts[0]);

    init_sorted_list(&model->sorted_prob2);
    if (base->n_counts[2] > 0)
        init_sorted_list(&model->sorted_bo_wt2);

    if (base->n_counts[1] > 0) {
        if (ReadBigrams(&li, model) == -1) {
            lineiter_free(li);
            fclose_comp(fp, is_pipe);
            ngram_model_free(base);
            return NULL;
        }

        base->n_counts[1] = FIRST_BG(model, base->n_counts[0]);
        model->lm3g.n_prob2 = model->sorted_prob2.free;
        model->lm3g.prob2 = vals_in_sorted_list(&model->sorted_prob2);
        free_sorted_list(&model->sorted_prob2);
        E_INFO("%8d = #bigrams created\n", base->n_counts[1]);
        E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2);
    }

    if (base->n_counts[2] > 0) {
        /* Create trigram bo-wts array */
        model->lm3g.n_bo_wt2 = model->sorted_bo_wt2.free;
        model->lm3g.bo_wt2 = vals_in_sorted_list(&model->sorted_bo_wt2);
        free_sorted_list(&model->sorted_bo_wt2);
        E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2);

        init_sorted_list(&model->sorted_prob3);

        if (ReadTrigrams(&li, model) == -1) {
            lineiter_free(li);
            fclose_comp(fp, is_pipe);
            ngram_model_free(base);
            return NULL;
        }

        base->n_counts[2] = FIRST_TG(model, base->n_counts[1]);
        model->lm3g.n_prob3 = model->sorted_prob3.free;
        model->lm3g.prob3 = vals_in_sorted_list(&model->sorted_prob3);
        E_INFO("%8d = #trigrams created\n", base->n_counts[2]);
        E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3);

        free_sorted_list(&model->sorted_prob3);

        /* Initialize tginfo */
        model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *));
        model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t));
    }

    lineiter_free(li);
    fclose_comp(fp, is_pipe);
    return base;
}
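
/*
 * Usage sketch (the file name is hypothetical, and in practice this
 * reader is usually reached through the generic ngram_model_read()
 * dispatch rather than called directly):
 *
 *   logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *   ngram_model_t *lm = ngram_model_arpa_read(NULL, "model.arpa", lmath);
 *   if (lm == NULL)
 *       E_FATAL("Failed to read model.arpa\n");
 *   ...use lm, e.g. with ngram_score()...
 *   ngram_model_free(lm);
 *   logmath_free(lmath);
 */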

int
ngram_model_arpa_write(ngram_model_t *model,
                       const char *file_name)
{
    ngram_iter_t *itor;
    FILE *fh;
    int i;

    if ((fh = fopen(file_name, "w")) == NULL) {
        E_ERROR_SYSTEM("Failed to open %s for writing", file_name);
        return -1;
    }
    fprintf(fh, "This is an ARPA-format language model file, generated by CMU Sphinx\n");

    /* The ARPA format doesn't require any extra information that
     * N-Gram iterators can't give us, so this is very
     * straightforward compared with DMP writing. */

    /* Write N-gram counts. */
    fprintf(fh, "\\data\\\n");
    for (i = 0; i < model->n; ++i) {
        fprintf(fh, "ngram %d=%d\n", i+1, model->n_counts[i]);
    }

    /* Write N-grams */
    for (i = 0; i < model->n; ++i) {
        fprintf(fh, "\n\\%d-grams:\n", i + 1);
        for (itor = ngram_model_mgrams(model, i); itor; itor = ngram_iter_next(itor)) {
            int32 const *wids;
            int32 score, bowt;
            int j;

            wids = ngram_iter_get(itor, &score, &bowt);
            fprintf(fh, "%.4f ", logmath_log_to_log10(model->lmath, score));
            for (j = 0; j <= i; ++j) {
                assert(wids[j] < model->n_counts[0]);
                fprintf(fh, "%s ", model->word_str[wids[j]]);
            }
            if (i < model->n-1)
                fprintf(fh, "%.4f", logmath_log_to_log10(model->lmath, bowt));
            fprintf(fh, "\n");
        }
    }
    fprintf(fh, "\n\\end\\\n");
    return fclose(fh);
}

static int
ngram_model_arpa_apply_weights(ngram_model_t *base, float32 lw,
                               float32 wip, float32 uw)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    lm3g_apply_weights(base, &model->lm3g, lw, wip, uw);
    return 0;
}

/* Lousy "templating" for things that are largely the same in DMP and
 * ARPA models, except for the bigram and trigram types and some
 * names. */
#define NGRAM_MODEL_TYPE ngram_model_arpa_t
#include "lm3g_templates.c"

static void
ngram_model_arpa_free(ngram_model_t *base)
{
    ngram_model_arpa_t *model = (ngram_model_arpa_t *)base;
    ckd_free(model->lm3g.unigrams);
    ckd_free(model->lm3g.bigrams);
    ckd_free(model->lm3g.trigrams);
    ckd_free(model->lm3g.prob2);
    ckd_free(model->lm3g.bo_wt2);
    ckd_free(model->lm3g.prob3);
    lm3g_tginfo_free(base, &model->lm3g);
    ckd_free(model->lm3g.tseg_base);
}

static ngram_funcs_t ngram_model_arpa_funcs = {
    ngram_model_arpa_free,          /* free */
    ngram_model_arpa_apply_weights, /* apply_weights */
    lm3g_template_score,            /* score */
    lm3g_template_raw_score,        /* raw_score */
    lm3g_template_add_ug,           /* add_ug */
    lm3g_template_flush,            /* flush */
    lm3g_template_iter,             /* iter */
    lm3g_template_mgrams,           /* mgrams */
    lm3g_template_successors,       /* successors */
    lm3g_template_iter_get,         /* iter_get */
    lm3g_template_iter_next,        /* iter_next */
    lm3g_template_iter_free         /* iter_free */
};