SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * \file ngram_model_dmp.c DMP format language models 00039 * 00040 * Author: David Huggins-Daines <dhuggins@cs.cmu.edu> 00041 */ 00042 00043 #include <assert.h> 00044 #include <stdio.h> 00045 #include <string.h> 00046 #include <stdlib.h> 00047 #include <limits.h> 00048 00049 #include "sphinxbase/ckd_alloc.h" 00050 #include "sphinxbase/pio.h" 00051 #include "sphinxbase/err.h" 00052 #include "sphinxbase/byteorder.h" 00053 #include "sphinxbase/listelem_alloc.h" 00054 00055 #include "ngram_model_dmp.h" 00056 00057 static const char darpa_hdr[] = "Darpa Trigram LM"; 00058 static ngram_funcs_t ngram_model_dmp_funcs; 00059 00060 #define TSEG_BASE(m,b) ((m)->lm3g.tseg_base[(b)>>LOG_BG_SEG_SZ]) 00061 #define FIRST_BG(m,u) ((m)->lm3g.unigrams[u].bigrams) 00062 #define FIRST_TG(m,b) (TSEG_BASE((m),(b))+((m)->lm3g.bigrams[b].trigrams)) 00063 00064 static unigram_t * 00065 new_unigram_table(int32 n_ug) 00066 { 00067 unigram_t *table; 00068 int32 i; 00069 00070 table = ckd_calloc(n_ug, sizeof(unigram_t)); 00071 for (i = 0; i < n_ug; i++) { 00072 table[i].prob1.f = -99.0; 00073 table[i].bo_wt1.f = -99.0; 00074 } 00075 return table; 00076 } 00077 00078 ngram_model_t * 00079 ngram_model_dmp_read(cmd_ln_t *config, 00080 const char *file_name, 00081 logmath_t *lmath) 00082 { 00083 ngram_model_t *base; 00084 ngram_model_dmp_t *model; 00085 FILE *fp; 00086 int do_mmap, do_swap; 00087 int32 is_pipe; 00088 int32 i, j, k, vn, n, ts; 00089 int32 n_unigram; 00090 int32 n_bigram; 00091 int32 n_trigram; 00092 char str[1024]; 00093 unigram_t *ugptr; 00094 bigram_t *bgptr; 00095 trigram_t *tgptr; 00096 char *tmp_word_str; 00097 char *map_base = NULL; 00098 size_t offset = 0; 00099 00100 base = NULL; 00101 do_mmap = FALSE; 00102 if (config) 00103 do_mmap = cmd_ln_boolean_r(config, "-mmap"); 00104 00105 if ((fp = fopen_comp(file_name, "rb", &is_pipe)) == NULL) { 00106 E_ERROR("Dump file %s not found\n", file_name); 00107 goto error_out; 00108 } 00109 00110 if (is_pipe && do_mmap) { 00111 E_WARN("Dump file is compressed, will not use memory-mapped I/O\n"); 00112 do_mmap = 0; 00113 } 00114 00115 do_swap = FALSE; 00116 if (fread(&k, sizeof(k), 1, fp) != 1) 00117 goto error_out; 00118 if (k != strlen(darpa_hdr)+1) { 00119 SWAP_INT32(&k); 00120 if (k != strlen(darpa_hdr)+1) { 00121 E_ERROR("Wrong magic header size number %x: %s is not a dump file\n", k, file_name); 00122 goto error_out; 00123 } 00124 do_swap = 1; 00125 } 00126 if (fread(str, 1, k, fp) != (size_t) k) { 00127 E_ERROR("Cannot read header\n"); 00128 goto error_out; 00129 } 00130 if (strncmp(str, darpa_hdr, k) != 0) { 00131 E_ERROR("Wrong header %s: %s is not a dump file\n", darpa_hdr); 00132 goto error_out; 00133 } 00134 00135 if (do_mmap) { 00136 if (do_swap) { 00137 E_INFO 00138 ("Byteswapping required, will not use memory-mapped I/O for LM file\n"); 00139 do_mmap = 0; 00140 } 00141 else { 00142 E_INFO("Will use memory-mapped I/O for LM file\n"); 00143 #ifdef __ADSPBLACKFIN__ /* This is true for both VisualDSP++ and uClinux. */ 00144 E_FATAL("memory mapping is not supported at the moment."); 00145 #else 00146 #endif 00147 } 00148 } 00149 00150 if (fread(&k, sizeof(k), 1, fp) != 1) 00151 goto error_out; 00152 if (do_swap) SWAP_INT32(&k); 00153 if (fread(str, 1, k, fp) != (size_t) k) { 00154 E_ERROR("Cannot read LM filename in header\n"); 00155 goto error_out; 00156 } 00157 00158 /* read version#, if present (must be <= 0) */ 00159 if (fread(&vn, sizeof(vn), 1, fp) != 1) 00160 goto error_out; 00161 if (do_swap) SWAP_INT32(&vn); 00162 if (vn <= 0) { 00163 /* read and don't compare timestamps (we don't care) */ 00164 if (fread(&ts, sizeof(ts), 1, fp) != 1) 00165 goto error_out; 00166 if (do_swap) SWAP_INT32(&ts); 00167 00168 /* read and skip format description */ 00169 for (;;) { 00170 if (fread(&k, sizeof(k), 1, fp) != 1) 00171 goto error_out; 00172 if (do_swap) SWAP_INT32(&k); 00173 if (k == 0) 00174 break; 00175 if (fread(str, 1, k, fp) != (size_t) k) { 00176 E_ERROR("Failed to read word\n"); 00177 goto error_out; 00178 } 00179 } 00180 /* read model->ucount */ 00181 if (fread(&n_unigram, sizeof(n_unigram), 1, fp) != 1) 00182 goto error_out; 00183 if (do_swap) SWAP_INT32(&n_unigram); 00184 } 00185 else { 00186 n_unigram = vn; 00187 } 00188 00189 /* read model->bcount, tcount */ 00190 if (fread(&n_bigram, sizeof(n_bigram), 1, fp) != 1) 00191 goto error_out; 00192 if (do_swap) SWAP_INT32(&n_bigram); 00193 if (fread(&n_trigram, sizeof(n_trigram), 1, fp) != 1) 00194 goto error_out; 00195 if (do_swap) SWAP_INT32(&n_trigram); 00196 E_INFO("ngrams 1=%d, 2=%d, 3=%d\n", n_unigram, n_bigram, n_trigram); 00197 00198 /* Allocate space for LM, including initial OOVs and placeholders; initialize it */ 00199 model = ckd_calloc(1, sizeof(*model)); 00200 base = &model->base; 00201 if (n_trigram > 0) 00202 n = 3; 00203 else if (n_bigram > 0) 00204 n = 2; 00205 else 00206 n = 1; 00207 ngram_model_init(base, &ngram_model_dmp_funcs, lmath, n, n_unigram); 00208 base->n_counts[0] = n_unigram; 00209 base->n_counts[1] = n_bigram; 00210 base->n_counts[2] = n_trigram; 00211 00212 /* read unigrams (always in memory, as they contain dictionary 00213 * mappings that can't be precomputed, and also could have OOVs added) */ 00214 model->lm3g.unigrams = new_unigram_table(n_unigram + 1); 00215 ugptr = model->lm3g.unigrams; 00216 for (i = 0; i <= n_unigram; ++i) { 00217 /* Skip over the mapping ID, we don't care about it. */ 00218 if (fread(ugptr, sizeof(int32), 1, fp) != 1) { 00219 E_ERROR("Failed to read maping id %d\n", i); 00220 goto error_out; 00221 } 00222 /* Read the actual unigram structure. */ 00223 if (fread(ugptr, sizeof(unigram_t), 1, fp) != 1) { 00224 E_ERROR("Failed to read unigrams data\n"); 00225 ngram_model_free(base); 00226 fclose_comp(fp, is_pipe); 00227 return NULL; 00228 } 00229 /* Byte swap if necessary. */ 00230 if (do_swap) { 00231 SWAP_INT32(&ugptr->prob1.l); 00232 SWAP_INT32(&ugptr->bo_wt1.l); 00233 SWAP_INT32(&ugptr->bigrams); 00234 } 00235 /* Convert values to log. */ 00236 ugptr->prob1.l = logmath_log10_to_log(lmath, ugptr->prob1.f); 00237 ugptr->bo_wt1.l = logmath_log10_to_log(lmath, ugptr->bo_wt1.f); 00238 E_DEBUG(2, ("ug %d: prob %d bo %d bigrams %d\n", 00239 i, ugptr->prob1.l, ugptr->bo_wt1.l, ugptr->bigrams)); 00240 ++ugptr; 00241 } 00242 E_INFO("%8d = LM.unigrams(+trailer) read\n", n_unigram); 00243 00244 /* Now mmap() the file and read in the rest of the (read-only) stuff. */ 00245 if (do_mmap) { 00246 offset = ftell(fp); 00247 00248 /* Check for improper word alignment. */ 00249 if (offset & 0x3) { 00250 E_WARN("-mmap specified, but trigram index is not word-aligned. Will not memory-map.\n"); 00251 do_mmap = FALSE; 00252 } 00253 else { 00254 model->dump_mmap = mmio_file_read(file_name); 00255 if (model->dump_mmap == NULL) { 00256 do_mmap = FALSE; 00257 } 00258 else { 00259 map_base = mmio_file_ptr(model->dump_mmap); 00260 } 00261 } 00262 } 00263 00264 if (n_bigram > 0) { 00265 /* read bigrams */ 00266 if (do_mmap) { 00267 model->lm3g.bigrams = (bigram_t *) (map_base + offset); 00268 offset += (n_bigram + 1) * sizeof(bigram_t); 00269 } 00270 else { 00271 model->lm3g.bigrams = 00272 ckd_calloc(n_bigram + 1, sizeof(bigram_t)); 00273 if (fread(model->lm3g.bigrams, sizeof(bigram_t), n_bigram + 1, fp) 00274 != (size_t) n_bigram + 1) { 00275 E_ERROR("Failed to read bigrams data\n"); 00276 goto error_out; 00277 } 00278 if (do_swap) { 00279 for (i = 0, bgptr = model->lm3g.bigrams; i <= n_bigram; 00280 i++, bgptr++) { 00281 SWAP_INT16(&bgptr->wid); 00282 SWAP_INT16(&bgptr->prob2); 00283 SWAP_INT16(&bgptr->bo_wt2); 00284 SWAP_INT16(&bgptr->trigrams); 00285 } 00286 } 00287 } 00288 E_INFO("%8d = LM.bigrams(+trailer) read\n", n_bigram); 00289 } 00290 00291 /* read trigrams */ 00292 if (n_trigram > 0) { 00293 if (do_mmap) { 00294 model->lm3g.trigrams = (trigram_t *) (map_base + offset); 00295 offset += n_trigram * sizeof(trigram_t); 00296 } 00297 else { 00298 model->lm3g.trigrams = 00299 ckd_calloc(n_trigram, sizeof(trigram_t)); 00300 if (fread 00301 (model->lm3g.trigrams, sizeof(trigram_t), n_trigram, fp) 00302 != (size_t) n_trigram) { 00303 E_ERROR("Failed to read trigrams data\n"); 00304 goto error_out; 00305 } 00306 if (do_swap) { 00307 for (i = 0, tgptr = model->lm3g.trigrams; i < n_trigram; 00308 i++, tgptr++) { 00309 SWAP_INT16(&tgptr->wid); 00310 SWAP_INT16(&tgptr->prob3); 00311 } 00312 } 00313 } 00314 E_INFO("%8d = LM.trigrams read\n", n_trigram); 00315 /* Initialize tginfo */ 00316 model->lm3g.tginfo = ckd_calloc(n_unigram, sizeof(tginfo_t *)); 00317 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); 00318 } 00319 00320 if (n_bigram > 0) { 00321 /* read n_prob2 and prob2 array (in memory) */ 00322 if (do_mmap) 00323 fseek(fp, offset, SEEK_SET); 00324 if (fread(&k, sizeof(k), 1, fp) != 1) 00325 goto error_out; 00326 if (do_swap) SWAP_INT32(&k); 00327 model->lm3g.n_prob2 = k; 00328 model->lm3g.prob2 = ckd_calloc(k, sizeof(*model->lm3g.prob2)); 00329 if (fread(model->lm3g.prob2, sizeof(*model->lm3g.prob2), k, fp) != (size_t) k) { 00330 E_ERROR("fread(prob2) failed\n"); 00331 goto error_out; 00332 } 00333 for (i = 0; i < k; i++) { 00334 if (do_swap) 00335 SWAP_INT32(&model->lm3g.prob2[i].l); 00336 /* Convert values to log. */ 00337 model->lm3g.prob2[i].l = logmath_log10_to_log(lmath, model->lm3g.prob2[i].f); 00338 } 00339 E_INFO("%8d = LM.prob2 entries read\n", k); 00340 } 00341 00342 /* read n_bo_wt2 and bo_wt2 array (in memory) */ 00343 if (base->n > 2) { 00344 if (fread(&k, sizeof(k), 1, fp) != 1) 00345 goto error_out; 00346 if (do_swap) SWAP_INT32(&k); 00347 model->lm3g.n_bo_wt2 = k; 00348 model->lm3g.bo_wt2 = ckd_calloc(k, sizeof(*model->lm3g.bo_wt2)); 00349 if (fread(model->lm3g.bo_wt2, sizeof(*model->lm3g.bo_wt2), k, fp) != (size_t) k) { 00350 E_ERROR("Failed to read backoff weights\n"); 00351 goto error_out; 00352 } 00353 for (i = 0; i < k; i++) { 00354 if (do_swap) 00355 SWAP_INT32(&model->lm3g.bo_wt2[i].l); 00356 /* Convert values to log. */ 00357 model->lm3g.bo_wt2[i].l = logmath_log10_to_log(lmath, model->lm3g.bo_wt2[i].f); 00358 } 00359 E_INFO("%8d = LM.bo_wt2 entries read\n", k); 00360 } 00361 00362 /* read n_prob3 and prob3 array (in memory) */ 00363 if (base->n > 2) { 00364 if (fread(&k, sizeof(k), 1, fp) != 1) 00365 goto error_out; 00366 if (do_swap) SWAP_INT32(&k); 00367 model->lm3g.n_prob3 = k; 00368 model->lm3g.prob3 = ckd_calloc(k, sizeof(*model->lm3g.prob3)); 00369 if (fread(model->lm3g.prob3, sizeof(*model->lm3g.prob3), k, fp) != (size_t) k) { 00370 E_ERROR("Failed to read trigram probability\n"); 00371 goto error_out; 00372 } 00373 for (i = 0; i < k; i++) { 00374 if (do_swap) 00375 SWAP_INT32(&model->lm3g.prob3[i].l); 00376 /* Convert values to log. */ 00377 model->lm3g.prob3[i].l = logmath_log10_to_log(lmath, model->lm3g.prob3[i].f); 00378 } 00379 E_INFO("%8d = LM.prob3 entries read\n", k); 00380 } 00381 00382 /* read tseg_base size and tseg_base */ 00383 if (do_mmap) 00384 offset = ftell(fp); 00385 if (n_trigram > 0) { 00386 if (do_mmap) { 00387 memcpy(&k, map_base + offset, sizeof(k)); 00388 offset += sizeof(int32); 00389 model->lm3g.tseg_base = (int32 *) (map_base + offset); 00390 offset += k * sizeof(int32); 00391 } 00392 else { 00393 k = (n_bigram + 1) / BG_SEG_SZ + 1; 00394 if (fread(&k, sizeof(k), 1, fp) != 1) 00395 goto error_out; 00396 if (do_swap) SWAP_INT32(&k); 00397 model->lm3g.tseg_base = ckd_calloc(k, sizeof(int32)); 00398 if (fread(model->lm3g.tseg_base, sizeof(int32), k, fp) != 00399 (size_t) k) { 00400 E_ERROR("Failed to read trigram index\n"); 00401 goto error_out; 00402 } 00403 if (do_swap) 00404 for (i = 0; i < k; i++) 00405 SWAP_INT32(&model->lm3g.tseg_base[i]); 00406 } 00407 E_INFO("%8d = LM.tseg_base entries read\n", k); 00408 } 00409 00410 /* read ascii word strings */ 00411 if (do_mmap) { 00412 memcpy(&k, map_base + offset, sizeof(k)); 00413 offset += sizeof(int32); 00414 tmp_word_str = (char *) (map_base + offset); 00415 offset += k; 00416 } 00417 else { 00418 base->writable = TRUE; 00419 if (fread(&k, sizeof(k), 1, fp) != 1) 00420 goto error_out; 00421 if (do_swap) SWAP_INT32(&k); 00422 tmp_word_str = ckd_calloc(k, 1); 00423 if (fread(tmp_word_str, 1, k, fp) != (size_t) k) { 00424 E_ERROR("Failed to read words\n"); 00425 goto error_out; 00426 } 00427 } 00428 00429 /* First make sure string just read contains n_counts[0] words (PARANOIA!!) */ 00430 for (i = 0, j = 0; i < k; i++) 00431 if (tmp_word_str[i] == '\0') 00432 j++; 00433 if (j != n_unigram) { 00434 E_ERROR("Error reading word strings (%d doesn't match n_unigrams %d)\n", 00435 j, n_unigram); 00436 goto error_out; 00437 } 00438 00439 /* Break up string just read into words */ 00440 if (do_mmap) { 00441 j = 0; 00442 for (i = 0; i < n_unigram; i++) { 00443 base->word_str[i] = tmp_word_str + j; 00444 if (hash_table_enter(base->wid, base->word_str[i], 00445 (void *)(long)i) != (void *)(long)i) { 00446 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); 00447 } 00448 j += strlen(base->word_str[i]) + 1; 00449 } 00450 } 00451 else { 00452 j = 0; 00453 for (i = 0; i < n_unigram; i++) { 00454 base->word_str[i] = ckd_salloc(tmp_word_str + j); 00455 if (hash_table_enter(base->wid, base->word_str[i], 00456 (void *)(long)i) != (void *)(long)i) { 00457 E_WARN("Duplicate word in dictionary: %s\n", base->word_str[i]); 00458 } 00459 j += strlen(base->word_str[i]) + 1; 00460 } 00461 free(tmp_word_str); 00462 } 00463 E_INFO("%8d = ascii word strings read\n", i); 00464 00465 fclose_comp(fp, is_pipe); 00466 return base; 00467 00468 error_out: 00469 if (fp) 00470 fclose_comp(fp, is_pipe); 00471 ngram_model_free(base); 00472 return NULL; 00473 } 00474 00475 ngram_model_dmp_t * 00476 ngram_model_dmp_build(ngram_model_t *base) 00477 { 00478 ngram_model_dmp_t *model; 00479 ngram_model_t *newbase; 00480 ngram_iter_t *itor; 00481 sorted_list_t sorted_prob2; 00482 sorted_list_t sorted_bo_wt2; 00483 sorted_list_t sorted_prob3; 00484 bigram_t *bgptr; 00485 trigram_t *tgptr; 00486 int i, bgcount, tgcount, seg; 00487 00488 if (base->funcs == &ngram_model_dmp_funcs) { 00489 E_INFO("Using existing DMP model.\n"); 00490 return (ngram_model_dmp_t *)ngram_model_retain(base); 00491 } 00492 00493 /* Initialize new base model structure with params from base. */ 00494 E_INFO("Building DMP model...\n"); 00495 model = ckd_calloc(1, sizeof(*model)); 00496 newbase = &model->base; 00497 ngram_model_init(newbase, &ngram_model_dmp_funcs, 00498 logmath_retain(base->lmath), 00499 base->n, base->n_counts[0]); 00500 /* Copy N-gram counts over. */ 00501 memcpy(newbase->n_counts, base->n_counts, 00502 base->n * sizeof(*base->n_counts)); 00503 /* Make sure word strings are freed. */ 00504 newbase->writable = TRUE; 00505 /* Initialize unigram table and string table. */ 00506 model->lm3g.unigrams = new_unigram_table(newbase->n_counts[0] + 1); 00507 for (itor = ngram_model_mgrams(base, 0); itor; 00508 itor = ngram_iter_next(itor)) { 00509 int32 prob1, bo_wt1; 00510 int32 const *wids; 00511 00512 /* Can't guarantee they will go in unigram order, so just to 00513 * be correct, we do this... */ 00514 wids = ngram_iter_get(itor, &prob1, &bo_wt1); 00515 model->lm3g.unigrams[wids[0]].prob1.l = prob1; 00516 model->lm3g.unigrams[wids[0]].bo_wt1.l = bo_wt1; 00517 newbase->word_str[wids[0]] = ckd_salloc(ngram_word(base, wids[0])); 00518 if ((hash_table_enter_int32(newbase->wid, 00519 newbase->word_str[wids[0]], wids[0])) 00520 != wids[0]) { 00521 E_WARN("Duplicate word in dictionary: %s\n", newbase->word_str[wids[0]]); 00522 } 00523 } 00524 E_INFO("%8d = #unigrams created\n", newbase->n_counts[0]); 00525 00526 if (newbase->n < 2) 00527 return model; 00528 00529 /* Construct quantized probability table for bigrams and 00530 * (optionally) trigrams. Hesitate to use the "sorted list" thing 00531 * since it isn't so useful, but it's there already. */ 00532 init_sorted_list(&sorted_prob2); 00533 if (newbase->n > 2) { 00534 init_sorted_list(&sorted_bo_wt2); 00535 init_sorted_list(&sorted_prob3); 00536 } 00537 /* Construct bigram and trigram arrays. */ 00538 bgptr = model->lm3g.bigrams = ckd_calloc(newbase->n_counts[1] + 1, sizeof(bigram_t)); 00539 if (newbase->n > 2) { 00540 tgptr = model->lm3g.trigrams = ckd_calloc(newbase->n_counts[2], sizeof(trigram_t)); 00541 model->lm3g.tseg_base = 00542 ckd_calloc((newbase->n_counts[1] + 1) / BG_SEG_SZ + 1, sizeof(int32)); 00543 } 00544 else 00545 tgptr = NULL; 00546 /* Since bigrams and trigrams have to be contiguous with others 00547 * with the same N-1-gram, we traverse them in depth-first order 00548 * to build the bigram and trigram arrays. */ 00549 for (i = 0; i < newbase->n_counts[0]; ++i) { 00550 ngram_iter_t *uitor; 00551 bgcount = bgptr - model->lm3g.bigrams; 00552 /* First bigram index (same as next if no bigrams...) */ 00553 model->lm3g.unigrams[i].bigrams = bgcount; 00554 E_DEBUG(2, ("unigram %d: %s => bigram %d\n", i, newbase->word_str[i], bgcount)); 00555 /* All bigrams corresponding to unigram i */ 00556 uitor = ngram_ng_iter(base, i, NULL, 0); 00557 for (itor = ngram_iter_successors(uitor); 00558 itor; ++bgptr, itor = ngram_iter_next(itor)) { 00559 int32 prob2, bo_wt2; 00560 int32 const *wids; 00561 ngram_iter_t *titor; 00562 00563 wids = ngram_iter_get(itor, &prob2, &bo_wt2); 00564 00565 assert (bgptr - model->lm3g.bigrams < newbase->n_counts[1]); 00566 00567 bgptr->wid = wids[1]; 00568 bgptr->prob2 = sorted_id(&sorted_prob2, &prob2); 00569 if (newbase->n > 2) { 00570 tgcount = (tgptr - model->lm3g.trigrams); 00571 bgcount = (bgptr - model->lm3g.bigrams); 00572 00573 /* Backoff weight (only if there are trigrams...) */ 00574 bgptr->bo_wt2 = sorted_id(&sorted_bo_wt2, &bo_wt2); 00575 00576 /* Find bigram segment for this bigram (this isn't 00577 * used unless there are trigrams) */ 00578 seg = bgcount >> LOG_BG_SEG_SZ; 00579 /* If we just crossed a bigram segment boundary, then 00580 * point tseg_base for the new segment to the current 00581 * trigram pointer. */ 00582 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) 00583 model->lm3g.tseg_base[seg] = tgcount; 00584 /* Now calculate the trigram offset. */ 00585 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; 00586 E_DEBUG(2, ("bigram %d %s %s => trigram %d:%d\n", 00587 bgcount, 00588 newbase->word_str[wids[0]], 00589 newbase->word_str[wids[1]], 00590 seg, bgptr->trigrams)); 00591 00592 /* And fill in successors' trigram info. */ 00593 for (titor = ngram_iter_successors(itor); 00594 titor; ++tgptr, titor = ngram_iter_next(titor)) { 00595 int32 prob3, dummy; 00596 00597 assert(tgptr - model->lm3g.trigrams < newbase->n_counts[2]); 00598 wids = ngram_iter_get(titor, &prob3, &dummy); 00599 tgptr->wid = wids[2]; 00600 tgptr->prob3 = sorted_id(&sorted_prob3, &prob3); 00601 E_DEBUG(2, ("trigram %d %s %s %s => prob %d\n", 00602 tgcount, 00603 newbase->word_str[wids[0]], 00604 newbase->word_str[wids[1]], 00605 newbase->word_str[wids[2]], 00606 tgptr->prob3)); 00607 } 00608 } 00609 } 00610 ngram_iter_free(uitor); 00611 } 00612 /* Add sentinal unigram and bigram records. */ 00613 bgcount = bgptr - model->lm3g.bigrams; 00614 tgcount = tgptr - model->lm3g.trigrams; 00615 seg = bgcount >> LOG_BG_SEG_SZ; 00616 if (seg != (bgcount - 1) >> LOG_BG_SEG_SZ) 00617 model->lm3g.tseg_base[seg] = tgcount; 00618 model->lm3g.unigrams[i].bigrams = bgcount; 00619 if (newbase->n > 2) 00620 bgptr->trigrams = tgcount - model->lm3g.tseg_base[seg]; 00621 00622 /* Now create probability tables. */ 00623 model->lm3g.n_prob2 = sorted_prob2.free; 00624 model->lm3g.prob2 = vals_in_sorted_list(&sorted_prob2); 00625 E_INFO("%8d = #bigrams created\n", newbase->n_counts[1]); 00626 E_INFO("%8d = #prob2 entries\n", model->lm3g.n_prob2); 00627 free_sorted_list(&sorted_prob2); 00628 if (newbase->n > 2) { 00629 /* Create trigram bo-wts array. */ 00630 model->lm3g.n_bo_wt2 = sorted_bo_wt2.free; 00631 model->lm3g.bo_wt2 = vals_in_sorted_list(&sorted_bo_wt2); 00632 free_sorted_list(&sorted_bo_wt2); 00633 E_INFO("%8d = #bo_wt2 entries\n", model->lm3g.n_bo_wt2); 00634 /* Create trigram probability table. */ 00635 model->lm3g.n_prob3 = sorted_prob3.free; 00636 model->lm3g.prob3 = vals_in_sorted_list(&sorted_prob3); 00637 E_INFO("%8d = #trigrams created\n", newbase->n_counts[2]); 00638 E_INFO("%8d = #prob3 entries\n", model->lm3g.n_prob3); 00639 free_sorted_list(&sorted_prob3); 00640 /* Initialize tginfo */ 00641 model->lm3g.tginfo = ckd_calloc(newbase->n_counts[0], sizeof(tginfo_t *)); 00642 model->lm3g.le = listelem_alloc_init(sizeof(tginfo_t)); 00643 } 00644 00645 return model; 00646 } 00647 00648 static void 00649 fwrite_int32(FILE *fh, int32 val) 00650 { 00651 fwrite(&val, 4, 1, fh); 00652 } 00653 00654 static void 00655 fwrite_ug(FILE *fh, unigram_t *ug, logmath_t *lmath) 00656 { 00657 int32 bogus = -1; 00658 float32 log10val; 00659 00660 /* Bogus dictionary mapping field. */ 00661 fwrite(&bogus, 4, 1, fh); 00662 /* Convert values to log10. */ 00663 log10val = logmath_log_to_log10(lmath, ug->prob1.l); 00664 fwrite(&log10val, 4, 1, fh); 00665 log10val = logmath_log_to_log10(lmath, ug->bo_wt1.l); 00666 fwrite(&log10val, 4, 1, fh); 00667 fwrite_int32(fh, ug->bigrams); 00668 } 00669 00670 static void 00671 fwrite_bg(FILE *fh, bigram_t *bg) 00672 { 00673 fwrite(bg, sizeof(*bg), 1, fh); 00674 } 00675 00676 static void 00677 fwrite_tg(FILE *fh, trigram_t *tg) 00678 { 00679 fwrite(tg, sizeof(*tg), 1, fh); 00680 } 00681 00684 static char const *fmtdesc[] = { 00685 "BEGIN FILE FORMAT DESCRIPTION", 00686 "Header string length (int32) and string (including trailing 0)", 00687 "Original LM filename string-length (int32) and filename (including trailing 0)", 00688 "(int32) version number (present iff value <= 0)", 00689 "(int32) original LM file modification timestamp (iff version# present)", 00690 "(int32) string-length and string (including trailing 0) (iff version# present)", 00691 "... previous entry continued any number of times (iff version# present)", 00692 "(int32) 0 (terminating sequence of strings) (iff version# present)", 00693 "(int32) log_bg_seg_sz (present iff different from default value of LOG2_BG_SEG_SZ)", 00694 "(int32) lm_t.ucount (must be > 0)", 00695 "(int32) lm_t.bcount", 00696 "(int32) lm_t.tcount", 00697 "lm_t.ucount+1 unigrams (including sentinel)", 00698 "lm_t.bcount+1 bigrams (including sentinel 64 bits (bg_t) each if version=-1/-2, 128 bits (bg32_t) each if version=-3", 00699 "lm_t.tcount trigrams (present iff lm_t.tcount > 0 32 bits (tg_t) each if version=-1/-2, 64 bits (tg32_t) each if version=-3)", 00700 "(int32) lm_t.n_prob2", 00701 "(int32) lm_t.prob2[]", 00702 "(int32) lm_t.n_bo_wt2 (present iff lm_t.tcount > 0)", 00703 "(int32) lm_t.bo_wt2[] (present iff lm_t.tcount > 0)", 00704 "(int32) lm_t.n_prob3 (present iff lm_t.tcount > 0)", 00705 "(int32) lm_t.prob3[] (present iff lm_t.tcount > 0)", 00706 "(int32) (lm_t.bcount+1)/BG_SEG_SZ+1 (present iff lm_t.tcount > 0)", 00707 "(int32) lm_t.tseg_base[] (present iff lm_t.tcount > 0)", 00708 "(int32) Sum(all word string-lengths, including trailing 0 for each)", 00709 "All word strings (including trailing 0 for each)", 00710 "END FILE FORMAT DESCRIPTION", 00711 NULL, 00712 }; 00713 00714 static void 00715 ngram_model_dmp_write_header(FILE * fh) 00716 { 00717 int32 k; 00718 k = strlen(darpa_hdr) + 1; 00719 fwrite_int32(fh, k); 00720 fwrite(darpa_hdr, 1, k, fh); 00721 } 00722 00723 static void 00724 ngram_model_dmp_write_lm_filename(FILE * fh, const char *lmfile) 00725 { 00726 int32 k; 00727 00728 k = strlen(lmfile) + 1; 00729 fwrite_int32(fh, k); 00730 fwrite(lmfile, 1, k, fh); 00731 } 00732 00733 #define LMDMP_VERSION_TG_16BIT -1 00737 static void 00738 ngram_model_dmp_write_version(FILE * fh, int32 mtime) 00739 { 00740 fwrite_int32(fh, LMDMP_VERSION_TG_16BIT); /* version # */ 00741 fwrite_int32(fh, mtime); 00742 } 00743 00744 static void 00745 ngram_model_dmp_write_ngram_counts(FILE * fh, ngram_model_t *model) 00746 { 00747 fwrite_int32(fh, model->n_counts[0]); 00748 fwrite_int32(fh, model->n_counts[1]); 00749 fwrite_int32(fh, model->n_counts[2]); 00750 } 00751 00752 static void 00753 ngram_model_dmp_write_fmtdesc(FILE * fh) 00754 { 00755 int32 i, k; 00756 long pos; 00757 00758 /* Write file format description into header */ 00759 for (i = 0; fmtdesc[i] != NULL; i++) { 00760 k = strlen(fmtdesc[i]) + 1; 00761 fwrite_int32(fh, k); 00762 fwrite(fmtdesc[i], 1, k, fh); 00763 } 00764 /* Pad it out in order to achieve 32-bit alignment */ 00765 pos = ftell(fh); 00766 k = pos & 3; 00767 if (k) { 00768 fwrite_int32(fh, 4-k); 00769 fwrite("!!!!", 1, 4-k, fh); 00770 } 00771 fwrite_int32(fh, 0); 00772 } 00773 00774 static void 00775 ngram_model_dmp_write_unigram(FILE *fh, ngram_model_t *model) 00776 { 00777 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00778 int32 i; 00779 00780 for (i = 0; i <= model->n_counts[0]; i++) { 00781 fwrite_ug(fh, &(lm->lm3g.unigrams[i]), model->lmath); 00782 } 00783 } 00784 00785 00786 static void 00787 ngram_model_dmp_write_bigram(FILE *fh, ngram_model_t *model) 00788 { 00789 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00790 int32 i; 00791 00792 for (i = 0; i <= model->n_counts[1]; i++) { 00793 fwrite_bg(fh, &(lm->lm3g.bigrams[i])); 00794 } 00795 00796 } 00797 00798 static void 00799 ngram_model_dmp_write_trigram(FILE *fh, ngram_model_t *model) 00800 { 00801 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00802 int32 i; 00803 00804 for (i = 0; i < model->n_counts[2]; i++) { 00805 fwrite_tg(fh, &(lm->lm3g.trigrams[i])); 00806 } 00807 } 00808 00809 static void 00810 ngram_model_dmp_write_bgprob(FILE *fh, ngram_model_t *model) 00811 { 00812 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00813 int32 i; 00814 00815 fwrite_int32(fh, lm->lm3g.n_prob2); 00816 for (i = 0; i < lm->lm3g.n_prob2; i++) { 00817 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob2[i].l); 00818 fwrite(&log10val, 4, 1, fh); 00819 } 00820 } 00821 00822 static void 00823 ngram_model_dmp_write_tgbowt(FILE *fh, ngram_model_t *model) 00824 { 00825 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00826 int32 i; 00827 00828 fwrite_int32(fh, lm->lm3g.n_bo_wt2); 00829 for (i = 0; i < lm->lm3g.n_bo_wt2; i++) { 00830 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.bo_wt2[i].l); 00831 fwrite(&log10val, 4, 1, fh); 00832 } 00833 } 00834 00835 static void 00836 ngram_model_dmp_write_tgprob(FILE *fh, ngram_model_t *model) 00837 { 00838 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00839 int32 i; 00840 00841 fwrite_int32(fh, lm->lm3g.n_prob3); 00842 for (i = 0; i < lm->lm3g.n_prob3; i++) { 00843 float32 log10val = logmath_log_to_log10(model->lmath, lm->lm3g.prob3[i].l); 00844 fwrite(&log10val, 4, 1, fh); 00845 } 00846 } 00847 00848 static void 00849 ngram_model_dmp_write_tg_segbase(FILE *fh, ngram_model_t *model) 00850 { 00851 ngram_model_dmp_t *lm = (ngram_model_dmp_t *)model; 00852 int32 i, k; 00853 00854 k = (model->n_counts[1] + 1) / BG_SEG_SZ + 1; 00855 fwrite_int32(fh, k); 00856 for (i = 0; i < k; i++) 00857 fwrite_int32(fh, lm->lm3g.tseg_base[i]); 00858 } 00859 00860 static void 00861 ngram_model_dmp_write_wordstr(FILE *fh, ngram_model_t *model) 00862 { 00863 int32 i, k; 00864 00865 k = 0; 00866 for (i = 0; i < model->n_counts[0]; i++) 00867 k += strlen(model->word_str[i]) + 1; 00868 fwrite_int32(fh, k); 00869 for (i = 0; i < model->n_counts[0]; i++) 00870 fwrite(model->word_str[i], 1, 00871 strlen(model->word_str[i]) + 1, fh); 00872 } 00873 00874 int 00875 ngram_model_dmp_write(ngram_model_t *base, 00876 const char *file_name) 00877 { 00878 ngram_model_dmp_t *model; 00879 ngram_model_t *newbase; 00880 FILE *fh; 00881 00882 /* First, construct a DMP model from the base model. */ 00883 model = ngram_model_dmp_build(base); 00884 newbase = &model->base; 00885 00886 /* Now write it, confident in the knowledge that it's the right 00887 * kind of language model internally. */ 00888 if ((fh = fopen(file_name, "wb")) == NULL) { 00889 E_ERROR("Cannot create file %s\n", file_name); 00890 return -1; 00891 } 00892 ngram_model_dmp_write_header(fh); 00893 ngram_model_dmp_write_lm_filename(fh, file_name); 00894 ngram_model_dmp_write_version(fh, 0); 00895 ngram_model_dmp_write_fmtdesc(fh); 00896 ngram_model_dmp_write_ngram_counts(fh, newbase); 00897 ngram_model_dmp_write_unigram(fh, newbase); 00898 if (newbase->n > 1) { 00899 ngram_model_dmp_write_bigram(fh, newbase); 00900 if (newbase->n > 2) { 00901 ngram_model_dmp_write_trigram(fh, newbase); 00902 } 00903 ngram_model_dmp_write_bgprob(fh, newbase); 00904 if (newbase->n > 2) { 00905 ngram_model_dmp_write_tgbowt(fh, newbase); 00906 ngram_model_dmp_write_tgprob(fh, newbase); 00907 ngram_model_dmp_write_tg_segbase(fh, newbase); 00908 } 00909 } 00910 ngram_model_dmp_write_wordstr(fh, newbase); 00911 ngram_model_free(newbase); 00912 00913 return fclose(fh); 00914 } 00915 00916 static int 00917 ngram_model_dmp_apply_weights(ngram_model_t *base, float32 lw, 00918 float32 wip, float32 uw) 00919 { 00920 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; 00921 lm3g_apply_weights(base, &model->lm3g, lw, wip, uw); 00922 return 0; 00923 } 00924 00925 /* Lousy "templating" for things that are largely the same in DMP and 00926 * ARPA models, except for the bigram and trigram types and some 00927 * names. */ 00928 #define NGRAM_MODEL_TYPE ngram_model_dmp_t 00929 #include "lm3g_templates.c" 00930 00931 static void 00932 ngram_model_dmp_free(ngram_model_t *base) 00933 { 00934 ngram_model_dmp_t *model = (ngram_model_dmp_t *)base; 00935 00936 ckd_free(model->lm3g.unigrams); 00937 ckd_free(model->lm3g.prob2); 00938 if (model->dump_mmap) { 00939 mmio_file_unmap(model->dump_mmap); 00940 } 00941 else { 00942 ckd_free(model->lm3g.bigrams); 00943 if (base->n > 2) { 00944 ckd_free(model->lm3g.trigrams); 00945 ckd_free(model->lm3g.tseg_base); 00946 } 00947 } 00948 if (base->n > 2) { 00949 ckd_free(model->lm3g.bo_wt2); 00950 ckd_free(model->lm3g.prob3); 00951 } 00952 00953 lm3g_tginfo_free(base, &model->lm3g); 00954 } 00955 00956 static ngram_funcs_t ngram_model_dmp_funcs = { 00957 ngram_model_dmp_free, /* free */ 00958 ngram_model_dmp_apply_weights, /* apply_weights */ 00959 lm3g_template_score, /* score */ 00960 lm3g_template_raw_score, /* raw_score */ 00961 lm3g_template_add_ug, /* add_ug */ 00962 lm3g_template_flush, /* flush */ 00963 lm3g_template_iter, /* iter */ 00964 lm3g_template_mgrams, /* mgrams */ 00965 lm3g_template_successors, /* successors */ 00966 lm3g_template_iter_get, /* iter_get */ 00967 lm3g_template_iter_next, /* iter_next */ 00968 lm3g_template_iter_free /* iter_free */ 00969 };