SphinxBase 0.6
/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* ====================================================================
 * Copyright (c) 1999-2007 Carnegie Mellon University.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * This work was supported in part by funding from the Defense Advanced
 * Research Projects Agency and the National Science Foundation of the
 * United States of America, and the CMU Sphinx Speech Consortium.
 *
 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * ====================================================================
 *
 */
/**
 * \file ngram_model.c N-Gram language models.
 *
 * Author: David Huggins-Daines, much code taken from sphinx3/src/libs3decoder/liblm
 */

#include <config.h>

#include <string.h>
#include <assert.h>
#include <errno.h>              /* errno is tested in the iconv code below. */

#ifdef HAVE_ICONV
#include <iconv.h>
#endif

#include "sphinxbase/ngram_model.h"
#include "sphinxbase/ckd_alloc.h"
#include "sphinxbase/filename.h"
#include "sphinxbase/pio.h"
#include "sphinxbase/err.h"
#include "sphinxbase/logmath.h"
#include "sphinxbase/strfuncs.h"
#include "sphinxbase/case.h"

#include "ngram_model_internal.h"

ngram_file_type_t
ngram_file_name_to_type(const char *file_name)
{
    const char *ext;

    ext = strrchr(file_name, '.');
    if (ext == NULL) {
        return NGRAM_INVALID;
    }
    if (0 == strcmp_nocase(ext, ".gz")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    else if (0 == strcmp_nocase(ext, ".bz2")) {
        while (--ext >= file_name) {
            if (*ext == '.') break;
        }
        if (ext < file_name) {
            return NGRAM_INVALID;
        }
    }
    /* We use strncmp because there might be a .gz on the end. */
    if (0 == strncmp_nocase(ext, ".ARPA", 5))
        return NGRAM_ARPA;
    if (0 == strncmp_nocase(ext, ".DMP", 4))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}
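/* Illustrative sketch of how the extension heuristic above behaves;
 * the file names are hypothetical:
 *
 *   ngram_file_name_to_type("en.arpa")    => NGRAM_ARPA
 *   ngram_file_name_to_type("en.DMP.gz")  => NGRAM_DMP  (the .gz is skipped)
 *   ngram_file_name_to_type("en.lm")      => NGRAM_INVALID
 */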
ngram_file_type_t
ngram_str_to_type(const char *str_name)
{
    if (0 == strcmp_nocase(str_name, "arpa"))
        return NGRAM_ARPA;
    if (0 == strcmp_nocase(str_name, "dmp"))
        return NGRAM_DMP;
    return NGRAM_INVALID;
}

char const *
ngram_type_to_str(int type)
{
    switch (type) {
    case NGRAM_ARPA:
        return "arpa";
    case NGRAM_DMP:
        return "dmp";
    default:
        return NULL;
    }
}


ngram_model_t *
ngram_model_read(cmd_ln_t *config,
                 const char *file_name,
                 ngram_file_type_t file_type,
                 logmath_t *lmath)
{
    ngram_model_t *model = NULL;

    switch (file_type) {
    case NGRAM_AUTO: {
        if ((model = ngram_model_arpa_read(config, file_name, lmath)) != NULL)
            break;
        if ((model = ngram_model_dmp_read(config, file_name, lmath)) != NULL)
            break;
        return NULL;
    }
    case NGRAM_ARPA:
        model = ngram_model_arpa_read(config, file_name, lmath);
        break;
    case NGRAM_DMP:
        model = ngram_model_dmp_read(config, file_name, lmath);
        break;
    default:
        E_ERROR("language model file type not supported\n");
        return NULL;
    }

    /* Now set weights based on config if present. */
    if (config) {
        float32 lw = 1.0;
        float32 wip = 1.0;
        float32 uw = 1.0;

        if (cmd_ln_exists_r(config, "-lw"))
            lw = cmd_ln_float32_r(config, "-lw");
        if (cmd_ln_exists_r(config, "-wip"))
            wip = cmd_ln_float32_r(config, "-wip");
        if (cmd_ln_exists_r(config, "-uw"))
            uw = cmd_ln_float32_r(config, "-uw");

        ngram_model_apply_weights(model, lw, wip, uw);
    }

    return model;
}

int
ngram_model_write(ngram_model_t *model, const char *file_name,
                  ngram_file_type_t file_type)
{
    switch (file_type) {
    case NGRAM_AUTO: {
        file_type = ngram_file_name_to_type(file_name);
        /* Default to ARPA (catches .lm and other things) */
        if (file_type == NGRAM_INVALID)
            file_type = NGRAM_ARPA;
        return ngram_model_write(model, file_name, file_type);
    }
    case NGRAM_ARPA:
        return ngram_model_arpa_write(model, file_name);
    case NGRAM_DMP:
        return ngram_model_dmp_write(model, file_name);
    default:
        E_ERROR("language model file type not supported\n");
        return -1;
    }
}

int32
ngram_model_init(ngram_model_t *base,
                 ngram_funcs_t *funcs,
                 logmath_t *lmath,
                 int32 n, int32 n_unigram)
{
    base->refcount = 1;
    base->funcs = funcs;
    base->n = n;
    /* Allocate the N-gram count array (one entry per order) unless
     * this model was previously initialized. */
    if (base->n_counts == NULL)
        base->n_counts = ckd_calloc(n, sizeof(*base->n_counts));
    /* Don't reset weights if logmath object hasn't changed. */
    if (base->lmath != lmath) {
        /* Set default values for weights. */
        base->lw = 1.0;
        base->log_wip = 0;      /* i.e. 1.0 */
        base->log_uw = 0;       /* i.e. 1.0 */
        base->log_uniform = logmath_log(lmath, 1.0 / n_unigram);
        base->log_uniform_weight = logmath_get_zero(lmath);
        base->log_zero = logmath_get_zero(lmath);
        base->lmath = lmath;
    }
    /* Allocate or reallocate space for word strings. */
    if (base->word_str) {
        /* Free all previous word strings if they were allocated. */
        if (base->writable) {
            int32 i;
            for (i = 0; i < base->n_words; ++i) {
                ckd_free(base->word_str[i]);
                base->word_str[i] = NULL;
            }
        }
        base->word_str = ckd_realloc(base->word_str, n_unigram * sizeof(char *));
    }
    else
        base->word_str = ckd_calloc(n_unigram, sizeof(char *));
    /* NOTE: Word strings are no longer case-insensitive, since we are
     * allowing other encodings for them.  Beware. */
    if (base->wid)
        hash_table_empty(base->wid);
    else
        base->wid = hash_table_new(n_unigram, FALSE);
    base->n_counts[0] = base->n_1g_alloc = base->n_words = n_unigram;

    return 0;
}
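/* Illustrative usage sketch: reading a model with AUTO type detection
 * and applying explicit weights.  The file name and weight values are
 * hypothetical.
 *
 *   logmath_t *lmath = logmath_init(1.0001, 0, 0);
 *   ngram_model_t *lm =
 *       ngram_model_read(NULL, "en-us.lm.dmp", NGRAM_AUTO, lmath);
 *   if (lm != NULL)
 *       ngram_model_apply_weights(lm, 9.5, 0.5, 1.0);  (lw, wip, uw)
 *   ngram_model_free(lm);
 *   logmath_free(lmath);
 */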
ngram_model_t *
ngram_model_retain(ngram_model_t *model)
{
    ++model->refcount;
    return model;
}


void
ngram_model_flush(ngram_model_t *model)
{
    if (model->funcs && model->funcs->flush)
        (*model->funcs->flush)(model);
}

int
ngram_model_free(ngram_model_t *model)
{
    int i;

    if (model == NULL)
        return 0;
    if (--model->refcount > 0)
        return model->refcount;
    if (model->funcs && model->funcs->free)
        (*model->funcs->free)(model);
    if (model->writable) {
        /* Free all words. */
        for (i = 0; i < model->n_words; ++i) {
            ckd_free(model->word_str[i]);
        }
    }
    else {
        /* Free all class words. */
        for (i = 0; i < model->n_classes; ++i) {
            ngram_class_t *lmclass;
            int32 j;

            lmclass = model->classes[i];
            for (j = 0; j < lmclass->n_words; ++j) {
                ckd_free(model->word_str[lmclass->start_wid + j]);
            }
            for (j = 0; j < lmclass->n_hash; ++j) {
                if (lmclass->nword_hash[j].wid != -1) {
                    ckd_free(model->word_str[lmclass->nword_hash[j].wid]);
                }
            }
        }
    }
    for (i = 0; i < model->n_classes; ++i) {
        ngram_class_free(model->classes[i]);
    }
    ckd_free(model->classes);
    hash_table_free(model->wid);
    ckd_free(model->word_str);
    ckd_free(model->n_counts);
    ckd_free(model);
    return 0;
}

int
ngram_model_casefold(ngram_model_t *model, int kase)
{
    int writable, i;
    hash_table_t *new_wid;

    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;

    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        char *outstr;
        if (writable) {
            outstr = model->word_str[i];
        }
        else {
            outstr = ckd_salloc(model->word_str[i]);
        }
        /* Don't case-fold <tags> or [classes] */
        if (outstr[0] != '<' && outstr[0] != '[') {
            switch (kase) {
            case NGRAM_UPPER:
                ucase(outstr);
                break;
            case NGRAM_LOWER:
                lcase(outstr);
                break;
            default:
                ;
            }
        }
        model->word_str[i] = outstr;

        /* Now update the hash table.  We might have terrible
         * collisions here, so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;
    return 0;
}
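/* Illustrative usage sketch: force the whole vocabulary to lower case
 * after loading.
 *
 *   ngram_model_casefold(lm, NGRAM_LOWER);
 *
 * As the code above shows, words beginning with '<' or '[' (markers
 * and class tags such as <s> or [NOUN]) are left untouched.
 */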
#ifdef HAVE_ICONV
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    iconv_t ic;
    char *outbuf;
    size_t maxlen;
    int i, writable;
    hash_table_t *new_wid;

    /* FIXME: Need to do a special case thing for the GB-HEX encoding
     * used in Sphinx3 Mandarin models. */
    if ((ic = iconv_open(to, from)) == (iconv_t)-1) {
        E_ERROR_SYSTEM("iconv_open() failed");
        return -1;
    }
    /* iconv(3) is a piece of crap and won't accept a NULL out buffer,
     * unlike wcstombs(3).  So we have to either call it over and over
     * again until our buffer is big enough, or call it with a huge
     * buffer and then copy things back to the output.  We will use a
     * mix of these two approaches here.  We'll keep a single big
     * buffer around, and expand it as necessary.
     */
    maxlen = 0;
    for (i = 0; i < model->n_words; ++i) {
        if (strlen(model->word_str[i]) > maxlen)
            maxlen = strlen(model->word_str[i]);
    }
    /* Were word strings already allocated? */
    writable = model->writable;
    /* Either way, we are going to allocate some word strings. */
    model->writable = TRUE;
    /* Really should be big enough except for pathological cases. */
    maxlen = maxlen * sizeof(int) + 15;
    outbuf = ckd_calloc(maxlen, 1);
    /* And, don't forget, we need to rebuild the word to unigram ID
     * mapping. */
    new_wid = hash_table_new(model->n_words, FALSE);
    for (i = 0; i < model->n_words; ++i) {
        ICONV_CONST char *in;
        char *out;
        size_t inleft, outleft, result;

    start_conversion:
        in = (ICONV_CONST char *)model->word_str[i];
        /* Yes, this assumes that we don't have any NUL bytes. */
        inleft = strlen(in);
        out = outbuf;
        outleft = maxlen;

        while ((result = iconv(ic, &in, &inleft, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            out = outbuf = ckd_realloc(outbuf, maxlen);
            /* Reset the input pointers. */
            in = (ICONV_CONST char *)model->word_str[i];
            inleft = strlen(in);
        }

        /* Now flush a shift-out sequence, if any. */
        if ((result = iconv(ic, NULL, NULL, &out, &outleft)) == (size_t)-1) {
            if (errno != E2BIG) {
                /* FIXME: if we already converted any words, then they
                 * are going to be in an inconsistent state. */
                E_ERROR_SYSTEM("iconv() failed (state reset sequence)");
                ckd_free(outbuf);
                hash_table_free(new_wid);
                return -1;
            }
            /* Reset the internal state of conversion. */
            iconv(ic, NULL, NULL, NULL, NULL);
            /* Make everything bigger. */
            maxlen *= 2;
            outbuf = ckd_realloc(outbuf, maxlen);
            /* Be very evil. */
            goto start_conversion;
        }

        result = maxlen - outleft;
        /* Okay, that was hard, now let's go shopping. */
        if (writable) {
            /* Grow or shrink the output string as necessary. */
            model->word_str[i] = ckd_realloc(model->word_str[i], result + 1);
            model->word_str[i][result] = '\0';
        }
        else {
            /* It actually was not allocated previously, so do that now. */
            model->word_str[i] = ckd_calloc(result + 1, 1);
        }
        /* Copy the new thing in. */
        memcpy(model->word_str[i], outbuf, result);

        /* Now update the hash table.  We might have terrible
         * collisions if a non-reversible conversion was requested,
         * so warn about them. */
        if (hash_table_enter_int32(new_wid, model->word_str[i], i) != i) {
            E_WARN("Duplicate word in dictionary after conversion: %s\n",
                   model->word_str[i]);
        }
    }
    ckd_free(outbuf);
    iconv_close(ic);
    /* Swap out the hash table. */
    hash_table_free(model->wid);
    model->wid = new_wid;

    return 0;
}
#else /* !HAVE_ICONV */
int
ngram_model_recode(ngram_model_t *model, const char *from, const char *to)
{
    return -1;
}
#endif /* !HAVE_ICONV */
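/* Illustrative usage sketch: converting a model's vocabulary between
 * encodings.  Encoding names are passed straight to iconv_open(), so
 * anything iconv(3) understands is accepted.
 *
 *   if (ngram_model_recode(lm, "GB2312", "UTF-8") < 0)
 *       E_ERROR("recoding failed (or no iconv support)\n");
 */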
int
ngram_model_apply_weights(ngram_model_t *model,
                          float32 lw, float32 wip, float32 uw)
{
    return (*model->funcs->apply_weights)(model, lw, wip, uw);
}

float32
ngram_model_get_weights(ngram_model_t *model, int32 *out_log_wip,
                        int32 *out_log_uw)
{
    if (out_log_wip) *out_log_wip = model->log_wip;
    if (out_log_uw) *out_log_uw = model->log_uw;
    return model->lw;
}


int32
ngram_ng_score(ngram_model_t *model, int32 wid, int32 *history,
               int32 n_hist, int32 *n_used)
{
    int32 score, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    score = (*model->funcs->score)(model, wid, history, n_hist, n_used);

    /* Multiply by unigram in-class weight (addition in the log domain). */
    return score + class_weight;
}
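/* Illustrative usage sketch: scoring "three" given the history
 * "two one", with the most recent history word first in the array
 * (compare ngram_tg_score() below).  The words are hypothetical.
 *
 *   int32 hist[2], n_used;
 *   hist[0] = ngram_wid(lm, "two");
 *   hist[1] = ngram_wid(lm, "one");
 *   int32 score = ngram_ng_score(lm, ngram_wid(lm, "three"),
 *                                hist, 2, &n_used);
 *
 * n_used reports how many history words were actually consumed; it is
 * less than 2 when the model backs off.
 */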
int32
ngram_score(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_score(model, ngram_wid(model, word),
                          histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_tg_score(ngram_model_t *model, int32 w3, int32 w2, int32 w1, int32 *n_used)
{
    int32 hist[2];
    hist[0] = w2;
    hist[1] = w1;
    return ngram_ng_score(model, w3, hist, 2, n_used);
}

int32
ngram_bg_score(ngram_model_t *model, int32 w2, int32 w1, int32 *n_used)
{
    return ngram_ng_score(model, w2, &w1, 1, n_used);
}

int32
ngram_ng_prob(ngram_model_t *model, int32 wid, int32 *history,
              int32 n_hist, int32 *n_used)
{
    int32 prob, class_weight = 0;
    int i;

    /* Closed vocabulary, OOV word probability is zero */
    if (wid == NGRAM_INVALID_WID)
        return model->log_zero;

    /* "Declassify" wid and history */
    if (NGRAM_IS_CLASSWID(wid)) {
        ngram_class_t *lmclass = model->classes[NGRAM_CLASSID(wid)];

        class_weight = ngram_class_prob(lmclass, wid);
        if (class_weight == 1) /* Meaning, not found in class. */
            return model->log_zero;
        wid = lmclass->tag_wid;
    }
    for (i = 0; i < n_hist; ++i) {
        if (history[i] != NGRAM_INVALID_WID && NGRAM_IS_CLASSWID(history[i]))
            history[i] = model->classes[NGRAM_CLASSID(history[i])]->tag_wid;
    }
    prob = (*model->funcs->raw_score)(model, wid, history,
                                      n_hist, n_used);
    /* Multiply by unigram in-class weight (addition in the log domain). */
    return prob + class_weight;
}

int32
ngram_prob(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    int32 n_used;
    int32 prob;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    prob = ngram_ng_prob(model, ngram_wid(model, word),
                         histid, n_hist, &n_used);
    ckd_free(histid);
    return prob;
}

int32
ngram_score_to_prob(ngram_model_t *base, int32 score)
{
    int32 prob;

    /* Undo insertion penalty. */
    prob = score - base->log_wip;
    /* Undo language weight. */
    prob = (int32)(prob / base->lw);

    return prob;
}

int32
ngram_unknown_wid(ngram_model_t *model)
{
    int32 val;

    /* FIXME: This could be memoized for speed if necessary. */
    /* Look up <UNK>; if not found, return NGRAM_INVALID_WID. */
    if (hash_table_lookup_int32(model->wid, "<UNK>", &val) == -1)
        return NGRAM_INVALID_WID;
    else
        return val;
}
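/* Note on the score/prob pair of entry points above: ngram_score()
 * returns the weighted decoder score lw * log P(w | h) + log(wip),
 * while ngram_prob() returns the raw log P(w | h).
 * ngram_score_to_prob() inverts the weighting, e.g. (hypothetical
 * words):
 *
 *   int32 score = ngram_score(lm, "three", "two", "one", NULL);
 *   int32 logp  = ngram_score_to_prob(lm, score);
 */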
int32
ngram_zero(ngram_model_t *model)
{
    return model->log_zero;
}

int32
ngram_model_get_size(ngram_model_t *model)
{
    if (model != NULL)
        return model->n;
    return 0;
}

int32 const *
ngram_model_get_counts(ngram_model_t *model)
{
    if (model != NULL)
        return model->n_counts;
    return NULL;
}

void
ngram_iter_init(ngram_iter_t *itor, ngram_model_t *model,
                int m, int successor)
{
    itor->model = model;
    itor->wids = ckd_calloc(model->n, sizeof(*itor->wids));
    itor->m = m;
    itor->successor = successor;
}

ngram_iter_t *
ngram_model_mgrams(ngram_model_t *model, int m)
{
    ngram_iter_t *itor;
    /* The fact that m=n-1 is not exactly obvious.  Prevent accidents. */
    if (m >= model->n)
        return NULL;
    if (model->funcs->mgrams == NULL)
        return NULL;
    itor = (*model->funcs->mgrams)(model, m);
    return itor;
}

ngram_iter_t *
ngram_iter(ngram_model_t *model, const char *word, ...)
{
    va_list history;
    const char *hword;
    int32 *histid;
    int32 n_hist;
    ngram_iter_t *itor;

    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL)
        ++n_hist;
    va_end(history);

    histid = ckd_calloc(n_hist, sizeof(*histid));
    va_start(history, word);
    n_hist = 0;
    while ((hword = va_arg(history, const char *)) != NULL) {
        histid[n_hist] = ngram_wid(model, hword);
        ++n_hist;
    }
    va_end(history);

    itor = ngram_ng_iter(model, ngram_wid(model, word), histid, n_hist);
    ckd_free(histid);
    return itor;
}

ngram_iter_t *
ngram_ng_iter(ngram_model_t *model, int32 wid, int32 *history, int32 n_hist)
{
    if (n_hist >= model->n)
        return NULL;
    if (model->funcs->iter == NULL)
        return NULL;
    return (*model->funcs->iter)(model, wid, history, n_hist);
}

ngram_iter_t *
ngram_iter_successors(ngram_iter_t *itor)
{
    /* Stop when we are at the highest order N-Gram. */
    if (itor->m == itor->model->n - 1)
        return NULL;
    return (*itor->model->funcs->successors)(itor);
}
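/* Illustrative usage sketch: walking all bigrams (m is zero-based, so
 * m == 1 means bigrams).
 *
 *   ngram_iter_t *it;
 *   for (it = ngram_model_mgrams(lm, 1); it; it = ngram_iter_next(it)) {
 *       int32 score, bowt;
 *       int32 const *wids = ngram_iter_get(it, &score, &bowt);
 *       ...
 *   }
 *
 * If you stop iterating early, release the iterator with
 * ngram_iter_free(), defined below.
 */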
int32 const *
ngram_iter_get(ngram_iter_t *itor,
               int32 *out_score,
               int32 *out_bowt)
{
    return (*itor->model->funcs->iter_get)(itor, out_score, out_bowt);
}

ngram_iter_t *
ngram_iter_next(ngram_iter_t *itor)
{
    return (*itor->model->funcs->iter_next)(itor);
}

void
ngram_iter_free(ngram_iter_t *itor)
{
    ckd_free(itor->wids);
    (*itor->model->funcs->iter_free)(itor);
}

int32
ngram_wid(ngram_model_t *model, const char *word)
{
    int32 val;

    if (hash_table_lookup_int32(model->wid, word, &val) == -1)
        return ngram_unknown_wid(model);
    else
        return val;
}

const char *
ngram_word(ngram_model_t *model, int32 wid)
{
    /* Remove any class tag */
    wid = NGRAM_BASEWID(wid);
    if (wid >= model->n_words)
        return NULL;
    return model->word_str[wid];
}

/**
 * Add a word to the word string and ID mapping.
 */
int32
ngram_add_word_internal(ngram_model_t *model,
                        const char *word,
                        int32 classid)
{
    void *dummy;
    int32 wid;

    /* Take the next available word ID */
    wid = model->n_words;
    if (classid >= 0) {
        wid = NGRAM_CLASSWID(wid, classid);
    }
    /* Check for hash collisions. */
    if (hash_table_lookup(model->wid, word, &dummy) == 0) {
        E_ERROR("Duplicate definition of word %s\n", word);
        return NGRAM_INVALID_WID;
    }
    /* Reallocate word_str if necessary. */
    if (model->n_words >= model->n_1g_alloc) {
        model->n_1g_alloc += UG_ALLOC_STEP;
        model->word_str = ckd_realloc(model->word_str,
                                      sizeof(*model->word_str) * model->n_1g_alloc);
    }
    /* Add the word string in the appropriate manner. */
    /* Class words are always dynamically allocated. */
    model->word_str[model->n_words] = ckd_salloc(word);
    /* Now enter it into the hash table. */
    if (hash_table_enter_int32(model->wid, model->word_str[model->n_words], wid) != wid) {
        E_ERROR("Hash insertion failed for word %s => %p (should not happen)\n",
                model->word_str[model->n_words], (void *)(long)(wid));
    }
    /* Increment number of words. */
    ++model->n_words;
    return wid;
}

int32
ngram_model_add_word(ngram_model_t *model,
                     const char *word, float32 weight)
{
    int32 wid, prob = model->log_zero;

    /* Refuse to add words to a read-only (mmap-based) model. */
    if (!model->writable) {
        E_WARN("Can't add word '%s' to read-only language model. "
               "Disable mmap with '-mmap no' to make it writable\n", word);
        return -1;
    }

    wid = ngram_add_word_internal(model, word, -1);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* Do what needs to be done to add the word to the unigram. */
    if (model->funcs && model->funcs->add_ug)
        prob = (*model->funcs->add_ug)(model, wid, logmath_log(model->lmath, weight));
    if (prob == 0) {
        return -1;
    }
    return wid;
}
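/* Illustrative usage sketch: adding an out-of-vocabulary word with a
 * unigram weight.  The word and weight are hypothetical.
 *
 *   int32 wid = ngram_model_add_word(lm, "sphinxbase", 0.0001f);
 *   if (wid == NGRAM_INVALID_WID || wid == -1)
 *       E_ERROR("could not add word\n");
 */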
ngram_class_t *
ngram_class_new(ngram_model_t *model, int32 tag_wid, int32 start_wid, glist_t classwords)
{
    ngram_class_t *lmclass;
    gnode_t *gn;
    float32 tprob;
    int i;

    lmclass = ckd_calloc(1, sizeof(*lmclass));
    lmclass->tag_wid = tag_wid;
    /* start_wid is the wid (minus class tag) of the first word in the list. */
    lmclass->start_wid = start_wid;
    lmclass->n_words = glist_count(classwords);
    lmclass->prob1 = ckd_calloc(lmclass->n_words, sizeof(*lmclass->prob1));
    lmclass->nword_hash = NULL;
    lmclass->n_hash = 0;
    tprob = 0.0;
    for (gn = classwords; gn; gn = gnode_next(gn)) {
        tprob += gnode_float32(gn);
    }
    if (tprob > 1.1 || tprob < 0.9) {
        E_WARN("Total class probability is %f, will normalize\n", tprob);
        for (gn = classwords; gn; gn = gnode_next(gn)) {
            gn->data.fl /= tprob;
        }
    }
    for (i = 0, gn = classwords; gn; ++i, gn = gnode_next(gn)) {
        lmclass->prob1[i] = logmath_log(model->lmath, gnode_float32(gn));
    }

    return lmclass;
}

int32
ngram_class_add_word(ngram_class_t *lmclass, int32 wid, int32 lweight)
{
    int32 hash;

    if (lmclass->nword_hash == NULL) {
        /* Initialize everything in it to -1 */
        lmclass->nword_hash = ckd_malloc(NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        memset(lmclass->nword_hash, 0xff, NGRAM_HASH_SIZE * sizeof(*lmclass->nword_hash));
        lmclass->n_hash = NGRAM_HASH_SIZE;
        lmclass->n_hash_inuse = 0;
    }
    /* Stupidest possible hash function.  This will work pretty well
     * when this function is called repeatedly with contiguous word
     * IDs, though... */
    hash = wid & (lmclass->n_hash - 1);
    if (lmclass->nword_hash[hash].wid == -1) {
        /* Good, no collision. */
        lmclass->nword_hash[hash].wid = wid;
        lmclass->nword_hash[hash].prob1 = lweight;
        ++lmclass->n_hash_inuse;
        return hash;
    }
    else {
        int32 next;
        /* Collision... Find the end of the hash chain. */
        while (lmclass->nword_hash[hash].next != -1)
            hash = lmclass->nword_hash[hash].next;
        assert(hash != -1);
        /* Do we have any more buckets? */
        if (lmclass->n_hash_inuse == lmclass->n_hash) {
            /* No, so grow the table. */
            lmclass->nword_hash = ckd_realloc(lmclass->nword_hash,
                                              lmclass->n_hash * 2 * sizeof(*lmclass->nword_hash));
            memset(lmclass->nword_hash + lmclass->n_hash,
                   0xff, lmclass->n_hash * sizeof(*lmclass->nword_hash));
            /* Just use the next allocated one (easy) */
            next = lmclass->n_hash;
            lmclass->n_hash *= 2;
        }
        else {
            /* Look for any available bucket.  We hope this doesn't happen. */
            for (next = 0; next < lmclass->n_hash; ++next)
                if (lmclass->nword_hash[next].wid == -1)
                    break;
            /* This should absolutely not happen. */
            assert(next != lmclass->n_hash);
        }
        lmclass->nword_hash[next].wid = wid;
        lmclass->nword_hash[next].prob1 = lweight;
        lmclass->nword_hash[hash].next = next;
        ++lmclass->n_hash_inuse;
        return next;
    }
}
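/* Note on the structure above: nword_hash is a power-of-two sized
 * table with chaining through the .next index, so the "hash function"
 * (wid & (n_hash - 1)) is just the low bits of the word ID.  Lookups
 * walk the same chain in ngram_class_prob() below.
 */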
void
ngram_class_free(ngram_class_t *lmclass)
{
    ckd_free(lmclass->nword_hash);
    ckd_free(lmclass->prob1);
    ckd_free(lmclass);
}

int32
ngram_model_add_class_word(ngram_model_t *model,
                           const char *classname,
                           const char *word,
                           float32 weight)
{
    ngram_class_t *lmclass;
    int32 classid, tag_wid, wid, i, scale;
    float32 fprob;

    /* Find the class corresponding to classname.  Linear search
     * probably okay here since there won't be very many classes, and
     * this doesn't have to be fast. */
    tag_wid = ngram_wid(model, classname);
    if (tag_wid == NGRAM_INVALID_WID) {
        E_ERROR("No such word or class tag: %s\n", classname);
        return tag_wid;
    }
    for (classid = 0; classid < model->n_classes; ++classid) {
        if (model->classes[classid]->tag_wid == tag_wid)
            break;
    }
    /* Hmm, no such class.  It's probably not a good idea to create one. */
    if (classid == model->n_classes) {
        E_ERROR("Word %s is not a class tag (call ngram_model_add_class() first)\n", classname);
        return NGRAM_INVALID_WID;
    }
    lmclass = model->classes[classid];

    /* Add this word to the model's set of words. */
    wid = ngram_add_word_internal(model, word, classid);
    if (wid == NGRAM_INVALID_WID)
        return wid;

    /* This is the fixed probability of the new word. */
    fprob = weight * 1.0f / (lmclass->n_words + lmclass->n_hash_inuse + 1);
    /* Now normalize everything else to fit it in.  This is
     * accomplished by simply scaling all the other probabilities
     * by (1-fprob). */
    scale = logmath_log(model->lmath, 1.0 - fprob);
    for (i = 0; i < lmclass->n_words; ++i)
        lmclass->prob1[i] += scale;
    for (i = 0; i < lmclass->n_hash; ++i)
        if (lmclass->nword_hash[i].wid != -1)
            lmclass->nword_hash[i].prob1 += scale;

    /* Now add it to the class hash table. */
    return ngram_class_add_word(lmclass, wid, logmath_log(model->lmath, fprob));
}

int32
ngram_model_add_class(ngram_model_t *model,
                      const char *classname,
                      float32 classweight,
                      char **words,
                      const float32 *weights,
                      int32 n_words)
{
    ngram_class_t *lmclass;
    glist_t classwords = NULL;
    int32 i, start_wid = -1;
    int32 classid, tag_wid;

    /* Check if classname already exists in the model.  If not, add it. */
    if ((tag_wid = ngram_wid(model, classname)) == ngram_unknown_wid(model)) {
        tag_wid = ngram_model_add_word(model, classname, classweight);
        if (tag_wid == NGRAM_INVALID_WID)
            return -1;
    }

    if (model->n_classes == 128) {
        E_ERROR("Number of classes cannot exceed 128 (sorry)\n");
        return -1;
    }
    classid = model->n_classes;
    for (i = 0; i < n_words; ++i) {
        int32 wid;

        wid = ngram_add_word_internal(model, words[i], classid);
        if (wid == NGRAM_INVALID_WID)
            return -1;
        if (start_wid == -1)
            start_wid = NGRAM_BASEWID(wid);
        classwords = glist_add_float32(classwords, weights[i]);
    }
    classwords = glist_reverse(classwords);
    lmclass = ngram_class_new(model, tag_wid, start_wid, classwords);
    glist_free(classwords);
    if (lmclass == NULL)
        return -1;

    ++model->n_classes;
    if (model->classes == NULL)
        model->classes = ckd_calloc(1, sizeof(*model->classes));
    else
        model->classes = ckd_realloc(model->classes,
                                     model->n_classes * sizeof(*model->classes));
    model->classes[classid] = lmclass;
    return classid;
}
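/* Illustrative usage sketch: defining a small [NAME] class and
 * extending it later.  All names and weights are hypothetical.
 *
 *   const char *words[] = { "alice", "bob" };
 *   const float32 weights[] = { 0.5f, 0.5f };
 *   ngram_model_add_class(lm, "[NAME]", 1.0f,
 *                         (char **)words, weights, 2);
 *   ngram_model_add_class_word(lm, "[NAME]", "carol", 1.0f);
 */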
int32
ngram_class_prob(ngram_class_t *lmclass, int32 wid)
{
    int32 base_wid = NGRAM_BASEWID(wid);

    if (base_wid < lmclass->start_wid
        || base_wid >= lmclass->start_wid + lmclass->n_words) {
        int32 hash;

        /* Look it up in the hash table. */
        if (lmclass->nword_hash == NULL)
            return 1; /* Meaning, not found in class. */
        hash = wid & (lmclass->n_hash - 1);
        while (hash != -1 && lmclass->nword_hash[hash].wid != wid)
            hash = lmclass->nword_hash[hash].next;
        if (hash == -1)
            return 1;
        return lmclass->nword_hash[hash].prob1;
    }
    else {
        return lmclass->prob1[base_wid - lmclass->start_wid];
    }
}

int32
read_classdef_file(hash_table_t *classes, const char *file_name)
{
    FILE *fp;
    int32 is_pipe;
    int inclass;
    int32 rv = -1;
    gnode_t *gn;
    glist_t classwords = NULL;
    glist_t classprobs = NULL;
    char *classname = NULL;

    if ((fp = fopen_comp(file_name, "r", &is_pipe)) == NULL) {
        E_ERROR("File %s not found\n", file_name);
        return -1;
    }

    inclass = FALSE;
    while (!feof(fp)) {
        char line[512];
        char *wptr[2];
        int n_words;

        if (fgets(line, sizeof(line), fp) == NULL)
            break;

        n_words = str2words(line, wptr, 2);
        if (n_words <= 0)
            continue;

        if (inclass) {
            /* Look for an end of class marker. */
            if (n_words == 2 && 0 == strcmp(wptr[0], "END")) {
                classdef_t *classdef;
                gnode_t *word, *weight;
                int32 i;

                if (classname == NULL || 0 != strcmp(wptr[1], classname))
                    goto error_out;
                inclass = FALSE;

                /* Construct a class from the list of words collected. */
                classdef = ckd_calloc(1, sizeof(*classdef));
                classwords = glist_reverse(classwords);
                classprobs = glist_reverse(classprobs);
                classdef->n_words = glist_count(classwords);
                classdef->words = ckd_calloc(classdef->n_words,
                                             sizeof(*classdef->words));
                classdef->weights = ckd_calloc(classdef->n_words,
                                               sizeof(*classdef->weights));
                word = classwords;
                weight = classprobs;
                for (i = 0; i < classdef->n_words; ++i) {
                    classdef->words[i] = gnode_ptr(word);
                    classdef->weights[i] = gnode_float32(weight);
                    word = gnode_next(word);
                    weight = gnode_next(weight);
                }

                /* The word strings now belong to classdef, so free just
                 * the list nodes; this keeps the cleanup code below from
                 * double-freeing them on the error path. */
                glist_free(classwords);
                glist_free(classprobs);
                classwords = NULL;
                classprobs = NULL;

                /* Add this class to the hash table. */
                if (hash_table_enter(classes, classname, classdef) != classdef) {
                    classdef_free(classdef);
                    goto error_out;
                }
                /* The hash table now owns classname. */
                classname = NULL;
            }
            else {
                float32 fprob;

                if (n_words == 2)
                    fprob = (float32)atof_c(wptr[1]);
                else
                    fprob = 1.0f;
                /* Add it to the list of words for this class. */
                classwords = glist_add_ptr(classwords, ckd_salloc(wptr[0]));
                classprobs = glist_add_float32(classprobs, fprob);
            }
        }
        else {
            /* Start a new LM class if the LMCLASS marker is seen */
            if (n_words == 2 && 0 == strcmp(wptr[0], "LMCLASS")) {
                classname = ckd_salloc(wptr[1]);
                inclass = TRUE;
            }
            /* Otherwise, just ignore whatever junk we got */
        }
    }
    rv = 0; /* Success. */

error_out:
    /* Free all the stuff we might have allocated. */
    fclose_comp(fp, is_pipe);
    for (gn = classwords; gn; gn = gnode_next(gn))
        ckd_free(gnode_ptr(gn));
    glist_free(classwords);
    glist_free(classprobs);
    ckd_free(classname);

    return rv;
}
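/* The class definition format parsed above looks like this (the class
 * name and words are hypothetical; weights are optional and default
 * to 1.0 before normalization):
 *
 *   LMCLASS [NAME]
 *   alice 0.5
 *   bob 0.5
 *   END [NAME]
 *
 * Anything outside an LMCLASS...END block is ignored.
 */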
void
classdef_free(classdef_t *classdef)
{
    int32 i;
    for (i = 0; i < classdef->n_words; ++i)
        ckd_free(classdef->words[i]);
    ckd_free(classdef->words);
    ckd_free(classdef->weights);
    ckd_free(classdef);
}


int32
ngram_model_read_classdef(ngram_model_t *model,
                          const char *file_name)
{
    hash_table_t *classes;
    glist_t hl = NULL;
    gnode_t *gn;
    int32 rv = -1;

    classes = hash_table_new(0, FALSE);
    if (read_classdef_file(classes, file_name) < 0) {
        hash_table_free(classes);
        return -1;
    }

    /* Create a new class in the language model for each classdef. */
    hl = hash_table_tolist(classes, NULL);
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        classdef_t *classdef = he->val;

        if (ngram_model_add_class(model, he->key, 1.0,
                                  classdef->words,
                                  classdef->weights,
                                  classdef->n_words) < 0)
            goto error_out;
    }
    rv = 0;

error_out:
    for (gn = hl; gn; gn = gnode_next(gn)) {
        hash_entry_t *he = gnode_ptr(gn);
        ckd_free((char *)he->key);
        classdef_free(he->val);
    }
    glist_free(hl);
    hash_table_free(classes);
    return rv;
}
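/* Illustrative usage sketch: loading class definitions from a file in
 * the format shown above.  The file name is hypothetical.
 *
 *   if (ngram_model_read_classdef(lm, "names.lmclass") < 0)
 *       E_ERROR("failed to read class definitions\n");
 */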