SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 2008 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00042 #include <string.h> 00043 #include <stdlib.h> 00044 00045 #include "sphinxbase/err.h" 00046 #include "sphinxbase/ckd_alloc.h" 00047 #include "sphinxbase/strfuncs.h" 00048 #include "sphinxbase/filename.h" 00049 00050 #include "ngram_model_set.h" 00051 00052 static ngram_funcs_t ngram_model_set_funcs; 00053 00054 static int 00055 my_compare(const void *a, const void *b) 00056 { 00057 /* Make sure <UNK> floats to the beginning. */ 00058 if (strcmp(*(char * const *)a, "<UNK>") == 0) 00059 return -1; 00060 else if (strcmp(*(char * const *)b, "<UNK>") == 0) 00061 return 1; 00062 else 00063 return strcmp(*(char * const *)a, *(char * const *)b); 00064 } 00065 00066 static void 00067 build_widmap(ngram_model_t *base, logmath_t *lmath, int32 n) 00068 { 00069 ngram_model_set_t *set = (ngram_model_set_t *)base; 00070 ngram_model_t **models = set->lms; 00071 hash_table_t *vocab; 00072 glist_t hlist; 00073 gnode_t *gn; 00074 int32 i; 00075 00076 /* Construct a merged vocabulary and a set of word-ID mappings. */ 00077 vocab = hash_table_new(models[0]->n_words, FALSE); 00078 /* Create the set of merged words. */ 00079 for (i = 0; i < set->n_models; ++i) { 00080 int32 j; 00081 for (j = 0; j < models[i]->n_words; ++j) { 00082 /* Ignore collisions. */ 00083 (void)hash_table_enter_int32(vocab, models[i]->word_str[j], j); 00084 } 00085 } 00086 /* Create the array of words, then sort it. */ 00087 if (hash_table_lookup(vocab, "<UNK>", NULL) != 0) 00088 (void)hash_table_enter_int32(vocab, "<UNK>", 0); 00089 /* Now we know the number of unigrams, initialize the base model. */ 00090 ngram_model_init(base, &ngram_model_set_funcs, lmath, n, hash_table_inuse(vocab)); 00091 base->writable = FALSE; /* We will reuse the pointers from the submodels. */ 00092 i = 0; 00093 hlist = hash_table_tolist(vocab, NULL); 00094 for (gn = hlist; gn; gn = gnode_next(gn)) { 00095 hash_entry_t *ent = gnode_ptr(gn); 00096 base->word_str[i++] = (char *)ent->key; 00097 } 00098 glist_free(hlist); 00099 qsort(base->word_str, base->n_words, sizeof(*base->word_str), my_compare); 00100 00101 /* Now create the word ID mappings. */ 00102 if (set->widmap) 00103 ckd_free_2d((void **)set->widmap); 00104 set->widmap = (int32 **) ckd_calloc_2d(base->n_words, set->n_models, 00105 sizeof(**set->widmap)); 00106 for (i = 0; i < base->n_words; ++i) { 00107 int32 j; 00108 /* Also create the master wid mapping. */ 00109 (void)hash_table_enter_int32(base->wid, base->word_str[i], i); 00110 /* printf("%s: %d => ", base->word_str[i], i); */ 00111 for (j = 0; j < set->n_models; ++j) { 00112 set->widmap[i][j] = ngram_wid(models[j], base->word_str[i]); 00113 /* printf("%d ", set->widmap[i][j]); */ 00114 } 00115 /* printf("\n"); */ 00116 } 00117 hash_table_free(vocab); 00118 } 00119 00120 ngram_model_t * 00121 ngram_model_set_init(cmd_ln_t *config, 00122 ngram_model_t **models, 00123 char **names, 00124 const float32 *weights, 00125 int32 n_models) 00126 { 00127 ngram_model_set_t *model; 00128 ngram_model_t *base; 00129 logmath_t *lmath; 00130 int32 i, n; 00131 00132 if (n_models == 0) /* WTF */ 00133 return NULL; 00134 00135 /* Do consistency checking on the models. They must all use the 00136 * same logbase and shift. */ 00137 lmath = models[0]->lmath; 00138 for (i = 1; i < n_models; ++i) { 00139 if (logmath_get_base(models[i]->lmath) != logmath_get_base(lmath) 00140 || logmath_get_shift(models[i]->lmath) != logmath_get_shift(lmath)) { 00141 E_ERROR("Log-math parameters don't match, will not create LM set\n"); 00142 return NULL; 00143 } 00144 } 00145 00146 /* Allocate the combined model, initialize it. */ 00147 model = ckd_calloc(1, sizeof(*model)); 00148 base = &model->base; 00149 model->n_models = n_models; 00150 model->lms = ckd_calloc(n_models, sizeof(*model->lms)); 00151 model->names = ckd_calloc(n_models, sizeof(*model->names)); 00152 /* Initialize weights to a uniform distribution */ 00153 model->lweights = ckd_calloc(n_models, sizeof(*model->lweights)); 00154 { 00155 int32 uniform = logmath_log(lmath, 1.0/n_models); 00156 for (i = 0; i < n_models; ++i) 00157 model->lweights[i] = uniform; 00158 } 00159 /* Default to interpolate if weights were given. */ 00160 if (weights) 00161 model->cur = -1; 00162 00163 n = 0; 00164 for (i = 0; i < n_models; ++i) { 00165 model->lms[i] = models[i]; 00166 model->names[i] = ckd_salloc(names[i]); 00167 if (weights) 00168 model->lweights[i] = logmath_log(lmath, weights[i]); 00169 /* N is the maximum of all merged models. */ 00170 if (models[i]->n > n) 00171 n = models[i]->n; 00172 } 00173 /* Allocate the history mapping table. */ 00174 model->maphist = ckd_calloc(n - 1, sizeof(*model->maphist)); 00175 00176 /* Now build the word-ID mapping and merged vocabulary. */ 00177 build_widmap(base, lmath, n); 00178 return base; 00179 } 00180 00181 ngram_model_t * 00182 ngram_model_set_read(cmd_ln_t *config, 00183 const char *lmctlfile, 00184 logmath_t *lmath) 00185 { 00186 FILE *ctlfp; 00187 glist_t lms = NULL; 00188 glist_t lmnames = NULL; 00189 __BIGSTACKVARIABLE__ char str[1024]; 00190 ngram_model_t *set = NULL; 00191 hash_table_t *classes; 00192 char *basedir, *c; 00193 00194 /* Read all the class definition files to accumulate a mapping of 00195 * classnames to definitions. */ 00196 classes = hash_table_new(0, FALSE); 00197 if ((ctlfp = fopen(lmctlfile, "r")) == NULL) { 00198 E_ERROR_SYSTEM("Failed to open %s", lmctlfile); 00199 return NULL; 00200 } 00201 00202 /* Try to find the base directory to append to relative paths in 00203 * the lmctl file. */ 00204 if ((c = strrchr(lmctlfile, '/')) || (c = strrchr(lmctlfile, '\\'))) { 00205 /* Include the trailing slash. */ 00206 basedir = ckd_calloc(c - lmctlfile + 2, 1); 00207 memcpy(basedir, lmctlfile, c - lmctlfile + 1); 00208 } 00209 else { 00210 basedir = NULL; 00211 } 00212 E_INFO("Reading LM control file '%s'\n", lmctlfile); 00213 if (basedir) 00214 E_INFO("Will prepend '%s' to unqualified paths\n", basedir); 00215 00216 if (fscanf(ctlfp, "%1023s", str) == 1) { 00217 if (strcmp(str, "{") == 0) { 00218 /* Load LMclass files */ 00219 while ((fscanf(ctlfp, "%1023s", str) == 1) 00220 && (strcmp(str, "}") != 0)) { 00221 char *deffile; 00222 if (basedir && !path_is_absolute(str)) 00223 deffile = string_join(basedir, str, NULL); 00224 else 00225 deffile = ckd_salloc(str); 00226 E_INFO("Reading classdef from '%s'\n", deffile); 00227 if (read_classdef_file(classes, deffile) < 0) { 00228 ckd_free(deffile); 00229 goto error_out; 00230 } 00231 ckd_free(deffile); 00232 } 00233 00234 if (strcmp(str, "}") != 0) { 00235 E_ERROR("Unexpected EOF in %s\n", lmctlfile); 00236 goto error_out; 00237 } 00238 00239 /* This might be the first LM name. */ 00240 if (fscanf(ctlfp, "%1023s", str) != 1) 00241 str[0] = '\0'; 00242 } 00243 } 00244 else 00245 str[0] = '\0'; 00246 00247 /* Read in one LM at a time and add classes to them as necessary. */ 00248 while (str[0] != '\0') { 00249 char *lmfile; 00250 ngram_model_t *lm; 00251 00252 if (basedir && str[0] != '/' && str[0] != '\\') 00253 lmfile = string_join(basedir, str, NULL); 00254 else 00255 lmfile = ckd_salloc(str); 00256 E_INFO("Reading lm from '%s'\n", lmfile); 00257 lm = ngram_model_read(config, lmfile, NGRAM_AUTO, lmath); 00258 if (lm == NULL) { 00259 ckd_free(lmfile); 00260 goto error_out; 00261 } 00262 if (fscanf(ctlfp, "%1023s", str) != 1) { 00263 E_ERROR("LMname missing after LMFileName '%s'\n", lmfile); 00264 ckd_free(lmfile); 00265 goto error_out; 00266 } 00267 ckd_free(lmfile); 00268 lms = glist_add_ptr(lms, lm); 00269 lmnames = glist_add_ptr(lmnames, ckd_salloc(str)); 00270 00271 if (fscanf(ctlfp, "%1023s", str) == 1) { 00272 if (strcmp(str, "{") == 0) { 00273 /* LM uses classes; read their names */ 00274 while ((fscanf(ctlfp, "%1023s", str) == 1) && 00275 (strcmp(str, "}") != 0)) { 00276 void *val; 00277 classdef_t *classdef; 00278 00279 if (hash_table_lookup(classes, str, &val) == -1) { 00280 E_ERROR("Unknown class %s in control file\n", str); 00281 goto error_out; 00282 } 00283 classdef = val; 00284 if (ngram_model_add_class(lm, str, 1.0, 00285 classdef->words, classdef->weights, 00286 classdef->n_words) < 0) { 00287 goto error_out; 00288 } 00289 E_INFO("Added class %s containing %d words\n", 00290 str, classdef->n_words); 00291 } 00292 if (strcmp(str, "}") != 0) { 00293 E_ERROR("Unexpected EOF in %s\n", lmctlfile); 00294 goto error_out; 00295 } 00296 if (fscanf(ctlfp, "%1023s", str) != 1) 00297 str[0] = '\0'; 00298 } 00299 } 00300 else 00301 str[0] = '\0'; 00302 } 00303 fclose(ctlfp); 00304 00305 /* Now construct arrays out of lms and lmnames, and build an 00306 * ngram_model_set. */ 00307 lms = glist_reverse(lms); 00308 lmnames = glist_reverse(lmnames); 00309 { 00310 int32 n_models; 00311 ngram_model_t **lm_array; 00312 char **name_array; 00313 gnode_t *lm_node, *name_node; 00314 int32 i; 00315 00316 n_models = glist_count(lms); 00317 lm_array = ckd_calloc(n_models, sizeof(*lm_array)); 00318 name_array = ckd_calloc(n_models, sizeof(*name_array)); 00319 lm_node = lms; 00320 name_node = lmnames; 00321 for (i = 0; i < n_models; ++i) { 00322 lm_array[i] = gnode_ptr(lm_node); 00323 name_array[i] = gnode_ptr(name_node); 00324 lm_node = gnode_next(lm_node); 00325 name_node = gnode_next(name_node); 00326 } 00327 set = ngram_model_set_init(config, lm_array, name_array, 00328 NULL, n_models); 00329 ckd_free(lm_array); 00330 ckd_free(name_array); 00331 } 00332 error_out: 00333 { 00334 gnode_t *gn; 00335 glist_t hlist; 00336 00337 if (set == NULL) { 00338 for (gn = lms; gn; gn = gnode_next(gn)) { 00339 ngram_model_free(gnode_ptr(gn)); 00340 } 00341 } 00342 glist_free(lms); 00343 for (gn = lmnames; gn; gn = gnode_next(gn)) { 00344 ckd_free(gnode_ptr(gn)); 00345 } 00346 glist_free(lmnames); 00347 hlist = hash_table_tolist(classes, NULL); 00348 for (gn = hlist; gn; gn = gnode_next(gn)) { 00349 hash_entry_t *he = gnode_ptr(gn); 00350 ckd_free((char *)he->key); 00351 classdef_free(he->val); 00352 } 00353 glist_free(hlist); 00354 hash_table_free(classes); 00355 ckd_free(basedir); 00356 } 00357 return set; 00358 } 00359 00360 int32 00361 ngram_model_set_count(ngram_model_t *base) 00362 { 00363 ngram_model_set_t *set = (ngram_model_set_t *)base; 00364 return set->n_models; 00365 } 00366 00367 ngram_model_set_iter_t * 00368 ngram_model_set_iter(ngram_model_t *base) 00369 { 00370 ngram_model_set_t *set = (ngram_model_set_t *)base; 00371 ngram_model_set_iter_t *itor; 00372 00373 if (set == NULL || set->n_models == 0) 00374 return NULL; 00375 itor = ckd_calloc(1, sizeof(*itor)); 00376 itor->set = set; 00377 return itor; 00378 } 00379 00380 ngram_model_set_iter_t * 00381 ngram_model_set_iter_next(ngram_model_set_iter_t *itor) 00382 { 00383 if (++itor->cur == itor->set->n_models) { 00384 ngram_model_set_iter_free(itor); 00385 return NULL; 00386 } 00387 return itor; 00388 } 00389 00390 void 00391 ngram_model_set_iter_free(ngram_model_set_iter_t *itor) 00392 { 00393 ckd_free(itor); 00394 } 00395 00396 ngram_model_t * 00397 ngram_model_set_iter_model(ngram_model_set_iter_t *itor, 00398 char const **lmname) 00399 { 00400 if (lmname) *lmname = itor->set->names[itor->cur]; 00401 return itor->set->lms[itor->cur]; 00402 } 00403 00404 ngram_model_t * 00405 ngram_model_set_lookup(ngram_model_t *base, 00406 const char *name) 00407 { 00408 ngram_model_set_t *set = (ngram_model_set_t *)base; 00409 int32 i; 00410 00411 if (name == NULL) { 00412 if (set->cur == -1) 00413 return NULL; 00414 else 00415 return set->lms[set->cur]; 00416 } 00417 00418 /* There probably won't be very many submodels. */ 00419 for (i = 0; i < set->n_models; ++i) 00420 if (0 == strcmp(set->names[i], name)) 00421 break; 00422 if (i == set->n_models) 00423 return NULL; 00424 return set->lms[i]; 00425 } 00426 00427 ngram_model_t * 00428 ngram_model_set_select(ngram_model_t *base, 00429 const char *name) 00430 { 00431 ngram_model_set_t *set = (ngram_model_set_t *)base; 00432 int32 i; 00433 00434 /* There probably won't be very many submodels. */ 00435 for (i = 0; i < set->n_models; ++i) 00436 if (0 == strcmp(set->names[i], name)) 00437 break; 00438 if (i == set->n_models) 00439 return NULL; 00440 set->cur = i; 00441 return set->lms[set->cur]; 00442 } 00443 00444 const char * 00445 ngram_model_set_current(ngram_model_t *base) 00446 { 00447 ngram_model_set_t *set = (ngram_model_set_t *)base; 00448 00449 if (set->cur == -1) 00450 return NULL; 00451 else 00452 return set->names[set->cur]; 00453 } 00454 00455 int32 00456 ngram_model_set_current_wid(ngram_model_t *base, 00457 int32 set_wid) 00458 { 00459 ngram_model_set_t *set = (ngram_model_set_t *)base; 00460 00461 if (set->cur == -1 || set_wid >= base->n_words) 00462 return NGRAM_INVALID_WID; 00463 else 00464 return set->widmap[set_wid][set->cur]; 00465 } 00466 00467 int32 00468 ngram_model_set_known_wid(ngram_model_t *base, 00469 int32 set_wid) 00470 { 00471 ngram_model_set_t *set = (ngram_model_set_t *)base; 00472 00473 if (set_wid >= base->n_words) 00474 return FALSE; 00475 else if (set->cur == -1) { 00476 int32 i; 00477 for (i = 0; i < set->n_models; ++i) { 00478 if (set->widmap[set_wid][i] != ngram_unknown_wid(set->lms[i])) 00479 return TRUE; 00480 } 00481 return FALSE; 00482 } 00483 else 00484 return (set->widmap[set_wid][set->cur] 00485 != ngram_unknown_wid(set->lms[set->cur])); 00486 } 00487 00488 ngram_model_t * 00489 ngram_model_set_interp(ngram_model_t *base, 00490 const char **names, 00491 const float32 *weights) 00492 { 00493 ngram_model_set_t *set = (ngram_model_set_t *)base; 00494 00495 /* If we have a set of weights here, then set them. */ 00496 if (names && weights) { 00497 int32 i, j; 00498 00499 /* We hope there aren't many models. */ 00500 for (i = 0; i < set->n_models; ++i) { 00501 for (j = 0; j < set->n_models; ++j) 00502 if (0 == strcmp(names[i], set->names[j])) 00503 break; 00504 if (j == set->n_models) { 00505 E_ERROR("Unknown LM name %s\n", names[i]); 00506 return NULL; 00507 } 00508 set->lweights[j] = logmath_log(base->lmath, weights[i]); 00509 } 00510 } 00511 else if (weights) { 00512 memcpy(set->lweights, weights, set->n_models * sizeof(*set->lweights)); 00513 } 00514 /* Otherwise just enable existing weights. */ 00515 set->cur = -1; 00516 return base; 00517 } 00518 00519 ngram_model_t * 00520 ngram_model_set_add(ngram_model_t *base, 00521 ngram_model_t *model, 00522 const char *name, 00523 float32 weight, 00524 int reuse_widmap) 00525 00526 { 00527 ngram_model_set_t *set = (ngram_model_set_t *)base; 00528 float32 fprob; 00529 int32 scale, i; 00530 00531 /* Add it to the array of lms. */ 00532 ++set->n_models; 00533 set->lms = ckd_realloc(set->lms, set->n_models * sizeof(*set->lms)); 00534 set->lms[set->n_models - 1] = model; 00535 set->names = ckd_realloc(set->names, set->n_models * sizeof(*set->names)); 00536 set->names[set->n_models - 1] = ckd_salloc(name); 00537 /* Expand the history mapping table if necessary. */ 00538 if (model->n > base->n) { 00539 base->n = model->n; 00540 set->maphist = ckd_realloc(set->maphist, 00541 (model->n - 1) * sizeof(*set->maphist)); 00542 } 00543 00544 /* Renormalize the interpolation weights. */ 00545 fprob = weight * 1.0 / set->n_models; 00546 set->lweights = ckd_realloc(set->lweights, 00547 set->n_models * sizeof(*set->lweights)); 00548 set->lweights[set->n_models - 1] = logmath_log(base->lmath, fprob); 00549 /* Now normalize everything else to fit it in. This is 00550 * accomplished by simply scaling all the other probabilities 00551 * by (1-fprob). */ 00552 scale = logmath_log(base->lmath, 1.0 - fprob); 00553 for (i = 0; i < set->n_models - 1; ++i) 00554 set->lweights[i] += scale; 00555 00556 /* Reuse the old word ID mapping if requested. */ 00557 if (reuse_widmap) { 00558 int32 **new_widmap; 00559 00560 /* Tack another column onto the widmap array. */ 00561 new_widmap = (int32 **)ckd_calloc_2d(base->n_words, set->n_models, 00562 sizeof (**new_widmap)); 00563 for (i = 0; i < base->n_words; ++i) { 00564 /* Copy all the existing mappings. */ 00565 memcpy(new_widmap[i], set->widmap[i], 00566 (set->n_models - 1) * sizeof(**new_widmap)); 00567 /* Create the new mapping. */ 00568 new_widmap[i][set->n_models-1] = ngram_wid(model, base->word_str[i]); 00569 } 00570 ckd_free_2d((void **)set->widmap); 00571 set->widmap = new_widmap; 00572 } 00573 else { 00574 build_widmap(base, base->lmath, base->n); 00575 } 00576 return model; 00577 } 00578 00579 ngram_model_t * 00580 ngram_model_set_remove(ngram_model_t *base, 00581 const char *name, 00582 int reuse_widmap) 00583 { 00584 ngram_model_set_t *set = (ngram_model_set_t *)base; 00585 ngram_model_t *submodel; 00586 int32 lmidx, scale, n, i; 00587 float32 fprob; 00588 00589 for (lmidx = 0; lmidx < set->n_models; ++lmidx) 00590 if (0 == strcmp(name, set->names[lmidx])) 00591 break; 00592 if (lmidx == set->n_models) 00593 return NULL; 00594 submodel = set->lms[lmidx]; 00595 00596 /* Renormalize the interpolation weights by scaling them by 00597 * 1/(1-fprob) */ 00598 fprob = logmath_exp(base->lmath, set->lweights[lmidx]); 00599 scale = logmath_log(base->lmath, 1.0 - fprob); 00600 00601 /* Remove it from the array of lms, renormalize remaining weights, 00602 * and recalcluate n. */ 00603 --set->n_models; 00604 n = 0; 00605 ckd_free(set->names[lmidx]); 00606 set->names[lmidx] = NULL; 00607 for (i = 0; i < set->n_models; ++i) { 00608 if (i >= lmidx) { 00609 set->lms[i] = set->lms[i+1]; 00610 set->names[i] = set->names[i+1]; 00611 set->lweights[i] = set->lweights[i+1]; 00612 } 00613 set->lweights[i] -= scale; 00614 if (set->lms[i]->n > n) 00615 n = set->lms[i]->n; 00616 } 00617 /* There's no need to shrink these arrays. */ 00618 set->lms[set->n_models] = NULL; 00619 set->lweights[set->n_models] = base->log_zero; 00620 /* No need to shrink maphist either. */ 00621 00622 /* Reuse the existing word ID mapping if requested. */ 00623 if (reuse_widmap) { 00624 /* Just go through and shrink each row. */ 00625 for (i = 0; i < base->n_words; ++i) { 00626 memmove(set->widmap[i] + lmidx, set->widmap[i] + lmidx + 1, 00627 (set->n_models - lmidx) * sizeof(**set->widmap)); 00628 } 00629 } 00630 else { 00631 build_widmap(base, base->lmath, n); 00632 } 00633 return submodel; 00634 } 00635 00636 void 00637 ngram_model_set_map_words(ngram_model_t *base, 00638 const char **words, 00639 int32 n_words) 00640 { 00641 ngram_model_set_t *set = (ngram_model_set_t *)base; 00642 int32 i; 00643 00644 /* Recreate the word mapping. */ 00645 if (base->writable) { 00646 for (i = 0; i < base->n_words; ++i) { 00647 ckd_free(base->word_str[i]); 00648 } 00649 } 00650 ckd_free(base->word_str); 00651 ckd_free_2d((void **)set->widmap); 00652 base->writable = TRUE; 00653 base->n_words = base->n_1g_alloc = n_words; 00654 base->word_str = ckd_calloc(n_words, sizeof(*base->word_str)); 00655 set->widmap = (int32 **)ckd_calloc_2d(n_words, set->n_models, sizeof(**set->widmap)); 00656 hash_table_empty(base->wid); 00657 for (i = 0; i < n_words; ++i) { 00658 int32 j; 00659 base->word_str[i] = ckd_salloc(words[i]); 00660 (void)hash_table_enter_int32(base->wid, base->word_str[i], i); 00661 for (j = 0; j < set->n_models; ++j) { 00662 set->widmap[i][j] = ngram_wid(set->lms[j], base->word_str[i]); 00663 } 00664 } 00665 } 00666 00667 static int 00668 ngram_model_set_apply_weights(ngram_model_t *base, float32 lw, 00669 float32 wip, float32 uw) 00670 { 00671 ngram_model_set_t *set = (ngram_model_set_t *)base; 00672 int32 i; 00673 00674 /* Apply weights to each sub-model. */ 00675 for (i = 0; i < set->n_models; ++i) 00676 ngram_model_apply_weights(set->lms[i], lw, wip, uw); 00677 return 0; 00678 } 00679 00680 static int32 00681 ngram_model_set_score(ngram_model_t *base, int32 wid, 00682 int32 *history, int32 n_hist, 00683 int32 *n_used) 00684 { 00685 ngram_model_set_t *set = (ngram_model_set_t *)base; 00686 int32 mapwid; 00687 int32 score; 00688 int32 i; 00689 00690 /* Truncate the history. */ 00691 if (n_hist > base->n - 1) 00692 n_hist = base->n - 1; 00693 00694 /* Interpolate if there is no current. */ 00695 if (set->cur == -1) { 00696 score = base->log_zero; 00697 for (i = 0; i < set->n_models; ++i) { 00698 int32 j; 00699 /* Map word and history IDs for each model. */ 00700 mapwid = set->widmap[wid][i]; 00701 for (j = 0; j < n_hist; ++j) { 00702 if (history[j] == NGRAM_INVALID_WID) 00703 set->maphist[j] = NGRAM_INVALID_WID; 00704 else 00705 set->maphist[j] = set->widmap[history[j]][i]; 00706 } 00707 score = logmath_add(base->lmath, score, 00708 set->lweights[i] + 00709 ngram_ng_score(set->lms[i], 00710 mapwid, set->maphist, n_hist, n_used)); 00711 } 00712 } 00713 else { 00714 int32 j; 00715 /* Map word and history IDs (FIXME: do this in a function?) */ 00716 mapwid = set->widmap[wid][set->cur]; 00717 for (j = 0; j < n_hist; ++j) { 00718 if (history[j] == NGRAM_INVALID_WID) 00719 set->maphist[j] = NGRAM_INVALID_WID; 00720 else 00721 set->maphist[j] = set->widmap[history[j]][set->cur]; 00722 } 00723 score = ngram_ng_score(set->lms[set->cur], 00724 mapwid, set->maphist, n_hist, n_used); 00725 } 00726 00727 return score; 00728 } 00729 00730 static int32 00731 ngram_model_set_raw_score(ngram_model_t *base, int32 wid, 00732 int32 *history, int32 n_hist, 00733 int32 *n_used) 00734 { 00735 ngram_model_set_t *set = (ngram_model_set_t *)base; 00736 int32 mapwid; 00737 int32 score; 00738 int32 i; 00739 00740 /* Truncate the history. */ 00741 if (n_hist > base->n - 1) 00742 n_hist = base->n - 1; 00743 00744 /* Interpolate if there is no current. */ 00745 if (set->cur == -1) { 00746 score = base->log_zero; 00747 for (i = 0; i < set->n_models; ++i) { 00748 int32 j; 00749 /* Map word and history IDs for each model. */ 00750 mapwid = set->widmap[wid][i]; 00751 for (j = 0; j < n_hist; ++j) { 00752 if (history[j] == NGRAM_INVALID_WID) 00753 set->maphist[j] = NGRAM_INVALID_WID; 00754 else 00755 set->maphist[j] = set->widmap[history[j]][i]; 00756 } 00757 score = logmath_add(base->lmath, score, 00758 set->lweights[i] + 00759 ngram_ng_prob(set->lms[i], 00760 mapwid, set->maphist, n_hist, n_used)); 00761 } 00762 } 00763 else { 00764 int32 j; 00765 /* Map word and history IDs (FIXME: do this in a function?) */ 00766 mapwid = set->widmap[wid][set->cur]; 00767 for (j = 0; j < n_hist; ++j) { 00768 if (history[j] == NGRAM_INVALID_WID) 00769 set->maphist[j] = NGRAM_INVALID_WID; 00770 else 00771 set->maphist[j] = set->widmap[history[j]][set->cur]; 00772 } 00773 score = ngram_ng_prob(set->lms[set->cur], 00774 mapwid, set->maphist, n_hist, n_used); 00775 } 00776 00777 return score; 00778 } 00779 00780 static int32 00781 ngram_model_set_add_ug(ngram_model_t *base, 00782 int32 wid, int32 lweight) 00783 { 00784 ngram_model_set_t *set = (ngram_model_set_t *)base; 00785 int32 *newwid; 00786 int32 i, prob; 00787 00788 /* At this point the word has already been added to the master 00789 model and we have a new word ID for it. Add it to active 00790 submodels and track the word IDs. */ 00791 newwid = ckd_calloc(set->n_models, sizeof(*newwid)); 00792 prob = base->log_zero; 00793 for (i = 0; i < set->n_models; ++i) { 00794 int32 wprob, n_hist; 00795 00796 /* Only add to active models. */ 00797 if (set->cur == -1 || set->cur == i) { 00798 /* Did this word already exist? */ 00799 newwid[i] = ngram_wid(set->lms[i], base->word_str[wid]); 00800 if (newwid[i] == NGRAM_INVALID_WID) { 00801 /* Add it to the submodel. */ 00802 newwid[i] = ngram_model_add_word(set->lms[i], base->word_str[wid], 00803 logmath_exp(base->lmath, lweight)); 00804 if (newwid[i] == NGRAM_INVALID_WID) { 00805 ckd_free(newwid); 00806 return base->log_zero; 00807 } 00808 } 00809 /* Now get the unigram probability for the new word and either 00810 * interpolate it or use it (if this is the current model). */ 00811 wprob = ngram_ng_prob(set->lms[i], newwid[i], NULL, 0, &n_hist); 00812 if (set->cur == i) 00813 prob = wprob; 00814 else if (set->cur == -1) 00815 prob = logmath_add(base->lmath, prob, set->lweights[i] + wprob); 00816 } 00817 else { 00818 newwid[i] = NGRAM_INVALID_WID; 00819 } 00820 } 00821 /* Okay we have the word IDs for this in all the submodels. Now 00822 do some complicated memory mangling to add this to the 00823 widmap. */ 00824 set->widmap = ckd_realloc(set->widmap, base->n_words * sizeof(*set->widmap)); 00825 set->widmap[0] = ckd_realloc(set->widmap[0], 00826 base->n_words 00827 * set->n_models 00828 * sizeof(**set->widmap)); 00829 for (i = 0; i < base->n_words; ++i) 00830 set->widmap[i] = set->widmap[0] + i * set->n_models; 00831 memcpy(set->widmap[wid], newwid, set->n_models * sizeof(*newwid)); 00832 ckd_free(newwid); 00833 return prob; 00834 } 00835 00836 static void 00837 ngram_model_set_free(ngram_model_t *base) 00838 { 00839 ngram_model_set_t *set = (ngram_model_set_t *)base; 00840 int32 i; 00841 00842 for (i = 0; i < set->n_models; ++i) 00843 ngram_model_free(set->lms[i]); 00844 ckd_free(set->lms); 00845 for (i = 0; i < set->n_models; ++i) 00846 ckd_free(set->names[i]); 00847 ckd_free(set->names); 00848 ckd_free(set->lweights); 00849 ckd_free(set->maphist); 00850 ckd_free_2d((void **)set->widmap); 00851 } 00852 00853 static void 00854 ngram_model_set_flush(ngram_model_t *base) 00855 { 00856 ngram_model_set_t *set = (ngram_model_set_t *)base; 00857 int32 i; 00858 00859 for (i = 0; i < set->n_models; ++i) 00860 ngram_model_flush(set->lms[i]); 00861 } 00862 00863 static ngram_funcs_t ngram_model_set_funcs = { 00864 ngram_model_set_free, /* free */ 00865 ngram_model_set_apply_weights, /* apply_weights */ 00866 ngram_model_set_score, /* score */ 00867 ngram_model_set_raw_score, /* raw_score */ 00868 ngram_model_set_add_ug, /* add_ug */ 00869 ngram_model_set_flush /* flush */ 00870 };