SphinxBase 0.6
|
00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */ 00002 /* ==================================================================== 00003 * Copyright (c) 1999-2007 Carnegie Mellon University. All rights 00004 * reserved. 00005 * 00006 * Redistribution and use in source and binary forms, with or without 00007 * modification, are permitted provided that the following conditions 00008 * are met: 00009 * 00010 * 1. Redistributions of source code must retain the above copyright 00011 * notice, this list of conditions and the following disclaimer. 00012 * 00013 * 2. Redistributions in binary form must reproduce the above copyright 00014 * notice, this list of conditions and the following disclaimer in 00015 * the documentation and/or other materials provided with the 00016 * distribution. 00017 * 00018 * This work was supported in part by funding from the Defense Advanced 00019 * Research Projects Agency and the National Science Foundation of the 00020 * United States of America, and the CMU Sphinx Speech Consortium. 00021 * 00022 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 00023 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 00024 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 00025 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY 00026 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 00027 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 00028 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 00029 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 00030 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 00031 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 00032 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00033 * 00034 * ==================================================================== 00035 * 00036 */ 00037 /* 00038 * \file lm3g_templates.c Core Sphinx 3-gram code used in 00039 * DMP/DMP32/ARPA (for now) model code. 00040 */ 00041 00042 #include <assert.h> 00043 00044 /* Locate a specific bigram within a bigram list */ 00045 #define BINARY_SEARCH_THRESH 16 00046 static int32 00047 find_bg(bigram_t * bg, int32 n, int32 w) 00048 { 00049 int32 i, b, e; 00050 00051 /* Binary search until segment size < threshold */ 00052 b = 0; 00053 e = n; 00054 while (e - b > BINARY_SEARCH_THRESH) { 00055 i = (b + e) >> 1; 00056 if (bg[i].wid < w) 00057 b = i + 1; 00058 else if (bg[i].wid > w) 00059 e = i; 00060 else 00061 return i; 00062 } 00063 00064 /* Linear search within narrowed segment */ 00065 for (i = b; (i < e) && (bg[i].wid != w); i++); 00066 return ((i < e) ? i : -1); 00067 } 00068 00069 static int32 00070 lm3g_bg_score(NGRAM_MODEL_TYPE *model, 00071 int32 lw1, int32 lw2, int32 *n_used) 00072 { 00073 int32 i, n, b, score; 00074 bigram_t *bg; 00075 00076 if (lw1 < 0 || model->base.n < 2) { 00077 *n_used = 1; 00078 return model->lm3g.unigrams[lw2].prob1.l; 00079 } 00080 00081 b = FIRST_BG(model, lw1); 00082 n = FIRST_BG(model, lw1 + 1) - b; 00083 bg = model->lm3g.bigrams + b; 00084 00085 if ((i = find_bg(bg, n, lw2)) >= 0) { 00086 /* Access mode = bigram */ 00087 *n_used = 2; 00088 score = model->lm3g.prob2[bg[i].prob2].l; 00089 } 00090 else { 00091 /* Access mode = unigram */ 00092 *n_used = 1; 00093 score = model->lm3g.unigrams[lw1].bo_wt1.l + model->lm3g.unigrams[lw2].prob1.l; 00094 } 00095 00096 return (score); 00097 } 00098 00099 static void 00100 load_tginfo(NGRAM_MODEL_TYPE *model, int32 lw1, int32 lw2) 00101 { 00102 int32 i, n, b, t; 00103 bigram_t *bg; 00104 tginfo_t *tginfo; 00105 00106 /* First allocate space for tg information for bg lw1,lw2 */ 00107 tginfo = (tginfo_t *) listelem_malloc(model->lm3g.le); 00108 tginfo->w1 = lw1; 00109 tginfo->tg = NULL; 00110 tginfo->next = model->lm3g.tginfo[lw2]; 00111 model->lm3g.tginfo[lw2] = tginfo; 00112 00113 /* Locate bigram lw1,lw2 */ 00114 b = model->lm3g.unigrams[lw1].bigrams; 00115 n = model->lm3g.unigrams[lw1 + 1].bigrams - b; 00116 bg = model->lm3g.bigrams + b; 00117 00118 if ((n > 0) && ((i = find_bg(bg, n, lw2)) >= 0)) { 00119 tginfo->bowt = model->lm3g.bo_wt2[bg[i].bo_wt2].l; 00120 00121 /* Find t = Absolute first trigram index for bigram lw1,lw2 */ 00122 b += i; /* b = Absolute index of bigram lw1,lw2 on disk */ 00123 t = FIRST_TG(model, b); 00124 00125 tginfo->tg = model->lm3g.trigrams + t; 00126 00127 /* Find #tg for bigram w1,w2 */ 00128 tginfo->n_tg = FIRST_TG(model, b + 1) - t; 00129 } 00130 else { /* No bigram w1,w2 */ 00131 tginfo->bowt = 0; 00132 tginfo->n_tg = 0; 00133 } 00134 } 00135 00136 /* Similar to find_bg */ 00137 static int32 00138 find_tg(trigram_t * tg, int32 n, int32 w) 00139 { 00140 int32 i, b, e; 00141 00142 b = 0; 00143 e = n; 00144 while (e - b > BINARY_SEARCH_THRESH) { 00145 i = (b + e) >> 1; 00146 if (tg[i].wid < w) 00147 b = i + 1; 00148 else if (tg[i].wid > w) 00149 e = i; 00150 else 00151 return i; 00152 } 00153 00154 for (i = b; (i < e) && (tg[i].wid != w); i++); 00155 return ((i < e) ? i : -1); 00156 } 00157 00158 static int32 00159 lm3g_tg_score(NGRAM_MODEL_TYPE *model, int32 lw1, 00160 int32 lw2, int32 lw3, int32 *n_used) 00161 { 00162 ngram_model_t *base = &model->base; 00163 int32 i, n, score; 00164 trigram_t *tg; 00165 tginfo_t *tginfo, *prev_tginfo; 00166 00167 if ((base->n < 3) || (lw1 < 0) || (lw2 < 0)) 00168 return (lm3g_bg_score(model, lw2, lw3, n_used)); 00169 00170 prev_tginfo = NULL; 00171 for (tginfo = model->lm3g.tginfo[lw2]; tginfo; tginfo = tginfo->next) { 00172 if (tginfo->w1 == lw1) 00173 break; 00174 prev_tginfo = tginfo; 00175 } 00176 00177 if (!tginfo) { 00178 load_tginfo(model, lw1, lw2); 00179 tginfo = model->lm3g.tginfo[lw2]; 00180 } 00181 else if (prev_tginfo) { 00182 prev_tginfo->next = tginfo->next; 00183 tginfo->next = model->lm3g.tginfo[lw2]; 00184 model->lm3g.tginfo[lw2] = tginfo; 00185 } 00186 00187 tginfo->used = 1; 00188 00189 /* Trigrams for w1,w2 now pointed to by tginfo */ 00190 n = tginfo->n_tg; 00191 tg = tginfo->tg; 00192 if ((i = find_tg(tg, n, lw3)) >= 0) { 00193 /* Access mode = trigram */ 00194 *n_used = 3; 00195 score = model->lm3g.prob3[tg[i].prob3].l; 00196 } 00197 else { 00198 score = tginfo->bowt + lm3g_bg_score(model, lw2, lw3, n_used); 00199 } 00200 00201 return (score); 00202 } 00203 00204 static int32 00205 lm3g_template_score(ngram_model_t *base, int32 wid, 00206 int32 *history, int32 n_hist, 00207 int32 *n_used) 00208 { 00209 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00210 switch (n_hist) { 00211 case 0: 00212 /* Access mode: unigram */ 00213 *n_used = 1; 00214 return model->lm3g.unigrams[wid].prob1.l; 00215 case 1: 00216 return lm3g_bg_score(model, history[0], wid, n_used); 00217 case 2: 00218 default: 00219 /* Anything greater than 2 is the same as a trigram for now. */ 00220 return lm3g_tg_score(model, history[1], history[0], wid, n_used); 00221 } 00222 } 00223 00224 static int32 00225 lm3g_template_raw_score(ngram_model_t *base, int32 wid, 00226 int32 *history, int32 n_hist, 00227 int32 *n_used) 00228 { 00229 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00230 int32 score; 00231 00232 switch (n_hist) { 00233 case 0: 00234 /* Access mode: unigram */ 00235 *n_used = 1; 00236 /* Undo insertion penalty. */ 00237 score = model->lm3g.unigrams[wid].prob1.l - base->log_wip; 00238 /* Undo language weight. */ 00239 score = (int32)(score / base->lw); 00240 /* Undo unigram interpolation */ 00241 if (strcmp(base->word_str[wid], "<s>") != 0) { /* FIXME: configurable start_sym */ 00242 /* This operation is numerically unstable, so try to avoid it 00243 * as possible */ 00244 if (base->log_uniform + base->log_uniform_weight > logmath_get_zero(base->lmath)) { 00245 score = logmath_log(base->lmath, 00246 logmath_exp(base->lmath, score) 00247 - logmath_exp(base->lmath, 00248 base->log_uniform + base->log_uniform_weight)); 00249 } 00250 } 00251 return score; 00252 case 1: 00253 score = lm3g_bg_score(model, history[0], wid, n_used); 00254 break; 00255 case 2: 00256 default: 00257 /* Anything greater than 2 is the same as a trigram for now. */ 00258 score = lm3g_tg_score(model, history[1], history[0], wid, n_used); 00259 break; 00260 } 00261 /* FIXME (maybe): This doesn't undo unigram weighting in backoff cases. */ 00262 return (int32)((score - base->log_wip) / base->lw); 00263 } 00264 00265 static int32 00266 lm3g_template_add_ug(ngram_model_t *base, 00267 int32 wid, int32 lweight) 00268 { 00269 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00270 return lm3g_add_ug(base, &model->lm3g, wid, lweight); 00271 } 00272 00273 static void 00274 lm3g_template_flush(ngram_model_t *base) 00275 { 00276 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00277 lm3g_tginfo_reset(base, &model->lm3g); 00278 } 00279 00280 typedef struct lm3g_iter_s { 00281 ngram_iter_t base; 00282 unigram_t *ug; 00283 bigram_t *bg; 00284 trigram_t *tg; 00285 } lm3g_iter_t; 00286 00287 static ngram_iter_t * 00288 lm3g_template_iter(ngram_model_t *base, int32 wid, 00289 int32 *history, int32 n_hist) 00290 { 00291 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00292 lm3g_iter_t *itor = ckd_calloc(1, sizeof(*itor)); 00293 00294 ngram_iter_init((ngram_iter_t *)itor, base, n_hist, FALSE); 00295 00296 if (n_hist == 0) { 00297 /* Unigram is the easiest. */ 00298 itor->ug = model->lm3g.unigrams + wid; 00299 return (ngram_iter_t *)itor; 00300 } 00301 else if (n_hist == 1) { 00302 int32 i, n, b; 00303 /* Find the bigram, as in bg_score above (duplicate code...) */ 00304 itor->ug = model->lm3g.unigrams + history[0]; 00305 b = FIRST_BG(model, history[0]); 00306 n = FIRST_BG(model, history[0] + 1) - b; 00307 itor->bg = model->lm3g.bigrams + b; 00308 /* If no such bigram exists then fail. */ 00309 if ((i = find_bg(itor->bg, n, wid)) < 0) { 00310 ngram_iter_free((ngram_iter_t *)itor); 00311 return NULL; 00312 } 00313 itor->bg += i; 00314 return (ngram_iter_t *)itor; 00315 } 00316 else if (n_hist == 2) { 00317 int32 i, n; 00318 tginfo_t *tginfo, *prev_tginfo; 00319 /* Find the trigram, as in tg_score above (duplicate code...) */ 00320 itor->ug = model->lm3g.unigrams + history[1]; 00321 prev_tginfo = NULL; 00322 for (tginfo = model->lm3g.tginfo[history[0]]; 00323 tginfo; tginfo = tginfo->next) { 00324 if (tginfo->w1 == history[1]) 00325 break; 00326 prev_tginfo = tginfo; 00327 } 00328 00329 if (!tginfo) { 00330 load_tginfo(model, history[1], history[0]); 00331 tginfo = model->lm3g.tginfo[history[0]]; 00332 } 00333 else if (prev_tginfo) { 00334 prev_tginfo->next = tginfo->next; 00335 tginfo->next = model->lm3g.tginfo[history[0]]; 00336 model->lm3g.tginfo[history[0]] = tginfo; 00337 } 00338 00339 tginfo->used = 1; 00340 00341 /* Trigrams for w1,w2 now pointed to by tginfo */ 00342 n = tginfo->n_tg; 00343 itor->tg = tginfo->tg; 00344 if ((i = find_tg(itor->tg, n, wid)) >= 0) { 00345 itor->tg += i; 00346 /* Now advance the bigram pointer accordingly. FIXME: 00347 * Note that we actually already found the relevant bigram 00348 * in load_tginfo. */ 00349 itor->bg = model->lm3g.bigrams; 00350 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) 00351 <= (itor->tg - model->lm3g.trigrams)) 00352 ++itor->bg; 00353 return (ngram_iter_t *)itor; 00354 } 00355 else { 00356 ngram_iter_free((ngram_iter_t *)itor); 00357 return (ngram_iter_t *)NULL; 00358 } 00359 } 00360 else { 00361 /* Should not happen. */ 00362 assert(n_hist == 0); /* Guaranteed to fail. */ 00363 ngram_iter_free((ngram_iter_t *)itor); 00364 return NULL; 00365 } 00366 } 00367 00368 static ngram_iter_t * 00369 lm3g_template_mgrams(ngram_model_t *base, int m) 00370 { 00371 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base; 00372 lm3g_iter_t *itor = ckd_calloc(1, sizeof(*itor)); 00373 ngram_iter_init((ngram_iter_t *)itor, base, m, FALSE); 00374 00375 itor->ug = model->lm3g.unigrams; 00376 itor->bg = model->lm3g.bigrams; 00377 itor->tg = model->lm3g.trigrams; 00378 00379 /* Advance bigram pointer to match first trigram. */ 00380 if (m > 1 && base->n_counts[1] > 1) { 00381 while (FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1)) 00382 <= (itor->tg - model->lm3g.trigrams)) 00383 ++itor->bg; 00384 } 00385 00386 /* Advance unigram pointer to match first bigram. */ 00387 if (m > 0 && base->n_counts[0] > 1) { 00388 while (itor->ug[1].bigrams <= (itor->bg - model->lm3g.bigrams)) 00389 ++itor->ug; 00390 } 00391 00392 return (ngram_iter_t *)itor; 00393 } 00394 00395 static ngram_iter_t * 00396 lm3g_template_successors(ngram_iter_t *bitor) 00397 { 00398 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)bitor->model; 00399 lm3g_iter_t *from = (lm3g_iter_t *)bitor; 00400 lm3g_iter_t *itor = ckd_calloc(1, sizeof(*itor)); 00401 00402 itor->ug = from->ug; 00403 switch (bitor->m) { 00404 case 0: 00405 /* Next itor bigrams is the same as this itor bigram or 00406 itor bigrams is more than total count. This means no successors */ 00407 if (((itor->ug + 1) - model->lm3g.unigrams < bitor->model->n_counts[0] && 00408 itor->ug->bigrams == (itor->ug + 1)->bigrams) || 00409 itor->ug->bigrams == bitor->model->n_counts[1]) 00410 goto done; 00411 00412 /* Start iterating from first bigram successor of from->ug. */ 00413 itor->bg = model->lm3g.bigrams + itor->ug->bigrams; 00414 break; 00415 case 1: 00416 itor->bg = from->bg; 00417 00418 /* This indicates no successors */ 00419 if (((itor->bg + 1) - model->lm3g.bigrams < bitor->model->n_counts[1] && 00420 FIRST_TG (model, itor->bg - model->lm3g.bigrams) == 00421 FIRST_TG (model, (itor->bg + 1) - model->lm3g.bigrams)) || 00422 FIRST_TG (model, itor->bg - model->lm3g.bigrams) == bitor->model->n_counts[2]) 00423 goto done; 00424 00425 /* Start iterating from first trigram successor of from->bg. */ 00426 itor->tg = (model->lm3g.trigrams 00427 + FIRST_TG(model, (itor->bg - model->lm3g.bigrams))); 00428 #if 0 00429 printf("%s %s => %d (%s)\n", 00430 model->base.word_str[itor->ug - model->lm3g.unigrams], 00431 model->base.word_str[itor->bg->wid], 00432 FIRST_TG(model, (itor->bg - model->lm3g.bigrams)), 00433 model->base.word_str[itor->tg->wid]); 00434 #endif 00435 break; 00436 case 2: 00437 default: 00438 /* All invalid! */ 00439 goto done; 00440 } 00441 00442 ngram_iter_init((ngram_iter_t *)itor, bitor->model, bitor->m + 1, TRUE); 00443 return (ngram_iter_t *)itor; 00444 done: 00445 ckd_free(itor); 00446 return NULL; 00447 } 00448 00449 static int32 const * 00450 lm3g_template_iter_get(ngram_iter_t *base, 00451 int32 *out_score, int32 *out_bowt) 00452 { 00453 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; 00454 lm3g_iter_t *itor = (lm3g_iter_t *)base; 00455 00456 base->wids[0] = itor->ug - model->lm3g.unigrams; 00457 if (itor->bg) base->wids[1] = itor->bg->wid; 00458 if (itor->tg) base->wids[2] = itor->tg->wid; 00459 #if 0 00460 printf("itor_get: %d %d %d\n", base->wids[0], base->wids[1], base->wids[2]); 00461 #endif 00462 00463 switch (base->m) { 00464 case 0: 00465 *out_score = itor->ug->prob1.l; 00466 *out_bowt = itor->ug->bo_wt1.l; 00467 break; 00468 case 1: 00469 *out_score = model->lm3g.prob2[itor->bg->prob2].l; 00470 if (model->lm3g.bo_wt2) 00471 *out_bowt = model->lm3g.bo_wt2[itor->bg->bo_wt2].l; 00472 else 00473 *out_bowt = 0; 00474 break; 00475 case 2: 00476 *out_score = model->lm3g.prob3[itor->tg->prob3].l; 00477 *out_bowt = 0; 00478 break; 00479 default: /* Should not happen. */ 00480 return NULL; 00481 } 00482 return base->wids; 00483 } 00484 00485 static ngram_iter_t * 00486 lm3g_template_iter_next(ngram_iter_t *base) 00487 { 00488 NGRAM_MODEL_TYPE *model = (NGRAM_MODEL_TYPE *)base->model; 00489 lm3g_iter_t *itor = (lm3g_iter_t *)base; 00490 00491 switch (base->m) { 00492 case 0: 00493 ++itor->ug; 00494 /* Check for end condition. */ 00495 if (itor->ug - model->lm3g.unigrams >= base->model->n_counts[0]) 00496 goto done; 00497 break; 00498 case 1: 00499 ++itor->bg; 00500 /* Check for end condition. */ 00501 if (itor->bg - model->lm3g.bigrams >= base->model->n_counts[1]) 00502 goto done; 00503 /* Advance unigram pointer if necessary in order to get one 00504 * that points to this bigram. */ 00505 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { 00506 /* Stop if this is a successor iterator, since we don't 00507 * want a new unigram. */ 00508 if (base->successor) 00509 goto done; 00510 ++itor->ug; 00511 if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { 00512 E_ERROR("Bigram %d has no valid unigram parent\n", 00513 itor->bg - model->lm3g.bigrams); 00514 goto done; 00515 } 00516 } 00517 break; 00518 case 2: 00519 ++itor->tg; 00520 /* Check for end condition. */ 00521 if (itor->tg - model->lm3g.trigrams >= base->model->n_counts[2]) 00522 goto done; 00523 /* Advance bigram pointer if necessary. */ 00524 while (itor->tg - model->lm3g.trigrams >= 00525 FIRST_TG(model, (itor->bg - model->lm3g.bigrams + 1))) { 00526 if (base->successor) 00527 goto done; 00528 ++itor->bg; 00529 if (itor->bg == model->lm3g.bigrams + base->model->n_counts[1]) { 00530 E_ERROR("Trigram %d has no valid bigram parent\n", 00531 itor->tg - model->lm3g.trigrams); 00532 00533 goto done; 00534 } 00535 } 00536 /* Advance unigram pointer if necessary. */ 00537 while (itor->bg - model->lm3g.bigrams >= itor->ug[1].bigrams) { 00538 ++itor->ug; 00539 if (itor->ug == model->lm3g.unigrams + base->model->n_counts[0]) { 00540 E_ERROR("Trigram %d has no valid unigram parent\n", 00541 itor->tg - model->lm3g.trigrams); 00542 goto done; 00543 } 00544 } 00545 break; 00546 default: /* Should not happen. */ 00547 goto done; 00548 } 00549 00550 return (ngram_iter_t *)itor; 00551 done: 00552 ngram_iter_free(base); 00553 return NULL; 00554 } 00555 00556 static void 00557 lm3g_template_iter_free(ngram_iter_t *base) 00558 { 00559 ckd_free(base); 00560 }