SphinxBase 0.6

src/libsphinxbase/util/hash_table.c

00001 /* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
00002 /* ====================================================================
00003  * Copyright (c) 1999-2004 Carnegie Mellon University.  All rights
00004  * reserved.
00005  *
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions
00008  * are met:
00009  *
00010  * 1. Redistributions of source code must retain the above copyright
00011  *    notice, this list of conditions and the following disclaimer. 
00012  *
00013  * 2. Redistributions in binary form must reproduce the above copyright
00014  *    notice, this list of conditions and the following disclaimer in
00015  *    the documentation and/or other materials provided with the
00016  *    distribution.
00017  *
00018  * This work was supported in part by funding from the Defense Advanced 
00019  * Research Projects Agency and the National Science Foundation of the 
00020  * United States of America, and the CMU Sphinx Speech Consortium.
00021  *
00022  * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND 
00023  * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
00024  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
00025  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
00026  * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
00027  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 
00028  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 
00029  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 
00030  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
00031  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
00032  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00033  *
00034  * ====================================================================
00035  *
00036  */
00037 /*
00038  * hash.c -- Hash table module.
00039  *
00040  * **********************************************
00041  * CMU ARPA Speech Project
00042  *
00043  * Copyright (c) 1999 Carnegie Mellon University.
00044  * ALL RIGHTS RESERVED.
00045  * **********************************************
00046  * 
00047  * HISTORY
00048  * $Log: hash.c,v $
00049  * Revision 1.5  2005/06/22 03:04:01  arthchan2003
00050  * 1, Implemented hash_delete and hash_display, 2, Fixed doxygen documentation, 3, Added  keyword.
00051  *
00052  * Revision 1.9  2005/05/25 06:17:53  archan
00053  * Delete the test code in cmd_ln.c and fixed platform specific code of hash.c
00054  *
00055  * Revision 1.8  2005/05/24 01:10:54  archan
00056  * Fix a bug when the value only appear in the hash but there is no chain.   Also make sure that prev was initialized to NULL. All success cases were tested, but not tested with the deletion is tested.
00057  *
00058  * Revision 1.6  2005/05/24 00:00:45  archan
00059  * Added basic functionalities to hash_t: 1, display and 2, delete a key from a hash. \n
00060  *
00061  * Revision 1.5  2005/05/11 07:01:38  archan
00062  * Added comments on the usage of the current implementation of hash tables.
00063  *
00064  * Revision 1.4  2005/05/03 04:09:11  archan
00065  * Implemented the heart of word copy search. For every ci-phone, every word end, a tree will be allocated to preserve its pathscore.  This is different from 3.5 or below, only the best score for a particular ci-phone, regardless of the word-ends will be preserved at every frame.  The graph propagation will not collect unused word tree at this point. srch_WST_propagate_wd_lv2 is also as the most stupid in the century.  But well, after all, everything needs a start.  I will then really get the results from the search and see how it looks.
00066  *
00067  * Revision 1.3  2005/03/30 01:22:48  archan
00068  * Fixed mistakes in last updates. Add
00069  *
00070  * 
00071  * 05-May-1999  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
00072  *              Removed hash_key2hash().  Added hash_enter_bkey() and hash_lookup_bkey(),
00073  *              and len attribute to hash_entry_t.
00074  * 
00075  * 30-Apr-1999  M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
00076  *              Added hash_key2hash().
00077  * 
00078  * 18-Jun-97    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
00079  *              Included case sensitive/insensitive option.  Removed local, static
00080  *              maintenance of all hash tables.
00081  * 
00082  * 31-Jul-95    M K Ravishankar (rkm@cs.cmu.edu) at Carnegie Mellon
00083  *              Created.
00084  */
00085 
00086 
00087 #include <stdio.h>
00088 #include <stdlib.h>
00089 #include <string.h>
00090 #include <assert.h>
00091 
00092 #ifdef _MSC_VER
00093 #pragma warning (disable: 4018)
00094 #endif
00095 
00096 #include "sphinxbase/hash_table.h"
00097 #include "sphinxbase/err.h"
00098 #include "sphinxbase/ckd_alloc.h"
00099 #include "sphinxbase/case.h"
00100 
00101 
00102 #if 0
00103 static void
00104 prime_sieve(int32 max)
00105 {
00106     char *notprime;
00107     int32 p, pp;
00108 
00109     notprime = (char *) ckd_calloc(max + 1, 1);
00110     p = 2;
00111     for (;;) {
00112         printf("%d\n", p);
00113         for (pp = p + p; pp <= max; pp += p)
00114             notprime[pp] = 1;
00115         for (++p; (p <= max) && notprime[p]; p++);
00116         if (p > max)
00117             break;
00118     }
00119 }
00120 #endif
00121 
00122 
00123 /*
00124  * HACK!!  Initial hash table size is restricted by this set of primes.  (Of course,
00125  * collision resolution by chaining will accommodate more entries indefinitely, but
00126  * efficiency will drop.)
00127  */
00128 const int32 prime[] = {
00129     101, 211, 307, 401, 503, 601, 701, 809, 907,
00130     1009, 1201, 1601, 2003, 2411, 3001, 4001, 5003, 6007, 7001, 8009,
00131     9001,
00132     10007, 12007, 16001, 20011, 24001, 30011, 40009, 50021, 60013,
00133     70001, 80021, 90001,
00134     100003, 120011, 160001, 200003, 240007, 300007, 400009, 500009,
00135     600011, 700001, 800011, 900001,
00136     -1
00137 };
00138 
00139 
00143 static int32
00144 prime_size(int32 size)
00145 {
00146     int32 i;
00147 
00148     for (i = 0; (prime[i] > 0) && (prime[i] < size); i++);
00149     if (prime[i] <= 0) {
00150         E_WARN("Very large hash table requested (%d entries)\n", size);
00151         --i;
00152     }
00153     return (prime[i]);
00154 }
00155 
00156 
00157 hash_table_t *
00158 hash_table_new(int32 size, int32 casearg)
00159 {
00160     hash_table_t *h;
00161 
00162     h = (hash_table_t *) ckd_calloc(1, sizeof(hash_table_t));
00163     h->size = prime_size(size + (size >> 1));
00164     h->nocase = (casearg == HASH_CASE_NO);
00165     h->table = (hash_entry_t *) ckd_calloc(h->size, sizeof(hash_entry_t));
00166     /* The above calloc clears h->table[*].key and .next to NULL, i.e. an empty table */
00167 
00168     return h;
00169 }
00170 
00171 
00172 /*
00173  * Compute hash value for given key string.
00174  * Somewhat tuned for English text word strings.
00175  */
00176 static uint32
00177 key2hash(hash_table_t * h, const char *key)
00178 {
00179 
00180     register const char *cp;
00181 
00191     /*register char c; */
00192     register unsigned char c;
00193     register int32 s;
00194     register uint32 hash;
00195 
00196     hash = 0;
00197     s = 0;
00198 
00199     if (h->nocase) {
00200         for (cp = key; *cp; cp++) {
00201             c = *cp;
00202             c = UPPER_CASE(c);
00203             hash += c << s;
00204             s += 5;
00205             if (s >= 25)
00206                 s -= 24;
00207         }
00208     }
00209     else {
00210         for (cp = key; *cp; cp++) {
00211             hash += (*cp) << s;
00212             s += 5;
00213             if (s >= 25)
00214                 s -= 24;
00215         }
00216     }
00217 
00218     return (hash % h->size);
00219 }
00220 
00221 
00222 static char *
00223 makekey(uint8 * data, int32 len, char *key)
00224 {
00225     int32 i, j;
00226 
00227     if (!key)
00228         key = (char *) ckd_calloc(len * 2 + 1, sizeof(char));
00229 
00230     for (i = 0, j = 0; i < len; i++, j += 2) {
00231         key[j] = 'A' + (data[i] & 0x000f);
00232         key[j + 1] = 'J' + ((data[i] >> 4) & 0x000f);
00233     }
00234     key[j] = '\0';
00235 
00236     return key;
00237 }
00238 
00239 
00240 static int32
00241 keycmp_nocase(hash_entry_t * entry, const char *key)
00242 {
00243     char c1, c2;
00244     int32 i;
00245     const char *str;
00246 
00247     str = entry->key;
00248     for (i = 0; i < entry->len; i++) {
00249         c1 = *(str++);
00250         c1 = UPPER_CASE(c1);
00251         c2 = *(key++);
00252         c2 = UPPER_CASE(c2);
00253         if (c1 != c2)
00254             return (c1 - c2);
00255     }
00256 
00257     return 0;
00258 }
00259 
00260 
00261 static int32
00262 keycmp_case(hash_entry_t * entry, const char *key)
00263 {
00264     char c1, c2;
00265     int32 i;
00266     const char *str;
00267 
00268     str = entry->key;
00269     for (i = 0; i < entry->len; i++) {
00270         c1 = *(str++);
00271         c2 = *(key++);
00272         if (c1 != c2)
00273             return (c1 - c2);
00274     }
00275 
00276     return 0;
00277 }
00278 
00279 
00280 /*
00281  * Lookup entry with hash-value hash in table h for given key
00282  * Return value: hash_entry_t for key
00283  */
00284 static hash_entry_t *
00285 lookup(hash_table_t * h, uint32 hash, const char *key, size_t len)
00286 {
00287     hash_entry_t *entry;
00288 
00289     entry = &(h->table[hash]);
00290     if (entry->key == NULL)
00291         return NULL;
00292 
00293     if (h->nocase) {
00294         while (entry && ((entry->len != len)
00295                          || (keycmp_nocase(entry, key) != 0)))
00296             entry = entry->next;
00297     }
00298     else {
00299         while (entry && ((entry->len != len)
00300                          || (keycmp_case(entry, key) != 0)))
00301             entry = entry->next;
00302     }
00303 
00304     return entry;
00305 }
00306 
00307 
00308 int32
00309 hash_table_lookup(hash_table_t * h, const char *key, void ** val)
00310 {
00311     hash_entry_t *entry;
00312     uint32 hash;
00313     int32 len;
00314 
00315     hash = key2hash(h, key);
00316     len = strlen(key);
00317 
00318     entry = lookup(h, hash, key, len);
00319     if (entry) {
00320         if (val)
00321             *val = entry->val;
00322         return 0;
00323     }
00324     else
00325         return -1;
00326 }
00327 
00328 int32
00329 hash_table_lookup_int32(hash_table_t * h, const char *key, int32 *val)
00330 {
00331     void *vval;
00332     int32 rv;
00333 
00334     rv = hash_table_lookup(h, key, &vval);
00335     if (rv != 0)
00336         return rv;
00337     if (val)
00338         *val = (int32)(long)vval;
00339     return 0;
00340 }
00341 
00342 
00343 int32
00344 hash_table_lookup_bkey(hash_table_t * h, const char *key, size_t len, void ** val)
00345 {
00346     hash_entry_t *entry;
00347     uint32 hash;
00348     char *str;
00349 
00350     str = makekey((uint8 *) key, len, NULL);
00351     hash = key2hash(h, str);
00352     ckd_free(str);
00353 
00354     entry = lookup(h, hash, key, len);
00355     if (entry) {
00356         if (val)
00357             *val = entry->val;
00358         return 0;
00359     }
00360     else
00361         return -1;
00362 }
00363 
00364 int32
00365 hash_table_lookup_bkey_int32(hash_table_t * h, const char *key, size_t len, int32 *val)
00366 {
00367     void *vval;
00368     int32 rv;
00369 
00370     rv = hash_table_lookup_bkey(h, key, len, &vval);
00371     if (rv != 0)
00372         return rv;
00373     if (val)
00374         *val = (int32)(long)vval;
00375     return 0;
00376 }
00377 
00378 
00379 static void *
00380 enter(hash_table_t * h, uint32 hash, const char *key, size_t len, void *val, int32 replace)
00381 {
00382     hash_entry_t *cur, *new;
00383 
00384     if ((cur = lookup(h, hash, key, len)) != NULL) {
00385         void *oldval;
00386         /* Key already exists. */
00387         oldval = cur->val;
00388         if (replace) {
00389             /* Replace the pointer if replacement is requested,
00390              * because this might be a different instance of the same
00391              * string (this verges on magic, sorry) */
00392             cur->key = key;
00393             cur->val = val;
00394         }
00395         return oldval;
00396     }
00397 
00398     cur = &(h->table[hash]);
00399     if (cur->key == NULL) {
00400         /* Empty slot at hashed location; add this entry */
00401         cur->key = key;
00402         cur->len = len;
00403         cur->val = val;
00404 
00405         /* Added by ARCHAN at 20050515. This allows deletion could work. */
00406         cur->next = NULL;
00407 
00408     }
00409     else {
00410         /* Key collision; create new entry and link to hashed location */
00411         new = (hash_entry_t *) ckd_calloc(1, sizeof(hash_entry_t));
00412         new->key = key;
00413         new->len = len;
00414         new->val = val;
00415         new->next = cur->next;
00416         cur->next = new;
00417     }
00418     ++h->inuse;
00419 
00420     return val;
00421 }
00422 
00423 /* 20050523 Added by ARCHAN  to delete a key from a hash table */
00424 static void *
00425 delete(hash_table_t * h, uint32 hash, const char *key, size_t len)
00426 {
00427     hash_entry_t *entry, *prev;
00428     void *val;
00429 
00430     prev = NULL;
00431     entry = &(h->table[hash]);
00432     if (entry->key == NULL)
00433         return NULL;
00434 
00435     if (h->nocase) {
00436         while (entry && ((entry->len != len)
00437                          || (keycmp_nocase(entry, key) != 0))) {
00438             prev = entry;
00439             entry = entry->next;
00440         }
00441     }
00442     else {
00443         while (entry && ((entry->len != len)
00444                          || (keycmp_case(entry, key) != 0))) {
00445             prev = entry;
00446             entry = entry->next;
00447         }
00448     }
00449 
00450     if (entry == NULL)
00451         return NULL;
00452 
00453     /* At this point, entry will be the one required to be deleted, prev
00454        will contain the previous entry
00455      */
00456     val = entry->val;
00457 
00458     if (prev == NULL) {
00459         /* That is to say the entry in the hash table (not the chain) matched the key. */
00460         /* We will then copy the things from the next entry to the hash table */
00461         prev = entry;
00462         if (entry->next) {      /* There is a next entry, great, copy it. */
00463             entry = entry->next;
00464             prev->key = entry->key;
00465             prev->len = entry->len;
00466             prev->val = entry->val;
00467             prev->next = entry->next;
00468             ckd_free(entry);
00469         }
00470         else {                  /* There is not a next entry, just set the key to null */
00471             prev->key = NULL;
00472             prev->len = 0;
00473             prev->next = NULL;
00474         }
00475 
00476     }
00477     else {                      /* This case is simple */
00478         prev->next = entry->next;
00479         ckd_free(entry);
00480     }
00481 
00482     /* Do wiring and free the entry */
00483 
00484     --h->inuse;
00485 
00486     return val;
00487 }
00488 
00489 void
00490 hash_table_empty(hash_table_t *h)
00491 {
00492     hash_entry_t *e, *e2;
00493     int32 i;
00494 
00495     for (i = 0; i < h->size; i++) {
00496         /* Free collision lists. */
00497         for (e = h->table[i].next; e; e = e2) {
00498             e2 = e->next;
00499             ckd_free((void *) e);
00500         }
00501         memset(&h->table[i], 0, sizeof(h->table[i]));
00502     }
00503     h->inuse = 0;
00504 }
00505 
00506 
00507 void *
00508 hash_table_enter(hash_table_t * h, const char *key, void *val)
00509 {
00510     uint32 hash;
00511     size_t len;
00512 
00513     hash = key2hash(h, key);
00514     len = strlen(key);
00515     return (enter(h, hash, key, len, val, 0));
00516 }
00517 
00518 void *
00519 hash_table_replace(hash_table_t * h, const char *key, void *val)
00520 {
00521     uint32 hash;
00522     size_t len;
00523 
00524     hash = key2hash(h, key);
00525     len = strlen(key);
00526     return (enter(h, hash, key, len, val, 1));
00527 }
00528 
00529 void *
00530 hash_table_delete(hash_table_t * h, const char *key)
00531 {
00532     uint32 hash;
00533     size_t len;
00534 
00535     hash = key2hash(h, key);
00536     len = strlen(key);
00537 
00538     return (delete(h, hash, key, len));
00539 }
00540 
00541 void *
00542 hash_table_enter_bkey(hash_table_t * h, const char *key, size_t len, void *val)
00543 {
00544     uint32 hash;
00545     char *str;
00546 
00547     str = makekey((uint8 *) key, len, NULL);
00548     hash = key2hash(h, str);
00549     ckd_free(str);
00550 
00551     return (enter(h, hash, key, len, val, 0));
00552 }
00553 
00554 void *
00555 hash_table_replace_bkey(hash_table_t * h, const char *key, size_t len, void *val)
00556 {
00557     uint32 hash;
00558     char *str;
00559 
00560     str = makekey((uint8 *) key, len, NULL);
00561     hash = key2hash(h, str);
00562     ckd_free(str);
00563 
00564     return (enter(h, hash, key, len, val, 1));
00565 }
00566 
00567 void *
00568 hash_table_delete_bkey(hash_table_t * h, const char *key, size_t len)
00569 {
00570     uint32 hash;
00571     char *str;
00572 
00573     str = makekey((uint8 *) key, len, NULL);
00574     hash = key2hash(h, str);
00575     ckd_free(str);
00576 
00577     return (delete(h, hash, key, len));
00578 }
00579 
00580 void
00581 hash_table_display(hash_table_t * h, int32 showdisplay)
00582 {
00583     hash_entry_t *e;
00584     int i, j;
00585     j = 0;
00586 
00587     E_INFOCONT("Hash with chaining representation of the hash table\n");
00588 
00589     for (i = 0; i < h->size; i++) {
00590         e = &(h->table[i]);
00591         if (e->key != NULL) {
00592             E_INFOCONT("|key:");
00593             if (showdisplay)
00594                 E_INFOCONT("%s", e->key);
00595             else
00596                 E_INFOCONT("%p", e->key);
00597 
00598             E_INFOCONT("|len:%d|val=%ld|->", e->len, (long)e->val);
00599             if (e->next == NULL) {
00600                 E_INFOCONT("NULL\n");
00601             }
00602             j++;
00603 
00604             for (e = e->next; e; e = e->next) {
00605                 E_INFOCONT("|key:");
00606                 if (showdisplay)
00607                     E_INFOCONT("%s", e->key);
00608 
00609                 E_INFOCONT("|len:%d|val=%ld|->", e->len, (long)e->val);
00610                 if (e->next == NULL) {
00611                     E_INFOCONT("NULL\n");
00612                 }
00613                 j++;
00614             }
00615         }
00616     }
00617 
00618     E_INFOCONT("The total number of keys =%d\n", j);
00619 }
00620 
00621 
00622 glist_t
00623 hash_table_tolist(hash_table_t * h, int32 * count)
00624 {
00625     glist_t g;
00626     hash_entry_t *e;
00627     int32 i, j;
00628 
00629     g = NULL;
00630 
00631     j = 0;
00632     for (i = 0; i < h->size; i++) {
00633         e = &(h->table[i]);
00634 
00635         if (e->key != NULL) {
00636             g = glist_add_ptr(g, (void *) e);
00637             j++;
00638 
00639             for (e = e->next; e; e = e->next) {
00640                 g = glist_add_ptr(g, (void *) e);
00641                 j++;
00642             }
00643         }
00644     }
00645 
00646     if (count)
00647         *count = j;
00648 
00649     return g;
00650 }
00651 
00652 hash_iter_t *
00653 hash_table_iter(hash_table_t *h)
00654 {
00655         hash_iter_t *itor;
00656 
00657         itor = ckd_calloc(1, sizeof(*itor));
00658         itor->ht = h;
00659         return hash_table_iter_next(itor);
00660 }
00661 
00662 hash_iter_t *
00663 hash_table_iter_next(hash_iter_t *itor)
00664 {
00665         /* If there is an entry, walk down its list. */
00666         if (itor->ent)
00667                 itor->ent = itor->ent->next;
00668         /* If we got to the end of the chain, or we had no entry, scan
00669          * forward in the table to find the next non-empty bucket. */
00670         if (itor->ent == NULL) {
00671                 while (itor->idx < itor->ht->size
00672                        && itor->ht->table[itor->idx].key == NULL) 
00673                         ++itor->idx;
00674                 /* If we did not find one then delete the iterator and
00675                  * return NULL. */
00676                 if (itor->idx == itor->ht->size) {
00677                         hash_table_iter_free(itor);
00678                         return NULL;
00679                 }
00680                 /* Otherwise use this next entry. */
00681                 itor->ent = itor->ht->table + itor->idx;
00682                 /* Increase idx for the next time around. */
00683                 ++itor->idx;
00684         }
00685         return itor;
00686 }
00687 
00688 void
00689 hash_table_iter_free(hash_iter_t *itor)
00690 {
00691         ckd_free(itor);
00692 }
00693 
00694 void
00695 hash_table_free(hash_table_t * h)
00696 {
00697     hash_entry_t *e, *e2;
00698     int32 i;
00699 
00700     if (h == NULL)
00701         return;
00702 
00703     /* Free additional entries created for key collision cases */
00704     for (i = 0; i < h->size; i++) {
00705         for (e = h->table[i].next; e; e = e2) {
00706             e2 = e->next;
00707             ckd_free((void *) e);
00708         }
00709     }
00710 
00711     ckd_free((void *) h->table);
00712     ckd_free((void *) h);
00713 }