/* * HTML Entity & Encoding normalization. * * Copyright (C) 2006-2007 Török Edvin * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, * MA 02110-1301, USA. * */ #include #include #include #include #include "cltypes.h" #include "clamav.h" #include "others.h" #include "hashtab.h" static const size_t prime_list[] = { 53ul, 97ul, 193ul, 389ul, 769ul, 1543ul, 3079ul, 6151ul, 12289ul, 24593ul, 49157ul, 98317ul, 196613ul, 393241ul, 786433ul, 1572869ul, 3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul, 100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul, 3221225473ul }; static const size_t prime_n = sizeof(prime_list)/sizeof(prime_list[0]); static unsigned char DELETED_KEY[] = ""; static size_t get_nearest_capacity(const size_t capacity) { size_t i; for(i=0 ;i < prime_n; i++) { if (prime_list[i] > capacity) return prime_list[i]; } cli_errmsg("Requested hashtable size is too big!"); return prime_list[prime_n-1]; } #ifdef PROFILE_HASHTABLE /* I know, this is ugly, most of these functions get a const s, that gets its const-ness discarded, * and then these functions modify something the compiler assumes is readonly. * Please, never use PROFILE_HASHTABLE in production code, and in releases. Use it for development only!*/ static inline void PROFILE_INIT(struct hashtable *s) { memset(&s->PROFILE_STRUCT,0,sizeof(s->PROFILE_STRUCT)); } static inline void PROFILE_CALC_HASH(struct hashtable *s) { s->PROFILE_STRUCT.calc_hash++; } static inline void PROFILE_FIND_ELEMENT(struct hashtable *s) { s->PROFILE_STRUCT.find_req++; } static inline void PROFILE_FIND_NOTFOUND(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.not_found++; s->PROFILE_STRUCT.not_found_tries += tries; } static inline void PROFILE_FIND_FOUND(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.found++; s->PROFILE_STRUCT.found_tries += tries; } static inline void PROFILE_HASH_EXHAUSTED(struct hashtable *s) { s->PROFILE_STRUCT.hash_exhausted++; } static inline void PROFILE_GROW_START(struct hashtable *s) { s->PROFILE_STRUCT.grow++; } static inline void PROFILE_GROW_FOUND(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.grow_found++; s->PROFILE_STRUCT.grow_found_tries += tries; } static inline void PROFILE_GROW_DONE(struct hashtable *s) { } static inline void PROFILE_DELETED_REUSE(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.deleted_reuse++; s->PROFILE_STRUCT.deleted_tries += tries; } static inline void PROFILE_INSERT(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.inserts++; s->PROFILE_STRUCT.insert_tries += tries; } static inline void PROFILE_DATA_UPDATE(struct hashtable *s, size_t tries) { s->PROFILE_STRUCT.update++; s->PROFILE_STRUCT.update_tries += tries; } static inline void PROFILE_HASH_DELETE(struct hashtable *s) { s->PROFILE_STRUCT.deletes++; } static inline void PROFILE_HASH_CLEAR(struct hashtable *s) { s->PROFILE_STRUCT.clear++; } static inline void PROFILE_REPORT(const struct hashtable *s) { size_t lookups, queries, insert_tries, inserts; cli_dbgmsg("--------Hashtable usage report for %p--------------\n",(const void*)s); cli_dbgmsg("hash function calculations:%ld\n",s->PROFILE_STRUCT.calc_hash); cli_dbgmsg("successfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.found, s->PROFILE_STRUCT.find_req, s->PROFILE_STRUCT.found_tries); cli_dbgmsg("unsuccessfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.not_found, s->PROFILE_STRUCT.find_req , s->PROFILE_STRUCT.not_found_tries); cli_dbgmsg("successfull finds during grow:%ld; lookups: %ld\n",s->PROFILE_STRUCT.grow_found, s->PROFILE_STRUCT.grow_found_tries); lookups = s->PROFILE_STRUCT.found_tries + s->PROFILE_STRUCT.not_found_tries + s->PROFILE_STRUCT.grow_found_tries; queries = s->PROFILE_STRUCT.find_req + s->PROFILE_STRUCT.grow_found; cli_dbgmsg("Find Lookups/total queries: %ld/%ld = %3f\n", lookups, queries, lookups*1.0/queries); insert_tries = s->PROFILE_STRUCT.insert_tries + s->PROFILE_STRUCT.update_tries + s->PROFILE_STRUCT.deleted_tries; cli_dbgmsg("new item insert tries/new items: %ld/%ld\n", s->PROFILE_STRUCT.insert_tries, s->PROFILE_STRUCT.inserts); cli_dbgmsg("update tries/updates: %ld/%ld\n", s->PROFILE_STRUCT.update_tries, s->PROFILE_STRUCT.update); cli_dbgmsg("deleted item reuse tries/deleted&reused items: %ld/%ld\n", s->PROFILE_STRUCT.deleted_tries, s->PROFILE_STRUCT.deleted_reuse); inserts = s->PROFILE_STRUCT.inserts + s->PROFILE_STRUCT.update + s->PROFILE_STRUCT.deleted_reuse; cli_dbgmsg("Insert tries/total inserts: %ld/%ld = %3f\n", insert_tries, inserts, insert_tries*1.0/inserts); cli_dbgmsg("Grows: %ld, Deletes : %ld, hashtable clears: %ld\n",s->PROFILE_STRUCT.grow,s->PROFILE_STRUCT.deletes, s->PROFILE_STRUCT.clear); cli_dbgmsg("--------Report end-------------\n"); } #else #define PROFILE_INIT(s) #define PROFILE_CALC_HASH(s) #define PROFILE_FIND_ELEMENT(s) #define PROFILE_FIND_NOTFOUND(s, tries) #define PROFILE_FIND_FOUND(s, tries) #define PROFILE_HASH_EXHAUSTED(s) #define PROFILE_GROW_START(s) #define PROFILE_GROW_FOUND(s, tries) #define PROFILE_GROW_DONE(s) #define PROFILE_DELETED_REUSE(s, tries) #define PROFILE_INSERT(s, tries) #define PROFILE_DATA_UPDATE(s, tries) #define PROFILE_HASH_DELETE(s) #define PROFILE_HASH_CLEAR(s) #define PROFILE_REPORT(s) #endif int hashtab_init(struct hashtable *s,size_t capacity) { if(!s) return CL_ENULLARG; PROFILE_INIT(s); capacity = get_nearest_capacity(capacity); s->htable = cli_calloc(capacity,sizeof(*s->htable)); if(!s->htable) return CL_EMEM; s->capacity = capacity; s->used = 0; s->maxfill = 8*capacity/10; return 0; } static size_t hash(const unsigned char* k,const size_t len,const size_t SIZE) { size_t Hash = 0; size_t i; for(i=len;i>0;i--) Hash = ((Hash << 8) + k[i-1]) % SIZE; return Hash; } /* if returned element has key==NULL, then key was not found in table */ struct element* hashtab_find(const struct hashtable *s,const unsigned char* key,const size_t len) { struct element* element; size_t tries = 1; size_t idx; if(!s) return NULL; PROFILE_CALC_HASH(s); PROFILE_FIND_ELEMENT(s); idx = hash(key, len, s->capacity); element = &s->htable[idx]; do { if(!element->key) { PROFILE_FIND_NOTFOUND(s, tries); return NULL; /* element not found, place is empty*/ } else if(element->key != DELETED_KEY && strncmp((const char*)key,(const char*)element->key,len)==0) { PROFILE_FIND_FOUND(s, tries); return element;/* found */ } else { idx = (idx + tries++) % s->capacity; element = &s->htable[idx]; } } while (tries <= s->capacity); PROFILE_HASH_EXHAUSTED(s); return NULL; /* not found */ } static int hashtab_grow(struct hashtable *s) { const size_t new_capacity = get_nearest_capacity(s->capacity); struct element* htable = cli_calloc(new_capacity, sizeof(*s->htable)); size_t i,idx, used = 0; if(new_capacity == s->capacity || !htable) return CL_EMEM; PROFILE_GROW_START(s); cli_dbgmsg("hashtab.c: Warning: growing open-addressing hashtables is slow. Either allocate more storage when initializing, or use other hashtable types!\n"); for(i=0; i < s->capacity;i++) { if(s->htable[i].key && s->htable[i].key != DELETED_KEY) { struct element* element; size_t tries = 1; PROFILE_CALC_HASH(s); idx = hash(s->htable[i].key, strlen((const char*)s->htable[i].key), new_capacity); element = &htable[idx]; while(element->key && tries <= new_capacity) { idx = (idx + tries++) % new_capacity; element = &htable[idx]; } if(!element->key) { /* copy element from old hashtable to new */ PROFILE_GROW_FOUND(s, tries); *element = s->htable[i]; used++; } else { cli_errmsg("hashtab.c: Impossible - unable to rehash table"); return CL_EMEM;/* this means we didn't find enough room for all elements in the new table, should never happen */ } } } free(s->htable); s->htable = htable; s->used = used; s->capacity = new_capacity; s->maxfill = new_capacity*8/10; cli_dbgmsg("Table %p size after grow:%ld\n",(void*)s,s->capacity); PROFILE_GROW_DONE(s); return CL_SUCCESS; } int hashtab_insert(struct hashtable *s,const unsigned char* key,const size_t len,const element_data data) { struct element* element; struct element* deleted_element = NULL; size_t tries = 1; size_t idx; if(!s) return CL_ENULLARG; do { PROFILE_CALC_HASH(s); idx = hash(key, len, s->capacity); element = &s->htable[idx]; do { if(!element->key) { unsigned char* thekey; /* element not found, place is empty, insert*/ if(deleted_element) { /* reuse deleted elements*/ element = deleted_element; PROFILE_DELETED_REUSE(s, tries); } else { PROFILE_INSERT(s, tries); } thekey = cli_malloc(len+1); if(!thekey) return CL_EMEM; strncpy((char*)thekey,(const char*)key,len+1); element->key = thekey; element->data = data; s->used++; if(s->used > s->maxfill) { cli_dbgmsg("hashtab.c:Growing hashtable %p, because it has exceeded maxfill, old size:%ld\n",(void*)s,s->capacity); hashtab_grow(s); } return 0; } else if(element->key == DELETED_KEY) { deleted_element = element; } else if(strncmp((const char*)key,(const char*)element->key,len)==0) { PROFILE_DATA_UPDATE(s, tries); element->data = data;/* key found, update */ return 0; } else { idx = (idx + tries++) % s->capacity; element = &s->htable[idx]; } } while (tries <= s->capacity); /* no free place found*/ PROFILE_HASH_EXHAUSTED(s); cli_dbgmsg("hashtab.c: Growing hashtable %p, because its full, old size:%ld.\n",(void*)s,s->capacity); } while( hashtab_grow(s) >= 0 ); cli_warnmsg("hashtab.c: Unable to grow hashtable\n"); return CL_EMEM; } void hashtab_delete(struct hashtable *s,const unsigned char* key,const size_t len) { struct element* e = hashtab_find(s,key,len); if(e && e->key) { PROFILE_HASH_DELETE(s); free(e->key);/*FIXME: any way to shut up warnings here? if I make key char*, I get tons of warnings in entitylist.h */ e->key = DELETED_KEY; s->used--; } } void hashtab_clear(struct hashtable *s) { size_t i; PROFILE_HASH_CLEAR(s); for(i=0;i < s->capacity;i++) { if(s->htable[i].key && s->htable[i].key != DELETED_KEY) free(s->htable[i].key);/*FIXME: shut up warnings */ } memset(s->htable, 0, s->capacity); s->used = 0; } int hashtab_store(const struct hashtable *s,FILE* out) { size_t i; for(i=0; i < s->capacity; i++) { const struct element* e = &s->htable[i]; if(e->key && e->key != DELETED_KEY) { fprintf(out,"%ld %s\n",e->data,e->key); } } return CL_SUCCESS; } int hashtab_generate_c(const struct hashtable *s,const char* name) { size_t i; printf("/* TODO: include GPL headers */\n"); printf("#include \n"); printf("static struct element %s_elements[] = {\n",name); for(i=0; i < s->capacity; i++) { const struct element* e = &s->htable[i]; if(!e->key) printf("\t{NULL, 0},\n"); else if(e->key == DELETED_KEY) printf("\t{DELETED_KEY,0},\n"); else printf("\t{(const unsigned char*)\"%s\", %ld},\n", e->key, e->data); } printf("};\n"); printf("const struct hashtable %s = {\n",name); printf("\t%s_elements, %ld, %ld, %ld", name, s->capacity, s->used, s->maxfill); printf("\n};\n"); PROFILE_REPORT(s); return 0; } int hashtab_load(FILE* in, struct hashtable *s) { char line[1024]; while (fgets(line, sizeof(line), in)) { unsigned char l[1024]; int val; sscanf(line,"%d %1023s",&val,l); hashtab_insert(s,l,strlen((const char*)l),val); } return CL_SUCCESS; }