libclamav/hashtab.c
3506ac49
 /*
  *  HTML Entity & Encoding normalization.
  *
43ecd9a1
3506ac49
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation; either version 2 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program; if not, write to the Free Software
  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  *  MA 02110-1301, USA.
  *
  */
 #include <clamav-config.h>
 
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 
98b42c3c
 #include "cltypes.h"
3506ac49
 #include "clamav.h"
 #include "others.h"
 #include "hashtab.h"
 
43ecd9a1
 
98b42c3c
 static const size_t prime_list[] =
3506ac49
 {
      53ul,         97ul,         193ul,       389ul,       769ul,
      1543ul,       3079ul,       6151ul,      12289ul,     24593ul,
      49157ul,      98317ul,      196613ul,    393241ul,    786433ul,
      1572869ul,    3145739ul,    6291469ul,   12582917ul,  25165843ul,
      50331653ul,   100663319ul,  201326611ul, 402653189ul, 805306457ul,
      1610612741ul, 3221225473ul
 };
 
 
 static const size_t prime_n = sizeof(prime_list)/sizeof(prime_list[0]);
 
 static unsigned char DELETED_KEY[] = "";
 
 static size_t get_nearest_capacity(const size_t capacity)
 {
 	size_t i;
 	for(i=0 ;i < prime_n; i++) {
 		if (prime_list[i] > capacity)
 			return prime_list[i];
 	}
 	cli_errmsg("Requested hashtable size is too big!");
 	return prime_list[prime_n-1];
 }
 
 #ifdef PROFILE_HASHTABLE
 /* I know, this is ugly, most of these functions get a const s, that gets its const-ness discarded,
  * and then these functions modify something the compiler assumes is readonly.
  * Please, never use PROFILE_HASHTABLE in production code, and in releases. Use it for development only!*/
 
 static inline void PROFILE_INIT(struct hashtable *s)
 {
 	memset(&s->PROFILE_STRUCT,0,sizeof(s->PROFILE_STRUCT));
 }
 
 static inline void PROFILE_CALC_HASH(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.calc_hash++;
 }
 
 static inline void PROFILE_FIND_ELEMENT(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.find_req++;
 }
 
 static inline void PROFILE_FIND_NOTFOUND(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.not_found++;
43ecd9a1
 	s->PROFILE_STRUCT.not_found_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_FIND_FOUND(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.found++;
43ecd9a1
 	s->PROFILE_STRUCT.found_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_HASH_EXHAUSTED(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.hash_exhausted++;
 }
 
 static inline void PROFILE_GROW_START(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.grow++;
 }
 
 static inline void PROFILE_GROW_FOUND(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.grow_found++;
43ecd9a1
 	s->PROFILE_STRUCT.grow_found_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_GROW_DONE(struct hashtable *s)
 {
 }
 
 static inline void PROFILE_DELETED_REUSE(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.deleted_reuse++;
43ecd9a1
 	s->PROFILE_STRUCT.deleted_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_INSERT(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.inserts++;
43ecd9a1
 	s->PROFILE_STRUCT.insert_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_DATA_UPDATE(struct hashtable *s, size_t tries)
 {
 	s->PROFILE_STRUCT.update++;
43ecd9a1
 	s->PROFILE_STRUCT.update_tries += tries;
3506ac49
 }
 
 static inline void PROFILE_HASH_DELETE(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.deletes++;
 }
 
 static inline void PROFILE_HASH_CLEAR(struct hashtable *s)
 {
 	s->PROFILE_STRUCT.clear++;
 }
 
 static inline void PROFILE_REPORT(const struct hashtable *s)
 {
 	size_t lookups, queries, insert_tries, inserts;
43ecd9a1
 	cli_dbgmsg("--------Hashtable usage report for %p--------------\n",(const void*)s);
3506ac49
 	cli_dbgmsg("hash function calculations:%ld\n",s->PROFILE_STRUCT.calc_hash);
 	cli_dbgmsg("successfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.found, s->PROFILE_STRUCT.find_req, s->PROFILE_STRUCT.found_tries);
 	cli_dbgmsg("unsuccessfull finds/total searches: %ld/%ld; lookups: %ld\n", s->PROFILE_STRUCT.not_found, s->PROFILE_STRUCT.find_req , s->PROFILE_STRUCT.not_found_tries);
 	cli_dbgmsg("successfull finds during grow:%ld; lookups: %ld\n",s->PROFILE_STRUCT.grow_found, s->PROFILE_STRUCT.grow_found_tries);
 	lookups = s->PROFILE_STRUCT.found_tries + s->PROFILE_STRUCT.not_found_tries + s->PROFILE_STRUCT.grow_found_tries;
 	queries = s->PROFILE_STRUCT.find_req + s->PROFILE_STRUCT.grow_found;
 	cli_dbgmsg("Find Lookups/total queries: %ld/%ld = %3f\n", lookups, queries, lookups*1.0/queries);
 	insert_tries = s->PROFILE_STRUCT.insert_tries + s->PROFILE_STRUCT.update_tries + s->PROFILE_STRUCT.deleted_tries;
 
 	cli_dbgmsg("new item insert tries/new items: %ld/%ld\n", s->PROFILE_STRUCT.insert_tries, s->PROFILE_STRUCT.inserts);
 	cli_dbgmsg("update tries/updates: %ld/%ld\n", s->PROFILE_STRUCT.update_tries, s->PROFILE_STRUCT.update);
 	cli_dbgmsg("deleted item reuse tries/deleted&reused items: %ld/%ld\n", s->PROFILE_STRUCT.deleted_tries, s->PROFILE_STRUCT.deleted_reuse);
 	inserts = s->PROFILE_STRUCT.inserts + s->PROFILE_STRUCT.update + s->PROFILE_STRUCT.deleted_reuse;
 	cli_dbgmsg("Insert tries/total inserts: %ld/%ld = %3f\n", insert_tries, inserts, insert_tries*1.0/inserts);
 
 	cli_dbgmsg("Grows: %ld, Deletes : %ld, hashtable clears: %ld\n",s->PROFILE_STRUCT.grow,s->PROFILE_STRUCT.deletes, s->PROFILE_STRUCT.clear);
         cli_dbgmsg("--------Report end-------------\n");	
 }
 
 #else
 #define PROFILE_INIT(s) 
 #define PROFILE_CALC_HASH(s) 
 #define PROFILE_FIND_ELEMENT(s) 
 #define PROFILE_FIND_NOTFOUND(s, tries) 
 #define PROFILE_FIND_FOUND(s, tries)
 #define PROFILE_HASH_EXHAUSTED(s)
 #define PROFILE_GROW_START(s)
 #define PROFILE_GROW_FOUND(s, tries)
 #define PROFILE_GROW_DONE(s)
 #define PROFILE_DELETED_REUSE(s, tries)
 #define PROFILE_INSERT(s, tries)
 #define PROFILE_DATA_UPDATE(s, tries)
 #define PROFILE_HASH_DELETE(s)
 #define PROFILE_HASH_CLEAR(s)
 #define PROFILE_REPORT(s)
 #endif
 
 int hashtab_init(struct hashtable *s,size_t capacity)
 {
 	if(!s)
 		return CL_ENULLARG;
 
 	PROFILE_INIT(s);
 
 	capacity = get_nearest_capacity(capacity);
 	s->htable = cli_calloc(capacity,sizeof(*s->htable));
 	if(!s->htable)
 		return CL_EMEM;
 	s->capacity = capacity;
 	s->used = 0;
 	s->maxfill = 8*capacity/10;
 	return 0;
 }
 
43ecd9a1
 static size_t hash(const unsigned char* k,const size_t len,const size_t SIZE)
3506ac49
 {
 	size_t Hash = 0;	
 	size_t i;
 	for(i=len;i>0;i--)
 		Hash = ((Hash << 8) + k[i-1]) % SIZE;
 	return Hash;
 }
 
 /* if returned element has key==NULL, then key was not found in table */
 struct element* hashtab_find(const struct hashtable *s,const unsigned char* key,const size_t len)
 {
 	struct element* element;
43ecd9a1
 	size_t tries = 1; 
3506ac49
 	size_t idx;
 
 	if(!s)
 		return NULL; 
 	PROFILE_CALC_HASH(s);
 	PROFILE_FIND_ELEMENT(s);
 	idx = hash(key, len, s->capacity); 
 	element = &s->htable[idx];
 	do {
 		if(!element->key) {
 			PROFILE_FIND_NOTFOUND(s, tries);
 			return NULL; /* element not found, place is empty*/
 		}
 		else if(element->key != DELETED_KEY && strncmp((const char*)key,(const char*)element->key,len)==0) {
 			PROFILE_FIND_FOUND(s, tries);
 			return element;/* found */
 		}
 		else {
 			idx = (idx + tries++) % s->capacity;
 			element = &s->htable[idx];
 		}
 	} while (tries <= s->capacity);
 	PROFILE_HASH_EXHAUSTED(s);
 	return NULL; /* not found */
 }
 
 static int hashtab_grow(struct hashtable *s)
 {
 	const size_t new_capacity = get_nearest_capacity(s->capacity);
 	struct element* htable = cli_calloc(new_capacity, sizeof(*s->htable));
 	size_t i,idx, used = 0;
 	if(new_capacity == s->capacity || !htable)
 		return CL_EMEM;
 
 	PROFILE_GROW_START(s);
 	cli_dbgmsg("hashtab.c: Warning: growing open-addressing hashtables is slow. Either allocate more storage when initializing, or use other hashtable types!\n");
 	for(i=0; i < s->capacity;i++) {
 		if(s->htable[i].key && s->htable[i].key != DELETED_KEY) {
 			struct element* element;
43ecd9a1
 			size_t tries = 1;				
3506ac49
 
 			PROFILE_CALC_HASH(s);
3c882206
 			idx = hash(s->htable[i].key, strlen((const char*)s->htable[i].key), new_capacity);
3506ac49
 			element = &htable[idx];
 
 			while(element->key && tries <= new_capacity) {
 				idx = (idx + tries++) % new_capacity;
 				element = &htable[idx];
 			}
 			if(!element->key) {
 				/* copy element from old hashtable to new */
 				PROFILE_GROW_FOUND(s, tries);
 				*element = s->htable[i];
 				used++;
 			}
 			else {
 				cli_errmsg("hashtab.c: Impossible - unable to rehash table");
 				return CL_EMEM;/* this means we didn't find enough room for all elements in the new table, should never happen */ 
 			}
 		}
 	}
 	free(s->htable);
 	s->htable = htable;
 	s->used = used;
 	s->capacity = new_capacity;
 	s->maxfill = new_capacity*8/10;
43ecd9a1
 	cli_dbgmsg("Table %p size after grow:%ld\n",(void*)s,s->capacity);
3506ac49
 	PROFILE_GROW_DONE(s);
 	return CL_SUCCESS;
 }
 
 
 int hashtab_insert(struct hashtable *s,const unsigned char* key,const size_t len,const element_data data)
 {
 	struct element* element;
 	struct element* deleted_element = NULL;
43ecd9a1
 	size_t tries = 1; 
3506ac49
 	size_t idx;
 	if(!s)
 		return CL_ENULLARG; 
 	do {
 		PROFILE_CALC_HASH(s);
 		idx = hash(key, len, s->capacity); 
 		element = &s->htable[idx];
 
 		do {
 			if(!element->key) {
 				unsigned char* thekey;
 				/* element not found, place is empty, insert*/
 				if(deleted_element) {
 					/* reuse deleted elements*/
 					element = deleted_element;
 					PROFILE_DELETED_REUSE(s, tries);
 				}
 				else {
 					PROFILE_INSERT(s, tries);
 				}
 				thekey = cli_malloc(len+1);
 				if(!thekey)
 					return CL_EMEM;
3c882206
 				strncpy((char*)thekey,(const char*)key,len+1);
3506ac49
 				element->key = thekey;
 				element->data = data;
 				s->used++;		
 				if(s->used > s->maxfill) {
43ecd9a1
 					cli_dbgmsg("hashtab.c:Growing hashtable %p, because it has exceeded maxfill, old size:%ld\n",(void*)s,s->capacity);
3506ac49
 					hashtab_grow(s);
 				}
 				return 0;
 			}
 			else if(element->key == DELETED_KEY) {
 				deleted_element = element;
 			}
 			else if(strncmp((const char*)key,(const char*)element->key,len)==0) {
 				PROFILE_DATA_UPDATE(s, tries);
 				element->data = data;/* key found, update */
 				return 0;		
 			}
 			else {
 				idx = (idx + tries++) % s->capacity;
 				element = &s->htable[idx];
 			}
 		} while (tries <= s->capacity);
 		/* no free place found*/
 		PROFILE_HASH_EXHAUSTED(s);
43ecd9a1
 		cli_dbgmsg("hashtab.c: Growing hashtable %p, because its full, old size:%ld.\n",(void*)s,s->capacity);
3506ac49
 	} while( hashtab_grow(s) >= 0 );
 	cli_warnmsg("hashtab.c: Unable to grow hashtable\n");
 	return CL_EMEM;
 }
 
 void hashtab_delete(struct hashtable *s,const unsigned char* key,const size_t len)
 {
 	struct element* e = hashtab_find(s,key,len);
 	if(e && e->key) {	
 		PROFILE_HASH_DELETE(s);
 		free(e->key);/*FIXME: any way to shut up warnings here? if I make key char*, I get tons of warnings in entitylist.h */
 		e->key = DELETED_KEY;
 		s->used--;
 	}
 }
 
 void hashtab_clear(struct hashtable *s)
 {
 	size_t i;
 	PROFILE_HASH_CLEAR(s);
 	for(i=0;i < s->capacity;i++) {
 		if(s->htable[i].key && s->htable[i].key != DELETED_KEY)
 			free(s->htable[i].key);/*FIXME: shut up warnings */
 	}
 	memset(s->htable, 0, s->capacity);
 	s->used = 0;
 }
 
 
 int hashtab_store(const struct hashtable *s,FILE* out)
 {
 	size_t i;
 	for(i=0; i < s->capacity; i++) {
 		const struct element* e = &s->htable[i];
 		if(e->key && e->key != DELETED_KEY) {
 			fprintf(out,"%ld %s\n",e->data,e->key);
 		}
 	}
 	return CL_SUCCESS;
 }
 
 int hashtab_generate_c(const struct hashtable *s,const char* name)
 {
 	size_t i;
 	printf("/* TODO: include GPL headers */\n");
 	printf("#include <hashtab.h>\n");
 	printf("static struct element %s_elements[] = {\n",name);
 	for(i=0; i < s->capacity; i++) {
 		const struct element* e = &s->htable[i];
 		if(!e->key)
 			printf("\t{NULL, 0},\n");
 		else if(e->key == DELETED_KEY)
 			printf("\t{DELETED_KEY,0},\n");
 		else
 			printf("\t{(const unsigned char*)\"%s\", %ld},\n", e->key, e->data);
 	}
 	printf("};\n");
 	printf("const struct hashtable %s = {\n",name);
 	printf("\t%s_elements, %ld, %ld, %ld", name, s->capacity, s->used, s->maxfill);
 	printf("\n};\n");
 
 	PROFILE_REPORT(s);
 	return 0;
 }
 
 int hashtab_load(FILE* in, struct hashtable *s)
 {
 	char line[1024];
 	while (fgets(line, sizeof(line), in)) {
 		unsigned char l[1024];
 		int val;
 		sscanf(line,"%d %1023s",&val,l);
 		hashtab_insert(s,l,strlen((const char*)l),val);
 	}
 	return CL_SUCCESS;
 }