问题描述:
1.在计算机科学中,hash table 是一种常用的数据结构,它本质上是一种关联容器,它映射一个key 到value。它通过一个hash function把key映射成一个整数值,这个整数值对应存放value值的容器的下标。
2.它主要支持三种操作:插入key,value对(insert),查询(search)给定key, 删除key, value对(delete);
3.它三种操作的平均时间复杂度为O(1),最糟糕情况下的时间复杂度为O(n);
4.hash table要处理核心问题是选择好的hash function,这能使得key映射出的index足够分散均匀,尽量减少碰撞(不同的key映射到同一个index),为了消除碰撞产生,一般常用几种方法:Separate chaining, Linear probing,Quadratic probing, Rehash, double Hashing,详细细节请参看*;
5.本文给出利用Rehash的策略来消除碰撞;
6.本文设计测试用例,比较了hashtable 和STL map 的操作性能。发现执行相同的操作,hashtable 所消耗时间为STL map 九分之一;
程序代码:
#ifndef _HASH_TABLE_H_ #define _HASH_TABLE_H_ #include <stdlib.h> #include <stdio.h> #include <map> #include <assert.h> #include <string> #include "windows.h" /* * Compare function * */ static bool StrCompare( const char* first, const char* second ) { size_t firstLen = strlen(first); size_t secondLen = strlen(second); return firstLen == secondLen && !strcmp( first, second ); } /* * Hash function * */ static unsigned int ImplHashFunc( const char* buf, int len ) { unsigned int hash = 5381; while(len--) { hash = ((hash << 5) + hash) + (*buf++); /* hash * 33 + c */ } return hash; } /* * Hash function * */ static unsigned int ImplCaseHashFunc( const unsigned char* buf, int len ) { unsigned int hash = 5381; while(len--) { hash = ((hash << 5) + hash) + tolower(*buf++); /* hash * 33 + c */ } return hash; } /* * Hash function * */ static unsigned int ImplHashFuncSimple( unsigned int key ) { key += ~(key << 15); key ^= (key >> 10); key += (key << 3); key ^= (key >> 6); key += ~(key << 11); key ^= (key >> 16); return key; } /* * encapsulate the hash table * advantage: * good performance; * terse interface to make more easy for outstanding and to employ * */ template<class T> class HashTable { public: typedef unsigned int (*HashFunctor)( const char* key, int len); typedef bool (*KeyCompare)( const char* keyFirst, const char* keySecond ); static const int INIT_TABLE_SIZE = 689981; typedef struct tagEntryNode { char* key; T value; tagEntryNode* next; tagEntryNode():key(0), value(), next(0) { } tagEntryNode( const char* _key, const T& val ): value(val), next(0) { size_t len = strlen(_key) + 1; key = new char[len]; strncpy( key, _key, len - 1); key[len - 1] = ‘\0‘; } ~tagEntryNode() { delete [] key; key = 0; } }EntryNode, *pEntryNode; typedef struct tagHashNode { EntryNode** table; size_t used; size_t size; size_t sizeMask; tagHashNode():table(0), used(0), size(0),sizeMask(0) { } tagHashNode( size_t _size ):table(0), used(0), size(_size), sizeMask(0) { Init( _size ); } ~tagHashNode() { } void Init( size_t _size ) { size = _size; sizeMask = size - 1; table = new EntryNode*[size]; memset( table, 0x00, sizeof(EntryNode*)*size ); } }HashNode, *pHashNode; /* * * */ HashTable( HashFunctor functor = ImplHashFunc, KeyCompare cmpFunctor = StrCompare ):m_hashFunctor(functor), m_keyCmpFunctor(cmpFunctor), m_hashTable(new HashNode), m_resizeRatio(2) { Init( INIT_TABLE_SIZE ); } /* * * */ ~HashTable() { Clear(); } /* * Clear all node and entity * */ void Clear() { Clear( m_hashTable ); } /* * Inset the pair of key and value * */ bool Insert( const char* key, const T& value ) { Rehash(); return Insert( m_hashTable, key, value ); } /* * Retrieve the pointer of value for given key * */ T* Find( const char* key ) { unsigned int hash = m_hashFunctor(key, strlen(key)); unsigned int idx = hash % m_hashTable->size; EntryNode* entry = m_hashTable->table[idx]; while( entry ) { if( m_keyCmpFunctor( entry->key, key) ) { return &entry->value; } entry = entry->next; } return NULL; } /* * Delete hashEntry for given key * */ void Delete( const char* key ) { unsigned int hash = m_hashFunctor(key, strlen(key)); unsigned int idx = hash % m_hashTable->size; EntryNode* entry = m_hashTable->table[idx]; EntryNode* preEntry = 0; while( entry ) { if( m_keyCmpFunctor( entry->key, key ) ) { if( preEntry ) { preEntry->next = entry->next; } else { m_hashTable->table[idx] = entry->next; } delete entry; entry = 0; m_hashTable->used--; return; } else { preEntry = entry; entry = entry->next; } } } protected: /* * Fink the index of corresponding of key value in the table * */ int FindKeyIndex( pHashNode hashNode, const char* key ) { unsigned int hash = m_hashFunctor( key, strlen( key ) ); unsigned int idx = hash % hashNode->size; EntryNode* entry = hashNode->table[idx]; if( 0 == entry ) hashNode->used++; while( entry ) { if( m_keyCmpFunctor( entry->key, key) ) { return -1; } entry = entry->next; } return idx; } /* * Implement insert operation * */ bool Insert( pHashNode hashNode, const char* key, const T& value ) { int idx = FindKeyIndex( hashNode, key ); if( idx != -1 ) { EntryNode* newNode = new EntryNode( key, value ); newNode->next = hashNode->table[idx]; hashNode->table[idx] = newNode; return true; } return false; } /* * Rehash double store memory to make more root then remake insert operation * very important */ void Rehash() { if( m_hashTable->used >= m_hashTable->size || (m_hashTable->used > 0 && (m_hashTable->size / m_hashTable->used) < m_resizeRatio ) ) { size_t newSize = NextPrime( m_hashTable->size * 2 ); pHashNode newHashNode = new HashNode( newSize ); for( size_t i = 0; i < m_hashTable->size; i++ ) { pEntryNode entryNode = m_hashTable->table[i]; while( entryNode ) { Insert( newHashNode, entryNode->key, entryNode->value ); entryNode = entryNode->next; } } Clear( m_hashTable ); m_hashTable = newHashNode; } } /* * Implement clear operation * */ void Clear( pHashNode hashNode ) { for( size_t i = 0; i < m_hashTable->size; i++ ) { pEntryNode entryNode = m_hashTable->table[i]; while( entryNode ) { pEntryNode next = entryNode->next; delete entryNode; entryNode = next; } } delete [] m_hashTable->table; } /* * Initialization * */ void Init( size_t tableSize ) { m_hashTable->Init( tableSize ); } /* * Helper function * check prime */ bool IsPrime( size_t x) { for( std::size_t i = 3; true; i += 2 ) { std::size_t q = x / i; if( q < i ) return true; if( x == q * i ) return false; } return true; } /* * Find next prime for given x * */ size_t NextPrime( size_t x ) { if( x <= 2 ) return 2; if(!(x & 1)) ++x; for(; !IsPrime(x); x += 2 ); return x; } protected: HashFunctor m_hashFunctor; KeyCompare m_keyCmpFunctor; pHashNode m_hashTable; size_t m_resizeRatio; }; /* * Test hash table * */ void TestHashTable() { unsigned long start = GetTickCount(); HashTable<int> hashTable; const int Len = 500000; for( int i = 0; i < Len; i++ ) { char key[16] = {0}; sprintf(key, "%s_%d", "china", i ); assert(hashTable.Insert( key, i )); } for( int i = 0; i < Len; i++ ) { char key[16] = {0}; sprintf(key, "%s_%d", "china", i ); if( i > 0 && !(i % 50) ) { hashTable.Delete( key ); assert( !hashTable.Find( key ) ); } else { assert(i == *hashTable.Find( key)); } } unsigned long interval = GetTickCount() - start; printf(" hash table consume time is %d \n", interval ); } /* * Test STL map * */ void TestHTSTLMap() { unsigned long start = GetTickCount(); std::map<std::string, int > strMap; const int Len = 500000; for( int i = 0; i < Len; i++ ) { char key[16] = {0}; sprintf(key, "%s_%d", "china", i ); std::string keyStr(key); strMap.insert( std::make_pair(keyStr, i )) ; } std::map<std::string, int >::iterator iter = strMap.begin(); for( int i = 0; i < Len; i++ ) { char key[16] = {0}; sprintf(key, "%s_%d", "china", i ); std::string keyStr(key); if( i > 0 && !(i % 50) ) { strMap.erase( key ); assert( strMap.find( key ) == strMap.end() ); } else { iter = strMap.find( keyStr ); assert( iter->second == i ); } } unsigned long interval = GetTickCount() - start; printf(" STL map consume time is %d \n", interval ); } /* * Test suite and compare performance * */ void TestSuiteHashTable() { TestHashTable(); TestHTSTLMap(); } #endif
compile and run in visual studio 2005