布隆过滤器
定义
-
布隆过滤器是一种概率型数据结构,它的特点是高效的插入和查询,能明确告知某个字符串一定不存在或者可能存在。
-
布隆过滤器相比传统的查询结构(例如:hash,set,map等数据结构)更加高效,占用空间更小;但是其缺点是它返回的结果是概率性的,也就是说结果存在误差的,虽然这个误差是可控的;同时它不支持删除操作;
-
组成:位图(bit数组)+ n个hash函数
原理
-
当一个元素加入位图时,通过k个hash函数将这个元素映射到位图的k个点,并把它们置为
1;当检索时,再通过k个hash函数运算检测位图的k个点是否都为1;如果有不为1的点,那么认为
不存在;如果全部为1,则可能存在(存在误差);
- 在位图中每个槽位只有两种状态(0或者1),一个槽位被设置为1状态,但不明确它被设置了多少
次;也就是不知道被多少个str1哈希映射以及是被哪个hash函数映射过来的;所以不支持删除操作;
- 在实际应用过程中,布隆过滤器该如何使用?要选择多少个hash函数,要分配多少空间的位图,存
储多少元素?另外如何控制假阳率(布隆过滤器能明确一定不存在,不能明确一定存在,那么存在
的判断是有误差的,假阳率就是错误判断存在的概率)?
n -- 布隆过滤器中元素的个数,如上图 只有str1和str2 两个元素 那么 n=2
p -- 假阳率,在0-1之间 0.000000
m -- 位图所占空间
k -- hash函数的个数
公式如下:
n = ceil(m / (-k / log(1 - exp(log(p) / k))))
p = pow(1 - exp(-k / (m / n)), k)
m = ceil((n * log(p)) / log(1 / pow(2, log(2))));
k = round((m / n) * log(2));
-
假定我们选取这四个值为:
n = 4000
p = 0.000000001
m = 172532
k = 30
-
四个值的关系:
-
在实际应用中,我们确定n和p,通过上面的计算算出m和k;也可以在网站上选取合适的值:
https://hur.st/bloomfilter
-
已知k,如何选择k个hash函数?
// 采⽤⼀个hash函数,给hash传不同的种⼦偏移值 // #define MIX_UINT64(v) ((uint32_t)((v>>32)^(v))) uint64_t hash1 = MurmurHash2_x64(key, len, Seed); uint64_t hash2 = MurmurHash2_x64(key, len, MIX_UINT64(hash1)); for (i = 0; i < k; i++) // k 是hash函数的个数 { Pos[i] = (hash1 + i*hash2) % m; // m 是位图的⼤⼩ } // 通过这种⽅式来模拟 k 个hash函数 跟我们前⾯开放寻址法 双重hash是⼀样的思路
测试用例
#include <stdio.h>
#include <stdint.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
//字节大小
#define BYTE_BITS (8)
#define MIX_UINT64(v) ((uint32_t)((v>>32)^(v)))
//设置位图
#define SETBIT(filter, n) (filter->pstFilter[n/BYTE_BITS] |= (1 << (n%BYTE_BITS)))
//获取位图
#define GETBIT(filter, n) (filter->pstFilter[n/BYTE_BITS] & (1 << (n%BYTE_BITS)))
typedef struct {
uint32_t dwMaxItems; // n - BloomFilter中最大元素个数 (输入量)
double dProbFalse; // p - 假阳概率 (输入量,比如万分之一:0.00001)
uint32_t dwFilterBits; // m = ceil((n * log(p)) / log(1.0 / (pow(2.0, log(2.0))))); - BloomFilter的比特数,位图所占空间
uint32_t dwHashFuncs; // k = round(log(2.0) * m / n); - 哈希函数个数
uint32_t dwSeed; // MurmurHash的种子偏移量
uint32_t dwFilterSize; // dwFilterBits / BYTE_BITS
unsigned char *pstFilter; // BloomFilter存储指针,使用malloc分配
uint32_t *pdwHashPos; // 存储上次hash得到的K个bit位置数组(由bloom_hash填充)
}BaseBloomFilter;
void _CalcBloomFilterParam(uint32_t n,double p, uint32_t *pm,uint32_t *pk){
/*
n -- 布隆过滤器中元素的个数,如上图 只有str1和str2 两个元素 那么 n=2
p -- 假阳率,在0-1之间 0.000000
m -- 位图所占空间
k -- hash函数的个数
公式如下:
n = ceil(m / (-k / log(1 - exp(log(p) / k))))
p = pow(1 - exp(-k / (m / n)), k)
m = ceil((n * log(p)) / log(1 / pow(2, log(2))));
k = round((m / n) * log(2));
*/
uint32_t m, k;
m = (uint32_t) ceil(-1.0 * n * log(p) / 0.480453 );//log以e为底的对数函数
m = (m - m%64 ) + 64;//8字节对齐
k = (uint32_t) round(0.69314 * m / n);
*pm = m;
*pk = k;
}
int InitBloomFilter(BaseBloomFilter* pstBloomfilter, \
uint32_t dwSeed, uint32_t dwMaxItems, double dProbFalse){
if(pstBloomfilter == NULL)
return -1;
if(dProbFalse <=0 || dProbFalse >= 1){
return -2;
}
if(pstBloomfilter->pstFilter != NULL){
free(pstBloomfilter->pstFilter);
}
if(pstBloomfilter->pdwHashPos != NULL){
free(pstBloomfilter->pdwHashPos);
}
pstBloomfilter->dwMaxItems = dwMaxItems;
pstBloomfilter->dProbFalse = dProbFalse;
pstBloomfilter->dwSeed = dwSeed;
_CalcBloomFilterParam(pstBloomfilter->dwMaxItems,pstBloomfilter->dProbFalse,\
&pstBloomfilter->dwFilterBits,&pstBloomfilter->dwHashFuncs);
pstBloomfilter->dwFilterSize = pstBloomfilter->dwFilterBits / BYTE_BITS;
pstBloomfilter->pstFilter = (unsigned char *) malloc(pstBloomfilter->dwFilterSize);
if(NULL == pstBloomfilter->pstFilter){
return -100;
}
pstBloomfilter->pdwHashPos = (uint32_t *) malloc(sizeof(uint32_t) * pstBloomfilter->dwHashFuncs);
if(NULL == pstBloomfilter->pdwHashPos){
return -200;
}
printf(">>> Init BloomFilter(n=%u, p=%e, m=%u, k=%d), malloc() size=%.2fMB, items:bits=1:%0.1lf\n",
pstBloomfilter->dwMaxItems, pstBloomfilter->dProbFalse, pstBloomfilter->dwFilterBits,
pstBloomfilter->dwHashFuncs, (double)pstBloomfilter->dwFilterSize/1024/1024,
pstBloomfilter->dwFilterBits*1.0/pstBloomfilter->dwMaxItems);
memset(pstBloomfilter->pstFilter,0,pstBloomfilter->dwFilterSize);
return 0;
}
int FreeBloomFilter(BaseBloomFilter *pstBloomfilter){
if(pstBloomfilter == NULL){
return -1;
}
free(pstBloomfilter->pstFilter);
pstBloomfilter->pstFilter = NULL;
free(pstBloomfilter->pdwHashPos);
pstBloomfilter->pdwHashPos = NULL;
return 0;
}
// MurmurHash2, 64-bit versions, by Austin Appleby
// https://sites.google.com/site/murmurhash/
uint64_t MurmurHash2_x64 ( const void * key, int len, uint32_t seed )
{
const uint64_t m = 0xc6a4a7935bd1e995;
const int r = 47;
uint64_t h = seed ^ (len * m);
const uint64_t * data = (const uint64_t *)key;
const uint64_t * end = data + (len/8);
while(data != end)
{
uint64_t k = *data++;
k *= m;
k ^= k >> r;
k *= m;
h ^= k;
h *= m;
}
const uint8_t * data2 = (const uint8_t*)data;
switch(len & 7)
{
case 7: h ^= ((uint64_t)data2[6]) << 48;
case 6: h ^= ((uint64_t)data2[5]) << 40;
case 5: h ^= ((uint64_t)data2[4]) << 32;
case 4: h ^= ((uint64_t)data2[3]) << 24;
case 3: h ^= ((uint64_t)data2[2]) << 16;
case 2: h ^= ((uint64_t)data2[1]) << 8;
case 1: h ^= ((uint64_t)data2[0]);
h *= m;
};
h ^= h >> r;
h *= m;
h ^= h >> r;
return h;
}
// 双重散列封装
void bloom_hash(BaseBloomFilter *pstBloomfilter, const void * key, int len)
{
//if (pstBloomfilter == NULL) return;
int i;
uint32_t dwFilterBits = pstBloomfilter->dwFilterBits;
uint64_t hash1 = MurmurHash2_x64(key, len, pstBloomfilter->dwSeed);
uint64_t hash2 = MurmurHash2_x64(key, len, MIX_UINT64(hash1));
for (i = 0; i < (int)pstBloomfilter->dwHashFuncs; i++)
{
pstBloomfilter->pdwHashPos[i] = (hash1 + i*hash2) % dwFilterBits;
}
return;
}
int BloomFilter_Add(BaseBloomFilter *pstBloomfilter, const void* key,int len){
if(pstBloomfilter == NULL || key == NULL || len<=0){
return -1;
}
bloom_hash(pstBloomfilter,key,len);
int i;
for(i=0;i<(int)pstBloomfilter->dwHashFuncs;i++){
SETBIT(pstBloomfilter,pstBloomfilter->pdwHashPos[i]);
}
return 0;
}
int BloomFilter_Check(BaseBloomFilter *pstBloomfilter, const void* key, int len){
if(pstBloomfilter == NULL || key == NULL || len<=0){
return -1;
}
bloom_hash(pstBloomfilter,key,len);
int i;
for(i=0;i<(int)pstBloomfilter->dwHashFuncs;i++){
if(GETBIT(pstBloomfilter,pstBloomfilter->pdwHashPos[i]) == 0){
return 1;//没找到
}
}
return 0;
}
int main(){
int MAX_ITEMS = 400000;
int ADD_ITEMS = 1000;
double P_ERROR = 0.0000001;
BaseBloomFilter stBloomfilter = {0};
InitBloomFilter(&stBloomfilter,0,MAX_ITEMS,P_ERROR);
char url[128] = {0};
int i;
for(i=0;i<ADD_ITEMS;i++){
sprintf(url,"https://0voice.com/%d.html", i);
BloomFilter_Add(&stBloomfilter,url,strlen(url));
}
printf("0 exist, 1 not exist\n");
sprintf(url,"https://0voice.com/%d.html",0);
printf("%s exist %d\n",url,BloomFilter_Check(&stBloomfilter,url,strlen(url)));
sprintf(url,"https://0voice.com/%d.html",ADD_ITEMS);
printf("%s exist %d\n",url,BloomFilter_Check(&stBloomfilter,url,strlen(url)));
return 0;
}
- 感谢
https://blog.csdn.net/a844651990/article/details/112711519