redis源码阅读-数据结构篇-hyperloglog

@

目录

5. HyperLogLog 实现 hyperloglog.c

数据结构定义

  • hllhdr

    struct hllhdr {
        char magic[4];      /* "HYLL" */
        uint8_t encoding;   /* HLL_DENSE or HLL_SPARSE. */
        uint8_t notused[3]; /* Reserved for future use, must be zero. */
        uint8_t card[8];    /* Cached cardinality, little endian. */
        uint8_t registers[]; /* Data bytes. */
    };
    
  • 一些宏定义

    /* The cached cardinality MSB is used to signal validity of the cached value. */
    #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[0] |= (1<<7)
    #define HLL_VALID_CACHE(hdr) (((hdr)->card[0] & (1<<7)) == 0)
    
    #define HLL_P 14 /* The greater is P, the smaller the error. */
    #define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
    #define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
    #define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
    #define HLL_REGISTER_MAX ((1<<HLL_BITS)-1)
    #define HLL_HDR_SIZE sizeof(struct hllhdr)
    #define HLL_DENSE_SIZE (HLL_HDR_SIZE+((HLL_REGISTERS*HLL_BITS+7)/8))
    #define HLL_DENSE 0 /* Dense encoding. */
    #define HLL_SPARSE 1 /* Sparse encoding. */
    #define HLL_RAW 255 /* Only used internally, never exposed. */
    #define HLL_MAX_ENCODING 1
    
    static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n";
    ...
    

Helper函数(可跳过,需要时阅读)

  • 哈希函数 O(1)

    uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
        const uint64_t m = 0xc6a4a7935bd1e995;
        const int r = 47;
        uint64_t h = seed ^ (len * m);
        const uint8_t *data = (const uint8_t *)key;
        const uint8_t *end = data + (len-(len&7));
    
        while(data != end) {
            uint64_t k;
    #if (BYTE_ORDER == LITTLE_ENDIAN)
            k = *((uint64_t*)data);
    #else
            k = (uint64_t) data[0];
            k |= (uint64_t) data[1] << 8;
            k |= (uint64_t) data[2] << 16;
            k |= (uint64_t) data[3] << 24;
            k |= (uint64_t) data[4] << 32;
            k |= (uint64_t) data[5] << 40;
            k |= (uint64_t) data[6] << 48;
            k |= (uint64_t) data[7] << 56;
    #endif
            k *= m;
            k ^= k >> r;
            k *= m;
            h ^= k;
            h *= m;
            data += 8;
        }
        switch(len & 7) {
        case 7: h ^= (uint64_t)data[6] << 48;
        case 6: h ^= (uint64_t)data[5] << 40;
        case 5: h ^= (uint64_t)data[4] << 32;
        case 4: h ^= (uint64_t)data[3] << 24;
        case 3: h ^= (uint64_t)data[2] << 16;
        case 2: h ^= (uint64_t)data[1] << 8;
        case 1: h ^= (uint64_t)data[0];
                h *= m;
        };
        h ^= h >> r;
        h *= m;
        h ^= h >> r;
        return h;
    }
    

元素哈希处理 O(1)

// 返回000...1的长度
// regp 保存哈希桶index
int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
    uint64_t hash, bit, index;
    int count;
	// 获取哈希值
    hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
    index = hash & HLL_P_MASK; // (1 << 14)个桶的index
    hash |= ((uint64_t)1<<63);// 确保至少一个1
    bit = HLL_REGISTERS; // 第15位开始算起
    count = 1; // 第一个出现1的rank
    while((hash & bit) == 0) {// 直到找到1
        count++;
        bit <<= 1;
    }
    *regp = (int) index;// (1 << 14)个桶的index
    return count;// 返回000...1的长度
}

添加基数 O(1)

int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
    uint8_t oldcount, count;
    long index;
    // 元素哈希处理
    count = hllPatLen(ele,elesize,&index);
    // 000...1的长度更长则添加
    HLL_DENSE_GET_REGISTER(oldcount,registers,index);
    if (count > oldcount) {
        HLL_DENSE_SET_REGISTER(registers,index,count);
        return 1;
    } else {
        return 0;
    }
}

...

关于HyperLogLog可以详见本人之前的博客解析

上一篇:6.S081 Xv6 Lab Multithreading


下一篇:ClickHouse介绍(一)初次使用