1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
|
import mmh3 from bitarray import bitarray
""" 使用murmurhash算法实现简单的Bloom过滤器。 布隆过滤器用于检查集合中是否存在某个元素,并且在大数据情况下它具有良好的性能。 它的正率取决于哈希函数和元素数 """
BIT_SIZE = 5000000
class BloomFilter:
def __init__(self): bit_array = bitarray(BIT_SIZE) bit_array.setall(0)
self.bit_array = bit_array
def add(self, data): point_list = self.get_postions(data)
for b in point_list: self.bit_array[b] = 1
def contains(self, data): point_list = self.get_postions(data) result = True for b in point_list: result = result and self.bit_array[b] if not result: break
return result
def get_postions(self, data): point1 = mmh3.hash(data, 41) % BIT_SIZE point2 = mmh3.hash(data, 42) % BIT_SIZE point3 = mmh3.hash(data, 43) % BIT_SIZE point4 = mmh3.hash(data, 44) % BIT_SIZE point5 = mmh3.hash(data, 45) % BIT_SIZE point6 = mmh3.hash(data, 46) % BIT_SIZE point7 = mmh3.hash(data, 47) % BIT_SIZE
return [point1, point2, point3, point4, point5, point6, point7]
if __name__ == "__main__": bloom_filter = BloomFilter() bloom_filter.add("zhangsan") bloom_filter.add("lisi")
print(bloom_filter.contains("wangwu")) print(bloom_filter.contains("lisi"))
|