from array import array
from hashlib import sha512
class BloomFilter:
'http://en.wikipedia.org/wiki/Bloom_filter'
def __init__(self, size, k):
self.arr = array('L', [0] * ((size + 31) // 32))
self.k = k
assert (size ** k).bit_length() <= 512 # the sha hash has enough bits
def get_probes(self, key):
h = int.from_bytes(sha512(key.encode()).digest(), 'big')
for i in range(self.k):
h, pos = divmod(h, len(self.arr)*32)
byte, bitpos = divmod(pos, 32)
yield byte, 1 << bitpos
def add(self, key):
for byte, bit in self.get_probes(key):
self.arr[byte] |= bit
def __contains__(self, key):
return all(self.arr[byte] & bit for byte, bit in self.get_probes(key))
if __name__ == '__main__':
from random import sample
from string import ascii_letters
states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
bits, k = 1000, 14
bf = BloomFilter(bits, k)
for state in states:
bf.add(state)
assert all(state in bf for state in states) # Verify True Positives
n = 100000
m = sum(''.join(sample(ascii_letters, 5)) in bf for i in range(n))
print(n-m, '::', m) # Ratio of True Negatives to False Positives