Welcome, guest | Sign In | My Account | Store | Cart
from array import array
from hashlib import sha512
try:
    from pickle import dumps     # py3.x version
except ImportError:
    from cPickle import dumps    # py2.x version

class BloomFilter:
    'http://en.wikipedia.org/wiki/Bloom_filter'

    def __init__(self, num_bits, num_probes):
        num_words = (num_bits + 31) // 32
        self.arr = array('L', [0]) * num_words
        self.num_probes = num_probes

    def get_probes(self, key):
        h = int(sha512(dumps(key)).hexdigest(), 16)
        num_words = len(self.arr)
        for i in range(self.num_probes):
            h, array_index = divmod(h, num_words)
            h, bit_index = divmod(h, 32)
            yield array_index, 1 << bit_index

    def add(self, key):
        for i, mask in self.get_probes(key):
            self.arr[i] |= mask

    def __contains__(self, key):
        return all(self.arr[i] & mask for i, mask in self.get_probes(key))


if __name__ == '__main__':

    from random import sample
    from string import ascii_letters

    states = '''Alabama Alaska Arizona Arkansas California Colorado Connecticut
        Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
        Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
        Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
        NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
        Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
        Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()

    bf = BloomFilter(num_bits=1000, num_probes=14)
    for state in states:
        bf.add(state)

    assert all(state in bf for state in states)    # Verify True Positives

    n = 100000
    m = sum(''.join(sample(ascii_letters, 5)) in bf for i in range(n))
    print('%d :: %d' % (n-m, m))    # Ratio of True Negatives to False Positives

Diff to Previous Revision

--- revision 1 2011-05-04 09:53:01
+++ revision 2 2011-05-04 17:53:16
@@ -1,27 +1,32 @@
 from array import array
 from hashlib import sha512
+try:
+    from pickle import dumps     # py3.x version
+except ImportError:
+    from cPickle import dumps    # py2.x version
 
 class BloomFilter:
     'http://en.wikipedia.org/wiki/Bloom_filter'
 
-    def __init__(self, size, k):
-        self.arr = array('L', [0] * ((size + 31) // 32))
-        self.k = k
-        assert (size ** k).bit_length() <= 512  # the sha hash has enough bits
+    def __init__(self, num_bits, num_probes):
+        num_words = (num_bits + 31) // 32
+        self.arr = array('L', [0]) * num_words
+        self.num_probes = num_probes
 
     def get_probes(self, key):
-        h = int.from_bytes(sha512(key.encode()).digest(), 'big')
-        for i in range(self.k):
-            h, pos = divmod(h, len(self.arr)*32)
-            byte, bitpos = divmod(pos, 32)
-            yield byte, 1 << bitpos
+        h = int(sha512(dumps(key)).hexdigest(), 16)
+        num_words = len(self.arr)
+        for i in range(self.num_probes):
+            h, array_index = divmod(h, num_words)
+            h, bit_index = divmod(h, 32)
+            yield array_index, 1 << bit_index
 
     def add(self, key):
-        for byte, bit in self.get_probes(key):
-            self.arr[byte] |= bit
+        for i, mask in self.get_probes(key):
+            self.arr[i] |= mask
 
     def __contains__(self, key):
-        return all(self.arr[byte] & bit for byte, bit in self.get_probes(key))
+        return all(self.arr[i] & mask for i, mask in self.get_probes(key))
 
 
 if __name__ == '__main__':
@@ -37,8 +42,7 @@
         Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
         Vermont Virginia Washington WestVirginia Wisconsin Wyoming'''.split()
 
-    bits, k = 1000, 14
-    bf = BloomFilter(bits, k)
+    bf = BloomFilter(num_bits=1000, num_probes=14)
     for state in states:
         bf.add(state)
 
@@ -46,6 +50,4 @@
 
     n = 100000
     m = sum(''.join(sample(ascii_letters, 5)) in bf for i in range(n))
-    print(n-m, '::', m)        # Ratio of True Negatives to False Positives
-        
-        
+    print('%d :: %d' % (n-m, m))    # Ratio of True Negatives to False Positives

History