Skip to content

Commit

Permalink
Make properties read-only
Browse files Browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Oct 29, 2024
1 parent 64b0d77 commit 65fe4a5
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 23 deletions.
62 changes: 39 additions & 23 deletions src/okbloomer/bloom_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,40 @@ def __init__(
f"Slice size must be less than {self.MAX_SLICE_SIZE}, {slice_size} given."
)

self.max_false_positive_rate = max_false_positive_rate
self.num_hashes = num_hashes
self.layer_size = layer_size
self.slice_size = slice_size
self.layers: list[NDArray] = []
self.n = 0 # The number of bits currently stored in the filter.
self.m = 0 # The maximum number of bits that can be stored in the filter.
self._max_false_positive_rate = max_false_positive_rate
self._num_hashes = num_hashes
self._layer_size = layer_size
self._slice_size = slice_size
self._layers: list[NDArray] = []
self._n = 0 # The number of bits currently stored in the filter.
self._m = 0 # The maximum number of bits that can be stored in the filter.

self._add_layer()

@property
def max_false_positive_rate(self) -> float:
return self._max_false_positive_rate

@property
def num_hashes(self) -> int:
return self._num_hashes

@property
def layer_size(self) -> int:
return self._layer_size

@property
def slice_size(self) -> int:
return self._slice_size

@property
def num_layers(self) -> int:
return len(self.layers)
return len(self._layers)

@property
def utilization(self) -> float:
"""Return the proportion of bits that are currently set"""
return self.n / self.m
return self._n / self._m

@property
def capacity(self) -> float:
Expand All @@ -74,32 +90,32 @@ def capacity(self) -> float:
@property
def false_positive_rate(self) -> float:
"""Return the probability of recording a false positive"""
return self.utilization**self.num_hashes
return self.utilization**self._num_hashes

def insert(self, token: str) -> None:
"""Insert a token into the filter"""
offsets = self._hash(token)

layer = self.layers[-1]
layer = self._layers[-1]

changed = False

for offset in offsets:
if layer[offset] == False:
layer[offset] = True

self.n += 1
self._n += 1

changed = True

if changed and self.false_positive_rate > self.max_false_positive_rate:
if changed and self.false_positive_rate > self._max_false_positive_rate:
self._add_layer()

def exists(self, token: str) -> bool:
"""Does the given token exist within the filter?"""
offsets = self._hash(token)

for layer in self.layers:
for layer in self._layers:
hits = 0

for offset in offsets:
Expand All @@ -108,7 +124,7 @@ def exists(self, token: str) -> bool:

hits += 1

if hits == self.num_hashes:
if hits == self._num_hashes:
return True

return False
Expand All @@ -117,7 +133,7 @@ def exists_or_insert(self, token: str) -> bool:
"""Does the token exist in the filter? If not, then insert it."""
offsets = self._hash(token)

for layer in self.layers[:-1]:
for layer in self._layers[:-1]:
hits = 0

for offset in offsets:
Expand All @@ -126,37 +142,37 @@ def exists_or_insert(self, token: str) -> bool:

hits += 1

if hits == self.num_hashes:
if hits == self._num_hashes:
return True

layer = self.layers[-1]
layer = self._layers[-1]

exists = True

for offset in offsets:
if layer[offset] == False:
layer[offset] = True

self.n += 1
self._n += 1

exists = False

if not exists and self.false_positive_rate > self.max_false_positive_rate:
if not exists and self.false_positive_rate > self._max_false_positive_rate:
self._add_layer()

return exists

def _add_layer(self) -> None:
"""Add another layer to the filter for maintaining the false positivity rate below the threshold."""
self.layers.append(np.zeros(self.layer_size, dtype="bool"))
self._layers.append(np.zeros(self.layer_size, dtype="bool"))

self.m += self.layer_size
self._m += self.layer_size

def _hash(self, token: str) -> list:
"""Return a list of filter offsets from a given token."""
offsets = []

for i in range(1, self.num_hashes + 1):
for i in range(1, self._num_hashes + 1):
offset = mmh3.hash(token, seed=i, signed=False)

offset %= self.slice_size
Expand Down
12 changes: 12 additions & 0 deletions tests/test_bloom_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,18 @@


class TestBloomFilter(TestCase):
def test_instantiate(self):
filter = BloomFilter(
max_false_positive_rate=0.001,
num_hashes=16,
layer_size=64000,
)

self.assertEqual(0.001, filter.max_false_positive_rate)
self.assertEqual(16, filter.num_hashes)
self.assertEqual(64000, filter.layer_size)
self.assertEqual(4000, filter.slice_size)

def test_insert_and_exists(self):
filter = BloomFilter()

Expand Down

0 comments on commit 65fe4a5

Please sign in to comment.