Skip to content

Commit

Permalink
Add coding style
Browse files — browse the repository at this point in the history
  • Loading branch information
andrewdalpino committed Oct 29, 2024
1 parent 9eba232 commit 64b0d77
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 37 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ jobs:
os: [ubuntu-latest]
python:
- "3.10"
- "3.11"
- "3.12"

runs-on: ${{ matrix.os }}

Expand All @@ -30,3 +32,6 @@ jobs:

- name: Run tests
run: python -m unittest

- name: Coding style
run: python -m black --check ./
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ readme = "README.md"
license = {text = "MIT"}

[project.optional-dependencies]
dev = ["mypy"]
test = ["mypy"]
dev = ["mypy", "black"]
test = ["mypy", "black"]

[project.urls]
Homepage = "https://github.com/andrewdalpino/PyBloomer"
Expand Down
4 changes: 2 additions & 2 deletions src/okbloomer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
BloomFilter,
)

__version__ = '0.0.3'
__version__ = "0.0.3"

__all__ = [
'BloomFilter',
"BloomFilter",
]
39 changes: 25 additions & 14 deletions src/okbloomer/bloom_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,44 +5,55 @@

MAX_32_BIT_UNSIGNED_INTEGER = 4294967295


class BloomFilter(object):
"""
A probabilistic data structure that estimates the prior occurrence
of a given item with a maximum false positive rate.
of a given item with a maximum false positive rate.
"""

MAX_HASH_DIGEST = MAX_32_BIT_UNSIGNED_INTEGER

MAX_SLICE_SIZE = MAX_HASH_DIGEST

MAX_HASH_FUNCTIONS = MAX_SLICE_SIZE // 2

def __init__(self,
max_false_positive_rate: float = 0.01,
num_hashes: int = 4,
layer_size: int = 32000000) -> None:
def __init__(
self,
max_false_positive_rate: float = 0.01,
num_hashes: int = 4,
layer_size: int = 32000000,
) -> None:

if max_false_positive_rate < 0.0 or max_false_positive_rate > 1.0:
raise ValueError(f'Max false positive rate must be between 0 and 1, {max_false_positive_rate} given.')
raise ValueError(
f"Max false positive rate must be between 0 and 1, {max_false_positive_rate} given."
)

if num_hashes < 1 or num_hashes > self.MAX_HASH_FUNCTIONS:
raise ValueError(f'Num hashes must be between 1 and {self.MAX_HASH_FUNCTIONS}, {num_hashes} given.')
raise ValueError(
f"Num hashes must be between 1 and {self.MAX_HASH_FUNCTIONS}, {num_hashes} given."
)

if layer_size < num_hashes:
raise ValueError(f'Layer size must be greater than {num_hashes}, {layer_size} given.')
raise ValueError(
f"Layer size must be greater than {num_hashes}, {layer_size} given."
)

slice_size = layer_size // num_hashes

if slice_size > self.MAX_SLICE_SIZE:
raise ValueError(f'Slice size must be less than {self.MAX_SLICE_SIZE}, {slice_size} given.')
raise ValueError(
f"Slice size must be less than {self.MAX_SLICE_SIZE}, {slice_size} given."
)

self.max_false_positive_rate = max_false_positive_rate
self.num_hashes = num_hashes
self.layer_size = layer_size
self.slice_size = slice_size
self.layers: list[NDArray] = []
self.n = 0 # The number of bits currently stored in the filter.
self.m = 0 # The maximum number of bits that can be stored in the filter.
self.n = 0 # The number of bits currently stored in the filter.
self.m = 0 # The maximum number of bits that can be stored in the filter.

self._add_layer()

Expand All @@ -63,7 +74,7 @@ def capacity(self) -> float:
@property
def false_positive_rate(self) -> float:
"""Return the probability of recording a false positive"""
return self.utilization ** self.num_hashes
return self.utilization**self.num_hashes

def insert(self, token: str) -> None:
"""Insert a token into the filter"""
Expand Down Expand Up @@ -137,7 +148,7 @@ def exists_or_insert(self, token: str) -> bool:

def _add_layer(self) -> None:
"""Add another layer to the filter to keep the false positive rate below the threshold."""
self.layers.append(np.zeros(self.layer_size, dtype='bool'))
self.layers.append(np.zeros(self.layer_size, dtype="bool"))

self.m += self.layer_size

Expand Down
41 changes: 22 additions & 19 deletions tests/test_bloom_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,40 @@

from unittest import TestCase


class TestBloomFilter(TestCase):
def test_insert_and_exists(self):
filter = BloomFilter()

self.assertEqual(filter.false_positive_rate, 0)

self.assertFalse(filter.exists('foo'))
self.assertFalse(filter.exists("foo"))

filter.insert('foo')
filter.insert("foo")

self.assertTrue(filter.exists('foo'))
self.assertTrue(filter.exists("foo"))
self.assertGreater(filter.false_positive_rate, 0)

self.assertFalse(filter.exists('bar'))
self.assertFalse(filter.exists("bar"))

filter.insert('bar')
filter.insert("bar")

self.assertTrue(filter.exists('bar'))
self.assertTrue(filter.exists("bar"))
self.assertGreater(filter.false_positive_rate, 0)

self.assertFalse(filter.exists('baz'))
self.assertFalse(filter.exists("baz"))

def test_exists_or_insert(self):
filter = BloomFilter()

self.assertFalse(filter.exists_or_insert('foo'))
self.assertTrue(filter.exists_or_insert('foo'))
self.assertFalse(filter.exists_or_insert("foo"))
self.assertTrue(filter.exists_or_insert("foo"))

self.assertFalse(filter.exists_or_insert('bar'))
self.assertTrue(filter.exists_or_insert('bar'))
self.assertFalse(filter.exists_or_insert("bar"))
self.assertTrue(filter.exists_or_insert("bar"))

self.assertFalse(filter.exists_or_insert('baz'))
self.assertTrue(filter.exists_or_insert('baz'))
self.assertFalse(filter.exists_or_insert("baz"))
self.assertTrue(filter.exists_or_insert("baz"))

def test_autoscaling(self):
random.seed(1)
Expand All @@ -50,16 +51,18 @@ def test_autoscaling(self):

self.assertEqual(filter.num_layers, 1)

filter.insert('foo')
filter.insert("foo")

for i in range(0, 100000):
filter.insert(''.join(random.choice(string.ascii_letters) for j in range(20)))
filter.insert(
"".join(random.choice(string.ascii_letters) for j in range(20))
)

filter.insert('bar')
filter.insert("bar")

self.assertTrue(filter.exists('foo'))
self.assertTrue(filter.exists('bar'))
self.assertFalse(filter.exists('baz'))
self.assertTrue(filter.exists("foo"))
self.assertTrue(filter.exists("bar"))
self.assertFalse(filter.exists("baz"))

self.assertEqual(filter.num_layers, 6)
self.assertLessEqual(filter.false_positive_rate, 0.001)
Expand Down

0 comments on commit 64b0d77

Please sign in to comment.