diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ede3a91..5319d59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -13,6 +13,8 @@ jobs: os: [ubuntu-latest] python: - "3.10" + - "3.11" + - "3.12" runs-on: ${{ matrix.os }} @@ -30,3 +32,6 @@ jobs: - name: Run tests run: python -m unittest + + - name: Coding style + run: python -m black --check ./ diff --git a/pyproject.toml b/pyproject.toml index c64c75c..2d535b6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,8 +19,8 @@ readme = "README.md" license = {text = "MIT"} [project.optional-dependencies] -dev = ["mypy"] -test = ["mypy"] +dev = ["mypy", "black"] +test = ["mypy", "black"] [project.urls] Homepage = "https://github.com/andrewdalpino/PyBloomer" diff --git a/src/okbloomer/__init__.py b/src/okbloomer/__init__.py index 05c5342..f986750 100644 --- a/src/okbloomer/__init__.py +++ b/src/okbloomer/__init__.py @@ -5,8 +5,8 @@ BloomFilter, ) -__version__ = '0.0.3' +__version__ = "0.0.3" __all__ = [ - 'BloomFilter', + "BloomFilter", ] diff --git a/src/okbloomer/bloom_filter.py b/src/okbloomer/bloom_filter.py index 9f83228..77e5276 100644 --- a/src/okbloomer/bloom_filter.py +++ b/src/okbloomer/bloom_filter.py @@ -5,44 +5,55 @@ MAX_32_BIT_UNSIGNED_INTEGER = 4294967295 + class BloomFilter(object): """ A probabilistic data structure that estimates the prior occurrence - of a given item with a maximum false positive rate. + of a given item with a maximum false positive rate. """ - + MAX_HASH_DIGEST = MAX_32_BIT_UNSIGNED_INTEGER MAX_SLICE_SIZE = MAX_HASH_DIGEST MAX_HASH_FUNCTIONS = MAX_SLICE_SIZE // 2 - def __init__(self, - max_false_positive_rate: float = 0.01, - num_hashes: int = 4, - layer_size: int = 32000000) -> None: + def __init__( + self, + max_false_positive_rate: float = 0.01, + num_hashes: int = 4, + layer_size: int = 32000000, + ) -> None: if max_false_positive_rate < 0.0 or max_false_positive_rate > 1.0: - raise ValueError(f'Max false positive rate must be between 0 and 1, {max_false_positive_rate} given.') + raise ValueError( + f"Max false positive rate must be between 0 and 1, {max_false_positive_rate} given." + ) if num_hashes < 1 or num_hashes > self.MAX_HASH_FUNCTIONS: - raise ValueError(f'Num hashes must be between 1 and {self.MAX_HASH_FUNCTIONS}, {num_hashes} given.') + raise ValueError( + f"Num hashes must be between 1 and {self.MAX_HASH_FUNCTIONS}, {num_hashes} given." + ) if layer_size < num_hashes: - raise ValueError(f'Layer size must be greater than {num_hashes}, {layer_size} given.') + raise ValueError( + f"Layer size must be greater than {num_hashes}, {layer_size} given." + ) slice_size = layer_size // num_hashes if slice_size > self.MAX_SLICE_SIZE: - raise ValueError(f'Slice size must be less than {self.MAX_SLICE_SIZE}, {slice_size} given.') + raise ValueError( + f"Slice size must be less than {self.MAX_SLICE_SIZE}, {slice_size} given." + ) self.max_false_positive_rate = max_false_positive_rate self.num_hashes = num_hashes self.layer_size = layer_size self.slice_size = slice_size self.layers: list[NDArray] = [] - self.n = 0 # The number of bits currently stored in the filter. - self.m = 0 # The maximum number of bits that can be stored in the filter. + self.n = 0 # The number of bits currently stored in the filter. + self.m = 0 # The maximum number of bits that can be stored in the filter. self._add_layer() @@ -63,7 +74,7 @@ def capacity(self) -> float: @property def false_positive_rate(self) -> float: """Return the probability of recording a false positive""" - return self.utilization ** self.num_hashes + return self.utilization**self.num_hashes def insert(self, token: str) -> None: """Insert a token into the filter""" @@ -137,7 +148,7 @@ def exists_or_insert(self, token: str) -> bool: def _add_layer(self) -> None: """Add another layer to the filter for maintaining the false positivity rate below the threshold.""" - self.layers.append(np.zeros(self.layer_size, dtype='bool')) + self.layers.append(np.zeros(self.layer_size, dtype="bool")) self.m += self.layer_size diff --git a/tests/test_bloom_filter.py b/tests/test_bloom_filter.py index 886dd6f..1618b9d 100644 --- a/tests/test_bloom_filter.py +++ b/tests/test_bloom_filter.py @@ -5,39 +5,40 @@ from unittest import TestCase + class TestBloomFilter(TestCase): def test_insert_and_exists(self): filter = BloomFilter() self.assertEqual(filter.false_positive_rate, 0) - self.assertFalse(filter.exists('foo')) + self.assertFalse(filter.exists("foo")) - filter.insert('foo') + filter.insert("foo") - self.assertTrue(filter.exists('foo')) + self.assertTrue(filter.exists("foo")) self.assertGreater(filter.false_positive_rate, 0) - self.assertFalse(filter.exists('bar')) + self.assertFalse(filter.exists("bar")) - filter.insert('bar') + filter.insert("bar") - self.assertTrue(filter.exists('bar')) + self.assertTrue(filter.exists("bar")) self.assertGreater(filter.false_positive_rate, 0) - self.assertFalse(filter.exists('baz')) + self.assertFalse(filter.exists("baz")) def test_exists_or_insert(self): filter = BloomFilter() - self.assertFalse(filter.exists_or_insert('foo')) - self.assertTrue(filter.exists_or_insert('foo')) + self.assertFalse(filter.exists_or_insert("foo")) + self.assertTrue(filter.exists_or_insert("foo")) - self.assertFalse(filter.exists_or_insert('bar')) - self.assertTrue(filter.exists_or_insert('bar')) + self.assertFalse(filter.exists_or_insert("bar")) + self.assertTrue(filter.exists_or_insert("bar")) - self.assertFalse(filter.exists_or_insert('baz')) - self.assertTrue(filter.exists_or_insert('baz')) + self.assertFalse(filter.exists_or_insert("baz")) + self.assertTrue(filter.exists_or_insert("baz")) def test_autoscaling(self): random.seed(1) @@ -50,16 +51,18 @@ def test_autoscaling(self): self.assertEqual(filter.num_layers, 1) - filter.insert('foo') + filter.insert("foo") for i in range(0, 100000): - filter.insert(''.join(random.choice(string.ascii_letters) for j in range(20))) + filter.insert( + "".join(random.choice(string.ascii_letters) for j in range(20)) + ) - filter.insert('bar') + filter.insert("bar") - self.assertTrue(filter.exists('foo')) - self.assertTrue(filter.exists('bar')) - self.assertFalse(filter.exists('baz')) + self.assertTrue(filter.exists("foo")) + self.assertTrue(filter.exists("bar")) + self.assertFalse(filter.exists("baz")) self.assertEqual(filter.num_layers, 6) self.assertLessEqual(filter.false_positive_rate, 0.001)