diff --git a/README.md b/README.md index 1c865f2..b8d3d4b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# PyBloomer -PyBloomer is an implementation of the OkBloomer algorithm, an autoscaling [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) with ultra-low memory footprint for Python. It employs a novel layered filtering strategy that allows it to expand while maintaining an upper bound on the false positive rate. Each layer is comprised of a bitmap that remembers the hash signatures of the items inserted so far. If an item gets caught in the filter, then it has probably been seen before. However, if an item passes through the filter, then it definitely has never been seen before. +# Ok Bloomer +An implementation of the OkBloomer algorithm, an autoscaling [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) with ultra-low memory footprint for Python. Ok Bloomer employs a novel layered filtering strategy that allows it to expand while maintaining an upper bound on the false positive rate. As such, Ok Bloomer is suitable for streaming data where the size is not known a priori. - **Ultra-low** memory footprint - **Autoscaling** works on streaming data @@ -9,16 +9,16 @@ PyBloomer is an implementation of the OkBloomer algorithm, an autoscaling [Bloom ### Parameters | # | Name | Default | Type | Description | |---|---|---|---|---| -| 1 | max_false_positive_rate | 0.01 | float | The false positive rate to remain below. | +| 1 | max_false_positive_rate | 0.01 | float | The upper bound on the false positive rate. | | 2 | num_hashes | 4 | int | The number of hash functions used, i.e. the number of slices per layer. | | 3 | layer_size | 32000000 | int | The size of each layer of the filter in bits. | ### Example ```python -import pybloomer +import okbloomer -filter = pybloomer.BloomFilter( +filter = okbloomer.BloomFilter( max_false_positive_rate=0.01, num_hashes=4, layer_size=32000000, @@ -46,4 +46,4 @@ True ``` ## References -- [1] A. DalPino. (2021). 
OkBloomer, a novel autoscaling Bloom Filter for PHP [[link](https://github.com/andrewdalpino/OkBloomer)]. +- [1] A. DalPino. (2021). OkBloomer, a novel autoscaling Bloom Filter [[link](https://github.com/andrewdalpino/OkBloomer)]. diff --git a/pyproject.toml b/pyproject.toml index a1cc2b8..47b1b4e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,13 +7,14 @@ name = "OkBloomer" version = "0.0.1" requires-python = ">= 3.10" dependencies = [ - "numpy>=1.19.5", + "mmh3>5.0.0", "nptyping>=2.5.0", + "numpy>=1.19.5", ] authors = [ {name = "Andrew DalPino", email = "support@andrewdalpino.com"}, ] -description = "A Python implementation of the OkBloomer algorithm, an autoscaling Bloom filter with ultra-low memory footprint." +description = "An implementation of the OkBloomer algorithm, an autoscaling Bloom filter with ultra-low memory footprint for Python." readme = "README.md" license = {text = "MIT"} diff --git a/src/okbloomer/bloom_filter.py b/src/okbloomer/bloom_filter.py index 0990ea5..4c4dee1 100644 --- a/src/okbloomer/bloom_filter.py +++ b/src/okbloomer/bloom_filter.py @@ -1,13 +1,21 @@ import numpy as np +import mmh3 + from nptyping import NDArray +MAX_32_BIT_UNSIGNED_INTEGER = 4294967295 + class BloomFilter(object): """ A probabilistic data structure that estimates the prior occurrence of a given item with a maximum false positive rate. 
""" - MAX_SLICE_SIZE = 2147483647 + MAX_HASH_DIGEST = MAX_32_BIT_UNSIGNED_INTEGER + + MAX_SLICE_SIZE = MAX_HASH_DIGEST + + MAX_HASH_FUNCTIONS = int(MAX_SLICE_SIZE / 2) def __init__(self, max_false_positive_rate: float = 0.01, @@ -17,8 +25,8 @@ def __init__(self, if max_false_positive_rate < 0.0 or max_false_positive_rate > 1.0: raise ValueError(f'Max false positive rate must be between 0 and 1, {max_false_positive_rate} given.') - if num_hashes < 1: - raise ValueError(f'Num hashes must be greater than 1, {num_hashes} given.') + if num_hashes < 1 or num_hashes > self.MAX_HASH_FUNCTIONS: + raise ValueError(f'Num hashes must be between 1 and {self.MAX_HASH_FUNCTIONS}, {num_hashes} given.') if layer_size < num_hashes: raise ValueError(f'Layer size must be greater than {num_hashes}, {layer_size} given.') @@ -140,7 +148,7 @@ def _hash(self, token: str) -> list: offsets = [] for i in range(1, self.num_hashes + 1): - offset = hash(f'{i}{token}') + offset = mmh3.hash(token, seed=i, signed=False) offset %= self.slice_size offset *= i diff --git a/tests/test_bloom_filter.py b/tests/test_bloom_filter.py index fdf8941..8163590 100644 --- a/tests/test_bloom_filter.py +++ b/tests/test_bloom_filter.py @@ -5,7 +5,7 @@ import okbloomer class TestBloomFilter(unittest.TestCase): - def test_basic(self): + def test_insert_and_exists(self): filter = okbloomer.BloomFilter() self.assertEqual(filter.false_positive_rate, 0)