hasher.py
import numpy as np
from pyspark.mllib.linalg import SparseVector


def minhash(v, a, b, p, m):
    """
    Determines the type of the vector and computes its minhash.

    1: Multiplies each non-zero index by the non-zero seed "a".
    2: Adds the bias "b" (can be 0).
    3: Takes the result modulo "p", a number larger than the number of elements.
    4: Takes that result modulo "m", the number of buckets.

    Parameters
    ----------
    v : object
        Python list, NumPy array, or SparseVector.
    a : integer
        Seed, > 0.
    b : integer
        Seed, >= 0.
    p : integer
        Only restriction is that this number is larger than the number of elements.
    m : integer
        Number of bins.

    Returns
    -------
    i : integer
        Integer minhash value in [0, m).
    """
    if isinstance(v, SparseVector):
        # SparseVector already stores the indices of its non-zero elements.
        indices = v.indices
    elif isinstance(v, (np.ndarray, list)):
        # For dense input, use the indices of the non-zero elements so the
        # behavior matches the SparseVector case.
        indices = np.flatnonzero(np.asarray(v))
    else:
        raise TypeError("Unknown array type '%s'." % type(v))

    # Map each index through the universal hash ((a * i) + b) % p) % m
    # and take the minimum as the minhash value.
    return np.min(((a * indices) + b) % p % m)
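

# A minimal usage sketch: the seeds a=3, b=1, the prime p=101, and the bucket
# count m=10 below are arbitrary illustrative values, chosen only to show that
# a SparseVector and its dense equivalent map to the same bucket.
if __name__ == "__main__":
    sv = SparseVector(10, {2: 1.0, 7: 1.0})  # non-zeros at indices 2 and 7
    dv = [0, 0, 1, 0, 0, 0, 0, 1, 0, 0]      # the same vector in dense form

    # Both calls feed the index set {2, 7} through ((a * i) + b) % p) % m,
    # so they print the same minhash value.
    print(minhash(sv, 3, 1, 101, 10))
    print(minhash(dv, 3, 1, 101, 10))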