-
Notifications
You must be signed in to change notification settings - Fork 5
/
_distances.py
105 lines (90 loc) · 3.39 KB
/
_distances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
'''
Various distances.
dameraulevenshtein is originally by Michael Homer, posted at:
http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
'''
import numpy as np
from itertools import izip
def pdist2(X, Y=None, distance=None):
'''
Pairwise distance between two lists of items.
Parameters
----------
X : iterable
a list of items
Y : iterable
2nd list of items (if None, Y=X)
distance : callable
distance function to evaluate on pairs of list items
Returns
-------
Matrix D containing distances between all the list items.
D[i, j] contains distance between X[i] and Y[j] (or X[j] if Y is None).
'''
if Y == None:
j0 = lambda i: i+1
_Y = X
else:
j0 = lambda i: 0
_Y = Y
m = len(X)
n = len(_Y)
D = np.zeros((m, n), dtype=float)
for i in range(m):
for j in range(j0(i), n):
D[i, j] = distance(X[i], _Y[j])
if Y == None:
D += D.T
return D
def hamming(seq1, seq2):
if len(seq1) != len(seq2):
raise ValueError('Hamming distance requires strings of equal length.')
if isinstance(seq1, np.ndarray) and isinstance(seq2, np.ndarray):
return (seq1 != seq2).sum()
else:
return sum(c1 != c2 for c1, c2 in izip(seq1, seq2))
def jaccard(seq1, seq2):
if len(seq1) != len(seq2):
raise ValueError('Jaccard distance requires strings of equal length.')
if len(seq1) == 0:
return 0.
return sum(c1 != c2 for c1, c2 in izip(seq1, seq2)) / float(len(seq1))
def dameraulevenshtein(seq1, seq2):
"""Calculate the Damerau-Levenshtein distance between sequences.
This distance is the number of additions, deletions, substitutions,
and transpositions needed to transform the first sequence into the
second. Although generally used with strings, any sequences of
comparable objects will work.
Transpositions are exchanges of *consecutive* characters; all other
operations are self-explanatory.
This implementation is O(N*M) time and O(M) space, for N and M the
lengths of the two sequences.
>>> dameraulevenshtein('ba', 'abc')
2
>>> dameraulevenshtein('fee', 'deed')
2
It works with arbitrary sequences too:
>>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
2
"""
# codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
# Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
# However, only the current and two previous rows are needed at once,
# so we only store those.
oneago = None
thisrow = range(1, len(seq2) + 1) + [0]
for x in xrange(len(seq1)):
# Python lists wrap around for negative indices, so put the
# leftmost column at the *end* of the list. This matches with
# the zero-indexed strings and saves extra calculation.
twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
for y in xrange(len(seq2)):
delcost = oneago[y] + 1
addcost = thisrow[y - 1] + 1
subcost = oneago[y - 1] + (seq1[x] != seq2[y])
thisrow[y] = min(delcost, addcost, subcost)
# This block deals with transpositions
if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
return thisrow[len(seq2) - 1]