_distances.py

'''
Various distances.

dameraulevenshtein is originally by Michael Homer, posted at:
http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
'''

import numpy as np
from itertools import izip

def pdist2(X, Y=None, distance=None):
    '''
    Pairwise distance between two lists of items.

    Parameters
    ----------
    X : iterable
        a list of items
    Y : iterable
        2nd list of items (if None, Y=X)
    distance : callable
        distance function to evaluate on pairs of list items

    Returns
    -------
    Matrix D containing distances between all the list items.
    D[i, j] contains distance between X[i] and Y[j] (or X[j] if Y is None).
    '''
    if Y == None:
        j0 = lambda i: i+1
        _Y = X
    else:
        j0 = lambda i: 0
        _Y = Y
    m = len(X)
    n = len(_Y)
    D = np.zeros((m, n), dtype=float)
    for i in range(m):
        for j in range(j0(i), n):
            D[i, j] = distance(X[i], _Y[j])
    if Y == None:
        D += D.T
    return D


def hamming(seq1, seq2):
    if len(seq1) != len(seq2):
        raise ValueError('Hamming distance requires strings of equal length.')
    if isinstance(seq1, np.ndarray) and isinstance(seq2, np.ndarray):
        return (seq1 != seq2).sum()
    else:
        return sum(c1 != c2 for c1, c2 in izip(seq1, seq2))

def jaccard(seq1, seq2):
    if len(seq1) != len(seq2):
        raise ValueError('Jaccard distance requires strings of equal length.')
    if len(seq1) == 0:
        return 0.
    return sum(c1 != c2 for c1, c2 in izip(seq1, seq2)) / float(len(seq1))

def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.

    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.

    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2

    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    oneago = None
    thisrow = range(1, len(seq2) + 1) + [0]
    for x in xrange(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in xrange(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]