forked from vanangamudi/solthiruthi-sothanaikal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bktree.py
106 lines (81 loc) · 2.94 KB
/
bktree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -*- coding: utf-8 -*-
from editdistance import eval as levenshtein
from tqdm import tqdm
from tamil.utf8 import get_letters
from collections import deque
from pprint import pprint, pformat
import csv
import utils
from resources import DEFAULT_DICTIONARY_FILES
class BKTree:
"""
From https://iq.opengenus.org/burkhard-keller-tree/
"""
def __init__(self, distance_func):
self._tree = None
self._distance_func = distance_func
def distance_func(self, node, candidate):
return self._distance_func(node, candidate)
def add(self, node):
if self._tree is None:
self._tree = (node, {})
return
current, children = self._tree
while True:
dist = self.distance_func(node, current)
target = children.get(dist)
if target is None:
children[dist] = (node, {})
break
current, children = target
def search(self, node, radius):
if self._tree is None:
return []
candidates = deque([self._tree])
result = []
while candidates:
candidate, children = candidates.popleft()
dist = self.distance_func(node, candidate)
if dist <= radius:
result.append((dist, candidate))
low, high = dist - radius, dist + radius
candidates.extend(c for d, c in children.items()
if low <= d <= high)
return sorted(result, key=lambda x: x[0])
class TamilBKTree(BKTree):
def distance_func(self, node, candidate):
node, candidate = get_letters(node), get_letters(candidate)
return super().distance_func(node, candidate)
def build_bktree(filepaths,
pbarp = False):
tree = TamilBKTree(levenshtein)
for filepath in filepaths:
print('loading {}...'.format(filepath))
if pbarp:
pbar = tqdm.tqdm(utils.openfile(filepath), ncols=100)
else:
pbar = utils.openfile(filepath)
for item in csv.reader(pbar, delimiter='\t'):
token, count = item
if token:
tree.add(token)
if pbarp:
pbar.set_description(token)
return tree
if __name__ == '__main__':
tree = TamilBKTree(levenshtein)
for filepath in DEFAULT_DICTIONARY_FILES:
print('loading {}...'.format(filepath))
with utils.openfile(filepath) as f:
csvf = csv.reader(f, delimiter='\t')
for line in tqdm(csvf):
token, count = line
if token:
tree.add(token)
word = input('> ')
while word:
pprint('என்ன என்ன வார்த்தைகளோ?')
pprint(tree.search(word, 2))
word = input('> ')
with utilsopen('tamil_tree_output.txt', 'w') as of:
of.write(pformat(tree))