-
Notifications
You must be signed in to change notification settings - Fork 314
Adding suffix tree #323
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Adding suffix tree #323
Changes from 5 commits
4432547
598330b
391302d
7fd9da7
68ef229
57fd9f9
cac7126
b315c5e
e3586fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,273 @@ | ||
from pydatastructs.utils.misc_util import SuffixNode | ||
|
||
# Names exported by ``from <module> import *``.
__all__ = ['SuffixTree']
|
||
class SuffixTree():
    """
    Represents Suffix Tree.

    A single string is stored with one unique terminal symbol appended.
    A list of strings is stored as a generalized suffix tree built over
    the concatenation of the strings, each followed by its own terminal
    symbol taken from the Unicode Private Use Areas.

    Parameters
    ==========

    input: str or list of str
        Text(s) to index. With the default ``''`` only the root node is
        created and nothing is indexed until ``build`` is called.

    Examples
    ========

    >>> from pydatastructs.strings import SuffixTree as suffix
    >>> s = suffix('hello')
    >>> s.find('he')
    0
    >>> s.find_all('l')
    {2, 3}
    >>> s.find('f')
    -1
    >>> lt=["abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa", "aaabbbeeecceeeddaaaaabeceda"]
    >>> s1 = suffix(lt)
    >>> s1.lcs()
    'abeced'

    References
    ==========

    .. [1] https://en.wikipedia.org/wiki/Suffix_tree
    .. [2] https://en.wikipedia.org/wiki/Generalized_suffix_tree
    """

    def __new__(cls, input=''):
        obj = object.__new__(cls)
        # The root is its own parent and its own suffix link, so the
        # construction and search loops need no special case for it.
        root = SuffixNode()
        root.depth = 0
        root.idx = 0
        root.parent = root
        root._add_suffix_link(root)
        obj.root = root
        # Indexed text; kept defined even for an empty tree so that
        # searching before ``build`` does not fail on a missing attribute.
        obj.word = ''
        if input != '':
            obj.build(input)
        return obj

    @classmethod
    def methods(cls):
        """Returns the names of the public methods of this class."""
        return ['__new__', 'lcs', 'find', 'find_all']

    def _check_input(self, input):
        """
        Classifies the input given to ``build``.

        Returns ``'str'`` for a string and ``'list'`` for a list whose
        items are all strings; raises ``ValueError`` for anything else.
        """
        if isinstance(input, str):
            return 'str'
        if isinstance(input, list) and all(isinstance(item, str) for item in input):
            return 'list'
        raise ValueError("String argument should be of type String or a list of strings")

    def build(self, x):
        """
        Builds the Suffix tree on the given input.

        Parameters
        ==========

        x: str or list of str

        Returns
        =======

        None

        Raises
        ======

        ValueError
            If x is neither a string nor a list of strings.
        """
        kind = self._check_input(x)
        if kind == 'str':
            # A terminal symbol that cannot occur inside ordinary text makes
            # every suffix end at its own leaf.
            self._build(x + next(self._terminalSymbolsGenerator()))
        elif kind == 'list':
            self._build_generalized(x)

    def _build(self, x):
        """Stores x as the indexed text and builds the tree over it."""
        self.word = x
        self._build_McCreight(x)

    def _build_McCreight(self, x):
        """
        Inserts the suffixes x[i:] for i = 0, 1, ... one after another.

        ``u`` is the node the insertion of the current suffix starts from
        and ``d`` is the number of characters of that suffix already matched.
        """
        u = self.root
        d = 0
        for i in range(len(x)):
            # Walk down while the current suffix keeps matching the tree.
            while u.depth == d and u._has_transition(x[d + i]):
                u = u._get_transition_link(x[d + i])
                d = d + 1
                # Match along the edge leading into u.
                while d < u.depth and x[u.idx + d] == x[i + d]:
                    d = d + 1
            # Mismatch in the middle of an edge: split it at depth d.
            if d < u.depth:
                u = self._create_node(x, u, d)
            self._create_leaf(x, i, u, d)
            if not u._get_suffix_link():
                self._compute_slink(x, u)
            # Continue the next suffix from the suffix link target, which
            # is one character shallower.
            u = u._get_suffix_link()
            d = d - 1
            if d < 0:
                d = 0

    def _create_node(self, x, u, d):
        """
        Splits the edge entering u at depth d.

        Returns the new internal node, which takes u's place under u's
        former parent and becomes the parent of u.
        """
        i = u.idx
        p = u.parent
        v = SuffixNode(idx=i, depth=d)
        v._add_transition_link(u, x[i + d])
        u.parent = v
        p._add_transition_link(v, x[i + p.depth])
        v.parent = p
        return v

    def _create_leaf(self, x, i, u, d):
        """Adds below u a leaf for the suffix x[i:] and returns it."""
        w = SuffixNode()
        w.idx = i
        w.depth = len(x) - i
        u._add_transition_link(w, x[i + d])
        w.parent = u
        return w

    def _compute_slink(self, x, u):
        """
        Sets the suffix link of u.

        Starts at the suffix link of u's parent and descends to depth
        ``u.depth - 1``, splitting an edge if that depth falls inside one.
        """
        d = u.depth
        v = u.parent._get_suffix_link()
        while v.depth < d - 1:
            v = v._get_transition_link(x[u.idx + v.depth + 1])
        if v.depth > d - 1:
            v = self._create_node(x, v, d - 1)
        u._add_suffix_link(v)

    def _build_generalized(self, xs):
        """
        Builds a generalized suffix tree over the strings in xs.

        The strings are concatenated, each followed by a distinct terminal
        symbol, and every node is then labelled with the set of indices of
        the input strings found below it.
        """
        terminal_gen = self._terminalSymbolsGenerator()
        _xs = ''.join(x + next(terminal_gen) for x in xs)
        self.word = _xs
        self._generalized_word_starts(xs)
        self._build(_xs)
        self.root._traverse(self._label_generalized)

    def _label_generalized(self, node):
        """
        Sets ``node.generalized_idxs``.

        A leaf gets the index of the input string its suffix starts in; an
        internal node gets the union of its children's sets.
        """
        if node.is_leaf():
            x = {self._get_word_start_index(node.idx)}
        else:
            x = {n for ns in node.transition_links.values() for n in ns.generalized_idxs}
        node.generalized_idxs = x

    def _get_word_start_index(self, idx):
        """
        Returns the index of the input string that contains position idx
        of the concatenated text.
        """
        i = 0
        for start in self.word_starts[1:]:
            if idx < start:
                break
            i += 1
        return i

    def lcs(self, stringIdxs = -1):
        """
        Finds the Longest Common Substring of Strings provided in stringIdxs.
        If stringIdxs is not provided (or is not a list), the LCS of all
        strings is returned.

        Uses ``word_starts`` and the per-node ``generalized_idxs`` labels,
        which this class sets only when the tree is built from a list of
        strings.

        Parameters
        ==========

        stringIdxs: int or list of int
            Indices (into the list used to build the tree) of the strings
            to consider.

        Returns
        =======

        Longest Common Substring
        """
        # Anything that is not a list, including the default -1, selects
        # every input string.
        if not isinstance(stringIdxs, list):
            stringIdxs = set(range(len(self.word_starts)))
        else:
            stringIdxs = set(stringIdxs)
        deepestNode = self._find_lcs(self.root, stringIdxs)
        start = deepestNode.idx
        end = deepestNode.idx + deepestNode.depth
        return self.word[start:end]

    def _find_lcs(self, node, stringIdxs):
        """
        Returns the deepest node at or below ``node`` reachable through
        children whose labels contain every index in stringIdxs.
        """
        nodes = [self._find_lcs(n, stringIdxs)
                 for n in node.transition_links.values()
                 if n.generalized_idxs.issuperset(stringIdxs)]
        if not nodes:
            return node
        return max(nodes, key=lambda n: n.depth)

    def _generalized_word_starts(self, xs):
        """
        Records in ``self.word_starts`` the offset at which each string of
        xs begins in the concatenated text (each string is followed by one
        terminal symbol).
        """
        self.word_starts = []
        offset = 0
        for word in xs:
            self.word_starts.append(offset)
            offset += len(word) + 1

    def _locate(self, y):
        """
        Walks down from the root following y.

        Returns the node on whose incoming edge y ends, or None when y
        does not occur in the indexed text.
        """
        node = self.root
        pos = 0  # characters of y consumed on the edges above ``node``
        while True:
            edge = self._edgeLabel(node, node.parent)
            rest = y[pos:]
            if edge.startswith(rest):
                return node
            # The rest of y is not a prefix of the edge, so the whole edge
            # must match for the search to continue below this node.
            if not rest.startswith(edge):
                return None
            pos += len(edge)
            node = node._get_transition_link(y[pos])
            if not node:
                return None

    def find(self, y):
        """
        Finds the starting position of the substring y in the string used for
        building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Index of the starting position of string y in the string used for building the Suffix tree
        -1 if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            return -1
        return node.idx

    def find_all(self, y):
        """
        Finds all the starting positions of the substring y in the string
        used for building the Suffix tree.

        Parameters
        ==========

        y: str

        Returns
        =======

        Set of Index of the starting positions of string y in the string used for building the Suffix tree
        {} (an empty dict) if y is not a substring.
        """
        node = self._locate(y)
        if node is None:
            # Kept as an empty dict literal for backward compatibility with
            # callers that compare the result against ``{}``.
            return {}
        return {leaf.idx for leaf in node._get_leaves()}

    def _edgeLabel(self, node, parent):
        """Returns the text on the edge leading from parent to node."""
        return self.word[node.idx + parent.depth: node.idx + node.depth]

    def _terminalSymbolsGenerator(self):
        """
        Yields the characters of the Unicode Private Use Areas in order,
        for use as terminal symbols.

        Raises
        ======

        ValueError
            When all symbols have been used, i.e. there are more input
            strings than available terminal symbols.
        """
        # Inclusive code point ranges, iterated lazily so that asking for
        # the first symbol does not allocate the whole table.
        private_use_ranges = (
            (0xE000, 0xF8FF),
            (0xF0000, 0xFFFFD),
            (0x100000, 0x10FFFD),
        )
        for first, last in private_use_ranges:
            for code_point in range(first, last + 1):
                yield chr(code_point)
        raise ValueError("Too many input strings.")
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from pydatastructs import SuffixTree | ||
from pydatastructs.utils.raises_util import raises | ||
import random, string | ||
|
||
def test_suffixtree():
    """
    Exercises substring search on a single-string tree, the longest
    common substring on a generalized tree, and input validation.

    References
    ==========
    .. https://www.cise.ufl.edu/~sahni/dsaaj/enrich/c16/suffix.htm

    """
    # Tree over one string: "He" occurs at positions 0 and 10.
    single = SuffixTree("HelloworldHe")
    assert single.find("Hel") == 0
    assert single.find_all("He") == {0, 10}
    assert single.find("Win") == -1
    assert single.find_all("go") == {}

    # Generalized tree over several strings.
    words = ['integer', 'inteinteger', 'integralerint', 'iaingerntier', 'regetnerireg', 'reger']
    generalized = SuffixTree(words)
    assert generalized.lcs() == 'er'

    # Anything that is not a str or a list of str must be rejected.
    for invalid in (123, (100, 1, 0)):
        assert raises(ValueError, lambda: SuffixTree(invalid))
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
Set, | ||
CartesianTreeNode, | ||
RedBlackTreeNode, | ||
TrieNode | ||
TrieNode, | ||
SuffixNode | ||
) | ||
__all__.extend(misc_util.__all__) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops left that point
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You have come early ☺