|
13 | 13 | :- table(fuzzy_substr/3).
|
14 | 14 | :- endif.
|
15 | 15 |
|
| 16 | +:- use_foreign_library('./bin/libutils_rs.so'). |
| 17 | + |
16 | 18 | :- meta_predicate run_levenshtein(+, +, 2, -).
|
17 | 19 |
|
18 |
| -%% list_subset(?List1, ?List2) |
| 20 | +%! list_subset(?List1, ?List2) |
19 | 21 | % Returns true if List1 is a subset of List2.
|
20 | 22 | list_subset([], _).
|
21 | 23 | list_subset([First|Rest], B) :-
|
22 | 24 | member(First, B),
|
23 | 25 | list_subset(Rest, B), !.
|
24 | 26 |
|
25 |
| -%% join(+Items, +Sep, -Output) |
| 27 | +%! join(+Items, +Sep, -Output) |
26 | 28 | % Joins the strings in Items into Output, placing the separator string Sep between them
|
27 | 29 | join([], _Sep, "") :- !.
|
28 | 30 | join([Item], _Sep, Item) :- !.
|
|
31 | 33 | string_concat(Head, Sep, HeadSep),
|
32 | 34 | string_concat(HeadSep, TailOutput, Output), !.
|
33 | 35 |
|
34 |
| -%% sequence_match/2(+Sequence, +String) |
| 36 | +%! sequence_match(+Sequence, +String) |
35 | 37 | % sequence_match is true if all the elements in Sequence appear in
|
36 | 38 | % String, in sequential order.
|
37 | 39 | sequence_match(Sequence, String) :-
|
|
48 | 50 | sequence_match(Sequence, [_|Tail1]) :-
|
49 | 51 | sequence_match(Sequence, Tail1).
|
50 | 52 |
|
51 |
| -%% split_left/4(String, Sep, N, -Substrings) |
52 |
| -% split_left splits the provided string on the characters in Sep, |
53 |
| -% up to a maximum of N times into Substrings. Multiple seperator characters |
54 |
| -% will be treated as one. |
55 |
| -split_left(String, Sep, N, Substrings) :- |
56 |
| - string_chars(String, Chars), |
57 |
| - string_chars(Sep, Sep_), |
58 |
| - split_left(Chars, Sep_, N, [], CharSubstrings), |
59 |
| - maplist( |
60 |
| - string_chars, |
61 |
| - Substrings, |
62 |
| - CharSubstrings |
63 |
| - ), !. |
64 |
| - |
65 |
| -%% Splits the string from left to right, on the provided separator, |
66 |
| -% up to a maximum of n times, and stores intermediate state in Accumulator |
67 |
| -split_left([], _Sep, _, Accumulator, [Reversed]) :- |
68 |
| - reverse(Accumulator, Reversed), !. |
69 |
| -split_left([Head|Tail], Sep, 0, Accumulator, [Whole]) :- |
70 |
| - member(Head, Sep), |
71 |
| - split_left(Tail, Sep, 0, Accumulator, [Whole]), !. |
72 |
| -split_left(String, _Sep, 0, Accumulator, [Whole]) :- |
73 |
| - reverse(Accumulator, Reversed), |
74 |
| - append(Reversed, String, Whole), !. |
75 |
| -split_left([Head|Tail], Sep, N, [], Strings) :- |
76 |
| - member(Head, Sep), |
77 |
| - split_left(Tail, Sep, N, [], Strings), !. |
78 |
| -split_left([Head|Tail], Sep, N, Accumulator, [Reversed|Strings]) :- |
79 |
| - member(Head, Sep), |
80 |
| - reverse(Accumulator, Reversed), |
81 |
| - % Force early evaluation |
82 |
| - NSub is N - 1, |
83 |
| - split_left(Tail, Sep, NSub, [], Strings), !. |
84 |
| -split_left([Head|Tail], Sep, N, Accumulator, Strings) :- |
85 |
| - split_left(Tail, Sep, N, [Head|Accumulator], Strings), !. |
86 | 53 |
|
87 | 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
88 |
| -%% Development notes: |
89 |
| -% Haskell impl is ~3x faster (with the backtracking functionality) |
90 |
| -% BUT: |
91 |
| -% - +1.2GB of disk usage in Docker |
92 |
| -% - Requires reading/writing to streams (annoying) |
93 |
| -% - Requires compilation and extra steps in Makefiles |
94 |
| - |
95 |
| -%% Basic implementation of substitution cost. |
96 |
| -lev_cost(C, C, 0.0) :- !. |
97 |
| -lev_cost(A, B, 0.4) :- downcase_atom(A, C), downcase_atom(B, C), !. |
98 |
| -lev_cost(_, _, 1.0) :- !. |
99 |
| - |
100 |
| -%% Fills the current row with the cheapest action |
101 |
| -fill_row(_, Row, _, [], Row). |
102 |
| -fill_row(PrevRow, Row, Ca, [Cb|StrB], Out) :- |
103 |
| - PrevRow = [Subst,Delete|PRest], |
104 |
| - Row = [Insert|Rest], |
105 |
| - lev_cost(Ca, Cb, SubstCost), |
106 |
| - Substx is Subst + SubstCost, |
107 |
| - Deletex is Delete + 1.0, |
108 |
| - Insertx is Insert + 1.0, |
109 |
| - min_list([Substx, Deletex, Insertx], MinC), |
110 |
| - fill_row([Delete|PRest], [MinC,Insert|Rest], Ca, StrB, Out). |
111 |
| - |
112 |
| -%% Rearranges arguments as needed for initial fill_row call. |
113 |
| -% can do [Head|Tail], [HRow, Head|Tail] to keep the full table |
114 |
| -% for backtracking purposes |
115 |
| -fill_row_helper(StrA, Num, Char, [Head|_], [HRow, Head]) :- |
116 |
| - Numf is float(Num), |
117 |
| - fill_row(Head, [Numf], Char, StrA, RRow), |
118 |
| - reverse(RRow, HRow). |
119 |
| - |
120 |
| -%% Helper which builds the levenshtein distance table (or in this case, a single row). |
121 |
| -run_levenshtein(A, "", RowFn, [FirstRow]) :- |
122 |
| - string(A), |
123 |
| - string_chars(A, AChars), |
124 |
| - length(AChars, LA), |
125 |
| - call(RowFn, LA, FirstRow), !. |
126 |
| - |
127 |
| -run_levenshtein(A, B, RowFn, Table) :- |
128 |
| - string(A), |
129 |
| - string(B), |
130 |
| - string_chars(A, AChars), |
131 |
| - string_chars(B, BChars), |
132 |
| - length(AChars, LA), |
133 |
| - length(BChars, LB), |
134 |
| - call(RowFn, LA, FirstRow), |
135 |
| - numlist(1, LB, Nums), |
136 |
| - foldl(fill_row_helper(AChars), Nums, BChars, [FirstRow], Table), !. |
137 |
| - |
138 |
| -to_float(X, Y) :- Y is float(X). |
139 |
| - |
140 |
| -numlist_helper(LA, List) :- |
141 |
| - numlist(0, LA, NList), maplist(to_float, NList, List). |
| 55 | +%% Documentation for foreign library |
| 56 | +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
142 | 57 |
|
143 |
| -%% levenshtein_distance(+A:str, +B:str, -Distance:float) |
| 58 | +%! levenshtein_distance(+A:str, +B:str, -Distance:float) is semidet |
144 | 59 | % Returns the Levenshtein distance between A and B
|
145 | 60 | % https://en.wikipedia.org/wiki/Levenshtein_distance
|
146 |
| -% Uses the two-row table solution to optimize for memory and runtime characteristics |
147 |
| -levenshtein_distance(A, B, Distance) :- |
148 |
| - run_levenshtein(A, B, numlist_helper, [Row|_]), |
149 |
| - last(Row, Distance). |
150 | 61 |
|
151 |
| -zeros_helper(LA, List) :- |
152 |
| - LAPlus is LA + 1, |
153 |
| - length(List, LAPlus), maplist(=(0.0), List). |
154 |
| - |
155 |
| -%% levenshtein_distance_fuzzy(+A:str, +B:str, -Distance:float) |
| 62 | +%! fuzzy_substr(+A:str, +B:str, -Distance:float) is semidet |
156 | 63 | % https://en.wikipedia.org/wiki/Approximate_string_matching#Problem_formulation_and_algorithms
|
157 |
| -% Use min_list for minimum distance. Ignore first item in row. |
158 |
| -fuzzy_substr(A, B, Distance) :- |
159 |
| - run_levenshtein(A, B, zeros_helper, [[_|Out]|_]), |
160 |
| - min_list(Out, Distance). |
161 | 64 |
|
| 65 | +%! split_left(String, Sep, N, -Substrings) |
| 66 | +% split_left splits the provided string on the characters in Sep, |
| 67 | +% up to a maximum of N times into Substrings. Multiple separator characters |
| 68 | +% will be treated as one. |
0 commit comments