|
1 | 1 | package club.staircrusher.stdlib.util.string
|
2 | 2 |
|
3 |
| -fun String.emptyToNull() = this.ifBlank { null } |
| 3 | +import java.util.* |
4 | 4 |
|
5 |
| -// https://en.wikipedia.org/wiki/Levenshtein_distance#:~:text=The%20Levenshtein%20distance%20between%20two,defined%20the%20metric%20in%201965. |
6 |
| -// string similarity by levenshtein distance considering korean |
7 |
| -fun String.isSimilarWith(other: String, maxThreshold: Int = 3): Boolean { |
8 |
| - val similarity = jamoLevenshtein(this, other) |
9 |
| - return similarity <= maxThreshold |
10 |
| -} |
| 5 | +fun String.emptyToNull() = this.ifBlank { null } |
11 | 6 |
|
12 |
| -private fun Char.decomposeHangul(): List<Char>? { |
13 |
| - val result = mutableListOf<Char>() |
14 |
| - val choseongs = |
15 |
| - listOf('ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ') |
16 |
| - val joongseongs = |
17 |
| - listOf('ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ') |
18 |
| - val jongseongs = listOf( |
19 |
| - null, |
20 |
| - 'ㄱ', |
21 |
| - 'ㄲ', |
22 |
| - 'ㄳ', |
23 |
| - 'ㄴ', |
24 |
| - 'ㄵ', |
25 |
| - 'ㄶ', |
26 |
| - 'ㄷ', |
27 |
| - 'ㄹ', |
28 |
| - 'ㄺ', |
29 |
| - 'ㄻ', |
30 |
| - 'ㄼ', |
31 |
| - 'ㄽ', |
32 |
| - 'ㄾ', |
33 |
| - 'ㄿ', |
34 |
| - 'ㅀ', |
35 |
| - 'ㅁ', |
36 |
| - 'ㅂ', |
37 |
| - 'ㅄ', |
38 |
| - 'ㅅ', |
39 |
| - 'ㅆ', |
40 |
| - 'ㅇ', |
41 |
| - 'ㅈ', |
42 |
| - 'ㅊ', |
43 |
| - 'ㅋ', |
44 |
| - 'ㅌ', |
45 |
| - 'ㅍ', |
46 |
| - 'ㅎ' |
| 7 | +fun String.isSimilarWith(pattern: String): Boolean { |
| 8 | + return simpleMatch( |
| 9 | + this.lowercase(Locale.US), |
| 10 | + pattern.lowercase(Locale.US).filter { it.isWhitespace().not() } |
47 | 11 | )
|
48 |
| - val char = this |
49 |
| - val codePoint = char.code |
50 |
| - |
51 |
| - if (codePoint in 44032..55203) { |
52 |
| - val baseCode = codePoint - 44032 |
53 |
| - val choseongIndex = baseCode / 21 / 28 |
54 |
| - val joongseongIndex = baseCode / 28 % 21 |
55 |
| - val jongseongIndex = baseCode % 28 |
56 |
| - |
57 |
| - result.addAll( |
58 |
| - listOfNotNull(choseongs[choseongIndex], joongseongs[joongseongIndex], jongseongs[jongseongIndex]), |
59 |
| - ) |
60 |
| - } else { |
61 |
| - return null |
62 |
| - } |
63 |
| - return result |
64 | 12 | }
|
65 | 13 |
|
66 |
| - |
67 |
| -private fun levenshtein(s1: String, s2: String, cost: Map<Pair<Char, Char>, Int> = emptyMap()): Int { |
68 |
| - if (s1.length < s2.length) { |
69 |
| - return levenshtein(s2, s1, cost) |
70 |
| - } |
71 |
| - |
72 |
| - if (s2.isEmpty()) { |
73 |
| - return s1.length |
74 |
| - } |
75 |
| - |
76 |
| - val previousRow = IntArray(s2.length + 1) { it } |
77 |
| - for (i in s1.indices) { |
78 |
| - val currentRow = IntArray(s2.length + 1) |
79 |
| - currentRow[0] = i + 1 |
80 |
| - for (j in s2.indices) { |
81 |
| - val insertion = previousRow[j + 1] + 1 |
82 |
| - val deletion = currentRow[j] + 1 |
83 |
| - val substitution = previousRow[j] + if (s1[i] == s2[j]) 0 else cost.getOrDefault(s1[i] to s2[j], 1) |
84 |
| - currentRow[j + 1] = minOf(insertion, deletion, substitution) |
| 14 | +private fun simpleMatch(text: String, pattern: String): Boolean { |
| 15 | + var patternIndex = 0 |
| 16 | + for (char in text) { |
| 17 | + if (patternIndex < pattern.length && pattern[patternIndex] == char) { |
| 18 | + patternIndex++ |
85 | 19 | }
|
86 |
| - previousRow.indices.forEach { previousRow[it] = currentRow[it] } // Optimized copy |
87 |
| - } |
88 |
| - return previousRow.last() |
89 |
| -} |
90 |
| - |
91 |
| -private fun jamoLevenshtein(s1: String, s2: String): Int { |
92 |
| - if (s1.length < s2.length) { |
93 |
| - return jamoLevenshtein(s2, s1) |
94 |
| - } |
95 |
| - |
96 |
| - if (s2.isEmpty()) { |
97 |
| - return s1.length |
98 |
| - } |
99 |
| - |
100 |
| - val previousRow = IntArray(s2.length + 1) { it } |
101 |
| - for (i in s1.indices) { |
102 |
| - val currentRow = IntArray(s2.length + 1) |
103 |
| - currentRow[0] = i + 1 |
104 |
| - for (j in s2.indices) { |
105 |
| - val insertion = previousRow[j + 1] + 1 |
106 |
| - val deletion = currentRow[j] + 1 |
107 |
| - val substitution = previousRow[j] + getJamoCost(s1[i], s2[j]) |
108 |
| - currentRow[j + 1] = minOf(insertion, deletion, substitution) |
| 20 | + if (patternIndex == pattern.length) { |
| 21 | + return true |
109 | 22 | }
|
110 |
| - previousRow.indices.forEach { previousRow[it] = currentRow[it] } // Optimized copy |
111 | 23 | }
|
112 |
| - return previousRow.last() |
113 |
| -} |
114 |
| - |
115 |
| -private fun getJamoCost(c1: Char, c2: Char): Int { |
116 |
| - if (c1 == c2) return 0 |
117 |
| - val jamo1 = c1.decomposeHangul() |
118 |
| - val jamo2 = c2.decomposeHangul() |
119 |
| - return if (jamo1 != null && jamo2 != null) levenshtein(jamo1.joinToString(""), jamo2.joinToString("")) / 3 else 1 |
| 24 | + return false |
120 | 25 | }
|
0 commit comments