Skip to content

Commit 9fe1546

Browse files
authored
Fix the problem caused by long repeated words in issue #1119 (#1120)
* fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119 * fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119 * fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119
1 parent f8b9e07 commit 9fe1546

File tree

3 files changed

+126
-3
lines changed

3 files changed

+126
-3
lines changed

core/src/main/java/org/wltea/analyzer/core/IKArbitrator.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,82 @@ void process(AnalyzeContext context , boolean useSmart){
8383
}
8484
}
8585

86+
/**
87+
* 检测是否为叠词模式
88+
* @param lexemeCell 词元链表头
89+
* @return 如果检测到叠词模式返回简化的路径,否则返回null
90+
*/
91+
private LexemePath detectRepeatedWords(QuickSortSet.Cell lexemeCell) {
92+
if (lexemeCell == null || lexemeCell.getLexeme() == null) {
93+
return null;
94+
}
95+
96+
// 检查是否有连续的长词元(长度>10)或大量重复词元
97+
QuickSortSet.Cell current = lexemeCell;
98+
int longLexemeCount = 0;
99+
int totalCount = 0;
100+
Lexeme firstLexeme = null;
101+
Lexeme lastLexeme = null;
102+
103+
while (current != null && current.getLexeme() != null) {
104+
Lexeme lexeme = current.getLexeme();
105+
if (firstLexeme == null) {
106+
firstLexeme = lexeme;
107+
}
108+
lastLexeme = lexeme;
109+
110+
if (lexeme.getLength() > 10) {
111+
longLexemeCount++;
112+
}
113+
totalCount++;
114+
115+
// 如果发现多个长词元或词元总数过多,认为是叠词
116+
if (longLexemeCount > 5 || totalCount > 50) {
117+
// 构造简化路径:第一个词元 + 剩余部分合并为一个词元
118+
LexemePath simplifiedPath = new LexemePath();
119+
120+
// 添加第一个词元
121+
simplifiedPath.addNotCrossLexeme(firstLexeme);
122+
123+
// 如果有剩余部分,创建一个合并的词元
124+
if (totalCount > 1 && lastLexeme != null) {
125+
// 计算剩余部分的起始位置和长度
126+
int remainStart = firstLexeme.getBegin() + firstLexeme.getLength();
127+
int remainEnd = lastLexeme.getBegin() + lastLexeme.getLength();
128+
int remainLength = remainEnd - remainStart;
129+
130+
if (remainLength > 0) {
131+
// 创建一个表示剩余部分的词元
132+
// offset 应该是第一个词元的 offset + 第一个词元的长度
133+
int remainOffset = firstLexeme.getOffset() + firstLexeme.getLength();
134+
Lexeme remainLexeme = new Lexeme(remainOffset, remainStart, remainLength, Lexeme.TYPE_CNCHAR);
135+
simplifiedPath.addNotCrossLexeme(remainLexeme);
136+
}
137+
}
138+
139+
return simplifiedPath;
140+
}
141+
142+
current = current.getNext();
143+
}
144+
145+
return null; // 没有检测到叠词模式
146+
}
147+
86148
/**
87149
* 歧义识别
88150
* @param lexemeCell 歧义路径链表头
89151
* @param fullTextLength 歧义路径文本长度
90152
* @return
91153
*/
92154
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
155+
// 首先检测是否为叠词模式,如果是则直接返回简化路径
156+
LexemePath simplifiedPath = this.detectRepeatedWords(lexemeCell);
157+
if (simplifiedPath != null) {
158+
//System.out.println("Detected repeated words pattern, using simplified path");
159+
return simplifiedPath;
160+
}
161+
93162
//候选路径集合
94163
TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
95164
//候选结果路径
@@ -128,6 +197,14 @@ private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , Lexe
128197
QuickSortSet.Cell c = lexemeCell;
129198
//迭代遍历Lexeme链表
130199
while(c != null && c.getLexeme() != null){
200+
//限制大长度叠词,避免性能问题和整数溢出
201+
if(c.getLexeme().getLength() > 10){
202+
//System.out.println("already repeat words 10 times");
203+
//跳过过长的词元
204+
c = c.getNext();
205+
continue;
206+
}
207+
131208
if(!option.addNotCrossLexeme(c.getLexeme())){
132209
//词元交叉,添加失败则加入lexemeStack栈
133210
conflictStack.push(c);

core/src/main/java/org/wltea/analyzer/core/LexemePath.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,23 @@ int getPathLength(){
156156
* @return
157157
*/
158158
int getXWeight(){
159-
int product = 1;
159+
long product = 1;
160160
Cell c = this.getHead();
161161
while( c != null && c.getLexeme() != null){
162-
product *= c.getLexeme().getLength();
162+
int length = c.getLexeme().getLength();
163+
//限制长度避免溢出
164+
if(length > 10){
165+
length = 10;
166+
}
167+
product *= length;
168+
//防止溢出
169+
if(product > Integer.MAX_VALUE){
170+
//System.out.println("weight too long");
171+
return Integer.MAX_VALUE;
172+
}
163173
c = c.getNext();
164174
}
165-
return product;
175+
return (int)product;
166176
}
167177

168178
/**

core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,42 @@ static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) {
263263
return tokenInfos;
264264
}
265265

266+
/**
267+
* 用ik_smart分词器测试超长叠词性能
268+
* 如果分词耗时超过5秒则测试失败
269+
*/
270+
@Test
271+
public void tokenize_smart_long_repeated_words_performance()
272+
{
273+
Configuration cfg = TestUtils.createFakeConfigurationSub(true);
274+
275+
// 构建超长叠词:重复"哈哈哈哈哈哈哈哈哈哈"1000次
276+
StringBuilder sb = new StringBuilder();
277+
String repeatedWord = "哈哈哈哈哈哈哈哈哈哈";
278+
for (int i = 0; i < 1001; i++) {
279+
sb.append(repeatedWord);
280+
}
281+
String longRepeatedText = sb.toString();
282+
283+
// 记录开始时间
284+
long startTime = System.currentTimeMillis();
285+
286+
// 执行分词
287+
String[] tokens = tokenize(cfg, longRepeatedText);
288+
289+
// 记录结束时间
290+
long endTime = System.currentTimeMillis();
291+
long duration = endTime - startTime;
292+
293+
// 验证分词耗时不超过5秒(5000毫秒)
294+
assert duration <= 5000 : String.format("IK_SMART分词超长叠词耗时%dms,超过5秒限制", duration);
295+
296+
// 验证分词结果不为空
297+
assert tokens.length > 0 : "分词结果不能为空";
298+
299+
System.out.println(String.format("IK_SMART分词超长叠词耗时: %dms, 分词结果数量: %d", duration, tokens.length));
300+
}
301+
266302
/**
267303
* 将类型字符串映射为对应的数字常量
268304
*

0 commit comments

Comments
 (0)