Skip to content

Commit 9fe1546

Browse files
authored
Fix the problem caused by long repeated words in issue #1119 (#1120)
* fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119 * fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119 * fix : add detectRepeatedWords method in IK_smart mode, to fix problem #1119
1 parent f8b9e07 commit 9fe1546

File tree

3 files changed

+126
-3
lines changed

3 files changed

+126
-3
lines changed

core/src/main/java/org/wltea/analyzer/core/IKArbitrator.java

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,82 @@ void process(AnalyzeContext context , boolean useSmart){
8383
}
8484
}
8585

86+
/**
87+
* 检测是否为叠词模式
88+
* @param lexemeCell 词元链表头
89+
* @return 如果检测到叠词模式返回简化的路径,否则返回null
90+
*/
91+
private LexemePath detectRepeatedWords(QuickSortSet.Cell lexemeCell) {
92+
if (lexemeCell == null || lexemeCell.getLexeme() == null) {
93+
return null;
94+
}
95+
96+
// 检查是否有连续的长词元(长度>10)或大量重复词元
97+
QuickSortSet.Cell current = lexemeCell;
98+
int longLexemeCount = 0;
99+
int totalCount = 0;
100+
Lexeme firstLexeme = null;
101+
Lexeme lastLexeme = null;
102+
103+
while (current != null && current.getLexeme() != null) {
104+
Lexeme lexeme = current.getLexeme();
105+
if (firstLexeme == null) {
106+
firstLexeme = lexeme;
107+
}
108+
lastLexeme = lexeme;
109+
110+
if (lexeme.getLength() > 10) {
111+
longLexemeCount++;
112+
}
113+
totalCount++;
114+
115+
// 如果发现多个长词元或词元总数过多,认为是叠词
116+
if (longLexemeCount > 5 || totalCount > 50) {
117+
// 构造简化路径:第一个词元 + 剩余部分合并为一个词元
118+
LexemePath simplifiedPath = new LexemePath();
119+
120+
// 添加第一个词元
121+
simplifiedPath.addNotCrossLexeme(firstLexeme);
122+
123+
// 如果有剩余部分,创建一个合并的词元
124+
if (totalCount > 1 && lastLexeme != null) {
125+
// 计算剩余部分的起始位置和长度
126+
int remainStart = firstLexeme.getBegin() + firstLexeme.getLength();
127+
int remainEnd = lastLexeme.getBegin() + lastLexeme.getLength();
128+
int remainLength = remainEnd - remainStart;
129+
130+
if (remainLength > 0) {
131+
// 创建一个表示剩余部分的词元
132+
// offset 应该是第一个词元的 offset + 第一个词元的长度
133+
int remainOffset = firstLexeme.getOffset() + firstLexeme.getLength();
134+
Lexeme remainLexeme = new Lexeme(remainOffset, remainStart, remainLength, Lexeme.TYPE_CNCHAR);
135+
simplifiedPath.addNotCrossLexeme(remainLexeme);
136+
}
137+
}
138+
139+
return simplifiedPath;
140+
}
141+
142+
current = current.getNext();
143+
}
144+
145+
return null; // 没有检测到叠词模式
146+
}
147+
86148
/**
87149
* 歧义识别
88150
* @param lexemeCell 歧义路径链表头
89151
* @param fullTextLength 歧义路径文本长度
90152
* @return
91153
*/
92154
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
155+
// 首先检测是否为叠词模式,如果是则直接返回简化路径
156+
LexemePath simplifiedPath = this.detectRepeatedWords(lexemeCell);
157+
if (simplifiedPath != null) {
158+
//System.out.println("Detected repeated words pattern, using simplified path");
159+
return simplifiedPath;
160+
}
161+
93162
//候选路径集合
94163
TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
95164
//候选结果路径
@@ -128,6 +197,14 @@ private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , Lexe
128197
QuickSortSet.Cell c = lexemeCell;
129198
//迭代遍历Lexeme链表
130199
while(c != null && c.getLexeme() != null){
200+
//限制大长度叠词,避免性能问题和整数溢出
201+
if(c.getLexeme().getLength() > 10){
202+
//System.out.println("already repeat words 10 times");
203+
//跳过过长的词元
204+
c = c.getNext();
205+
continue;
206+
}
207+
131208
if(!option.addNotCrossLexeme(c.getLexeme())){
132209
//词元交叉,添加失败则加入lexemeStack栈
133210
conflictStack.push(c);

core/src/main/java/org/wltea/analyzer/core/LexemePath.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,23 @@ int getPathLength(){
156156
* @return
157157
*/
158158
int getXWeight(){
159-
int product = 1;
159+
long product = 1;
160160
Cell c = this.getHead();
161161
while( c != null && c.getLexeme() != null){
162-
product *= c.getLexeme().getLength();
162+
int length = c.getLexeme().getLength();
163+
//限制长度避免溢出
164+
if(length > 10){
165+
length = 10;
166+
}
167+
product *= length;
168+
//防止溢出
169+
if(product > Integer.MAX_VALUE){
170+
//System.out.println("weight too long");
171+
return Integer.MAX_VALUE;
172+
}
163173
c = c.getNext();
164174
}
165-
return product;
175+
return (int)product;
166176
}
167177

168178
/**

core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,42 @@ static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) {
263263
return tokenInfos;
264264
}
265265

266+
/**
267+
* 用ik_smart分词器测试超长叠词性能
268+
* 如果分词耗时超过5秒则测试失败
269+
*/
270+
@Test
271+
public void tokenize_smart_long_repeated_words_performance()
272+
{
273+
Configuration cfg = TestUtils.createFakeConfigurationSub(true);
274+
275+
// 构建超长叠词:重复"哈哈哈哈哈哈哈哈哈哈"1000次
276+
StringBuilder sb = new StringBuilder();
277+
String repeatedWord = "哈哈哈哈哈哈哈哈哈哈";
278+
for (int i = 0; i < 1001; i++) {
279+
sb.append(repeatedWord);
280+
}
281+
String longRepeatedText = sb.toString();
282+
283+
// 记录开始时间
284+
long startTime = System.currentTimeMillis();
285+
286+
// 执行分词
287+
String[] tokens = tokenize(cfg, longRepeatedText);
288+
289+
// 记录结束时间
290+
long endTime = System.currentTimeMillis();
291+
long duration = endTime - startTime;
292+
293+
// 验证分词耗时不超过5秒(5000毫秒)
294+
assert duration <= 5000 : String.format("IK_SMART分词超长叠词耗时%dms,超过5秒限制", duration);
295+
296+
// 验证分词结果不为空
297+
assert tokens.length > 0 : "分词结果不能为空";
298+
299+
System.out.println(String.format("IK_SMART分词超长叠词耗时: %dms, 分词结果数量: %d", duration, tokens.length));
300+
}
301+
266302
/**
267303
* 将类型字符串映射为对应的数字常量
268304
*

0 commit comments

Comments
 (0)