Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions core/src/main/java/org/wltea/analyzer/core/IKArbitrator.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,82 @@ void process(AnalyzeContext context , boolean useSmart){
}
}

/**
* 检测是否为叠词模式
* @param lexemeCell 词元链表头
* @return 如果检测到叠词模式返回简化的路径,否则返回null
*/
/**
 * Detects a degenerate "repeated word" lexeme chain that would otherwise
 * explode the ambiguity search in judge().
 *
 * The chain is treated as a repeated-word pattern when it contains more
 * than 5 lexemes longer than 10 characters, or more than 50 lexemes in
 * total. In that case a simplified two-lexeme path is returned (the first
 * lexeme plus one merged lexeme covering the remainder of the span) so the
 * arbitrator can skip the exponential path enumeration.
 *
 * @param lexemeCell head of the lexeme chain
 * @return a simplified LexemePath when the pattern is detected, otherwise null
 */
private LexemePath detectRepeatedWords(QuickSortSet.Cell lexemeCell) {
    if (lexemeCell == null || lexemeCell.getLexeme() == null) {
        return null;
    }

    int longLexemeCount = 0;
    int totalCount = 0;
    Lexeme firstLexeme = null;
    Lexeme lastLexeme = null;
    boolean repeatedPattern = false;

    // Scan the WHOLE chain before building the path. Returning at the
    // detection point (as the previous version did) left lastLexeme at the
    // 51st / 6th lexeme, so the merged remainder lexeme truncated coverage
    // of everything after it.
    for (QuickSortSet.Cell current = lexemeCell;
            current != null && current.getLexeme() != null;
            current = current.getNext()) {
        Lexeme lexeme = current.getLexeme();
        if (firstLexeme == null) {
            firstLexeme = lexeme;
        }
        lastLexeme = lexeme;

        if (lexeme.getLength() > 10) {
            longLexemeCount++;
        }
        totalCount++;

        // Many long lexemes, or too many lexemes overall -> repeated words.
        if (longLexemeCount > 5 || totalCount > 50) {
            repeatedPattern = true;
        }
    }

    if (!repeatedPattern) {
        return null;
    }

    // Simplified path: first lexeme + one merged lexeme for the rest.
    LexemePath simplifiedPath = new LexemePath();
    simplifiedPath.addNotCrossLexeme(firstLexeme);

    if (totalCount > 1 && lastLexeme != null) {
        // Character span of the remainder, right after the first lexeme.
        int remainStart = firstLexeme.getBegin() + firstLexeme.getLength();
        int remainEnd = lastLexeme.getBegin() + lastLexeme.getLength();
        int remainLength = remainEnd - remainStart;

        if (remainLength > 0) {
            // Offset of the merged tail = first lexeme offset + its length.
            int remainOffset = firstLexeme.getOffset() + firstLexeme.getLength();
            Lexeme remainLexeme = new Lexeme(remainOffset, remainStart, remainLength, Lexeme.TYPE_CNCHAR);
            simplifiedPath.addNotCrossLexeme(remainLexeme);
        }
    }

    return simplifiedPath;
}

/**
* 歧义识别
* @param lexemeCell 歧义路径链表头
* @param fullTextLength 歧义路径文本长度
* @return
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
// 首先检测是否为叠词模式,如果是则直接返回简化路径
LexemePath simplifiedPath = this.detectRepeatedWords(lexemeCell);
if (simplifiedPath != null) {
//System.out.println("Detected repeated words pattern, using simplified path");
return simplifiedPath;
}

//候选路径集合
TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
//候选结果路径
Expand Down Expand Up @@ -128,6 +197,14 @@ private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , Lexe
QuickSortSet.Cell c = lexemeCell;
//迭代遍历Lexeme链表
while(c != null && c.getLexeme() != null){
//限制大长度叠词,避免性能问题和整数溢出
if(c.getLexeme().getLength() > 10){
//System.out.println("already repeat words 10 times");
//跳过过长的词元
c = c.getNext();
continue;
}

if(!option.addNotCrossLexeme(c.getLexeme())){
//词元交叉,添加失败则加入lexemeStack栈
conflictStack.push(c);
Expand Down
16 changes: 13 additions & 3 deletions core/src/main/java/org/wltea/analyzer/core/LexemePath.java
Original file line number Diff line number Diff line change
Expand Up @@ -156,13 +156,23 @@ int getPathLength(){
* @return
*/
/**
 * Computes the cross-product weight of the path: the product of the
 * lengths of all lexemes on it. Each factor is capped at 10 and the
 * result saturates at Integer.MAX_VALUE so long repeated-word chains
 * cannot overflow.
 */
int getXWeight(){
    long product = 1;
    for (Cell cell = this.getHead(); cell != null && cell.getLexeme() != null; cell = cell.getNext()) {
        // Cap each factor to keep the accumulator bounded for huge chains.
        int length = cell.getLexeme().getLength();
        product *= (length > 10) ? 10 : length;
        // Saturate as soon as the running product leaves int range; since
        // each step multiplies by at most 10, the long itself cannot overflow.
        if (product > Integer.MAX_VALUE) {
            return Integer.MAX_VALUE;
        }
    }
    return (int) product;
}

/**
Expand Down
36 changes: 36 additions & 0 deletions core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,42 @@ static List<TokenInfo> tokenizeWithType(Configuration configuration, String s) {
return tokenInfos;
}

/**
 * Performance test: tokenizes a very long run of repeated characters with
 * the ik_smart analyzer and fails if tokenization takes more than 5 seconds.
 * Guards against the exponential ambiguity-arbitration blowup on repeated
 * words.
 */
@Test
public void tokenize_smart_long_repeated_words_performance()
{
    Configuration cfg = TestUtils.createFakeConfigurationSub(true);

    // Build the input by repeating a 10-character word 1000 times
    // (10,000 characters total). The previous loop bound (1001) did not
    // match the documented repeat count.
    StringBuilder sb = new StringBuilder();
    String repeatedWord = "哈哈哈哈哈哈哈哈哈哈";
    for (int i = 0; i < 1000; i++) {
        sb.append(repeatedWord);
    }
    String longRepeatedText = sb.toString();

    // Time the tokenization only.
    long startTime = System.currentTimeMillis();
    String[] tokens = tokenize(cfg, longRepeatedText);
    long duration = System.currentTimeMillis() - startTime;

    // Must finish within 5 seconds (5000 ms).
    assert duration <= 5000 : String.format("IK_SMART分词超长叠词耗时%dms,超过5秒限制", duration);

    // The tokenizer must produce at least one token.
    assert tokens.length > 0 : "分词结果不能为空";

    System.out.println(String.format("IK_SMART分词超长叠词耗时: %dms, 分词结果数量: %d", duration, tokens.length));
}

/**
* 将类型字符串映射为对应的数字常量
*
Expand Down