VCPToolBox/TextChunker.js at main · B3000Kcn/VCPToolBox · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
// TextChunker.js
require('dotenv').config({ path: './config.env' });
const { get_encoding } = require("@dqbd/tiktoken"); // tokenizer used for exact token counting (must be installed)
const encoding = get_encoding("cl100k_base"); // encoding commonly used by gpt-4, gpt-3.5 and embedding models

// Read the embedding model's token limit from config.env. Anything that is not a
// positive integer (missing, non-numeric, zero or negative) falls back to 8000 so
// a bad configuration value can never produce a zero or negative chunk budget.
const configuredMaxToken = Number.parseInt(process.env.WhitelistEmbeddingModelMaxToken, 10);
const embeddingMaxToken = configuredMaxToken > 0 ? configuredMaxToken : 8000;
// Keep a 15% safety margin below the model limit; never let the budget drop below one token.
const safeMaxTokens = Math.max(1, Math.floor(embeddingMaxToken * 0.85));
const defaultOverlapTokens = Math.floor(safeMaxTokens * 0.1); // overlap is 10% of the per-chunk budget

console.log(`[TextChunker] 配置加载: MaxToken=${embeddingMaxToken}, SafeMaxTokens=${safeMaxTokens}, OverlapTokens=${defaultOverlapTokens}`);

/**
 * Splits text into chunks that each stay within a token budget.
 *
 * The text is cut into sentences (after `。？！.!?` or a newline, keeping the
 * delimiter) and sentences are packed greedily into chunks. When a chunk is
 * full, the next chunk starts with a few trailing sentences of the previous
 * one so context carries over. A single sentence that is larger than
 * `maxTokens` is handed to `forceSplitLongText`.
 *
 * Token counts are summed per sentence, so the count of a joined chunk is an
 * approximation of tokenizing the chunk as a whole.
 *
 * @param {string} text - The raw text to split. Falsy input yields `[]`.
 * @param {number} maxTokens - Maximum number of tokens per chunk.
 * @param {number} overlapTokens - Upper bound on tokens repeated from the previous chunk.
 * @returns {string[]} The trimmed, non-empty chunks in document order.
 */
function chunkText(text, maxTokens = safeMaxTokens, overlapTokens = defaultOverlapTokens) {
    if (!text) return [];

    // Split after sentence terminators / newlines; the lookbehind keeps the delimiter attached.
    const sentences = text.split(/(?<=[。？！.!?\n])/g);
    // Token counts are cached because the overlap look-back revisits earlier sentences.
    const tokenCounts = new Array(sentences.length);
    const chunks = [];
    let currentChunk = "";
    let currentTokens = 0;

    // Emits the accumulated chunk (unless it is only whitespace) and resets the accumulator.
    const flushCurrentChunk = () => {
        const trimmed = currentChunk.trim();
        if (trimmed) {
            chunks.push(trimmed);
        }
        currentChunk = "";
        currentTokens = 0;
    };

    for (let i = 0; i < sentences.length; i++) {
        const sentence = sentences[i];
        const sentenceTokens = encoding.encode(sentence).length;
        tokenCounts[i] = sentenceTokens;

        // A single sentence that cannot fit in any chunk has to be split by force.
        if (sentenceTokens > maxTokens) {
            flushCurrentChunk();
            chunks.push(...forceSplitLongText(sentence, maxTokens, overlapTokens));
            continue;
        }

        if (currentTokens + sentenceTokens > maxTokens) {
            flushCurrentChunk();

            // Seed the new chunk with trailing sentences of the previous one. The
            // budget leaves room for the upcoming sentence so that overlap plus
            // sentence can never exceed maxTokens.
            const overlapBudget = Math.min(overlapTokens, maxTokens - sentenceTokens);
            for (let j = i - 1; j >= 0; j--) {
                if (currentTokens + tokenCounts[j] > overlapBudget) break;
                currentChunk = sentences[j] + currentChunk;
                currentTokens += tokenCounts[j];
            }
        }

        currentChunk += sentence;
        currentTokens += sentenceTokens;
    }

    flushCurrentChunk();

    return chunks;
}

// Characters after which a forced split may be placed, so a chunk ends on
// punctuation or whitespace instead of in the middle of a word.
const FORCE_SPLIT_BREAK_CHARS = new Set(['\n', '。', '！', '？', '，', '；', '：', ' ', '\t']);

// Only this many trailing characters of a window are searched for a break character.
const FORCE_SPLIT_LOOKBACK_CHARS = 200;

/**
 * Splits a text that exceeds the token budget into token-bounded pieces.
 *
 * The text is tokenized once and walked in windows of at most `maxTokens`
 * tokens. Each non-final window is cut back to the last break character found
 * in its final 200 characters, when that still makes forward progress.
 * Consecutive pieces share up to `overlapTokens` tokens.
 *
 * Note: a window edge may fall inside a multi-byte character, in which case the
 * decoder substitutes U+FFFD at that edge.
 *
 * @param {string} text - The over-long text to split.
 * @param {number} maxTokens - Maximum number of tokens per piece.
 * @param {number} overlapTokens - Number of tokens shared between consecutive pieces.
 * @returns {string[]} The trimmed, non-empty pieces in order.
 */
function forceSplitLongText(text, maxTokens, overlapTokens) {
    const chunks = [];
    const tokens = encoding.encode(text);
    const decoder = new TextDecoder('utf-8');

    // encode() yields token ids, not bytes: the ids must first be turned back into
    // UTF-8 bytes by the tokenizer, and only those bytes go through TextDecoder.
    const decodeTokens = (tokenSlice) => decoder.decode(encoding.decode(tokenSlice));

    // The overlap must be non-negative and strictly smaller than the window,
    // otherwise the walk would skip tokens or stop advancing.
    const safeOverlap = Math.max(0, Math.min(overlapTokens, maxTokens - 1));

    let start = 0;
    while (start < tokens.length) {
        let end = Math.min(start + maxTokens, tokens.length);

        if (end < tokens.length) {
            const windowText = decodeTokens(tokens.slice(start, end));

            // Look for the last punctuation/whitespace character near the end of the window.
            const lookbackFloor = Math.max(0, windowText.length - FORCE_SPLIT_LOOKBACK_CHARS);
            let breakIndex = -1;
            for (let i = windowText.length - 1; i >= lookbackFloor; i--) {
                if (FORCE_SPLIT_BREAK_CHARS.has(windowText[i])) {
                    breakIndex = i + 1;
                    break;
                }
            }

            let finalText = windowText;
            if (breakIndex > 0) {
                const candidateText = windowText.substring(0, breakIndex);
                // Re-encoding may yield a different token count than the original
                // slice; never move `end` beyond the window that was decoded.
                const candidateEnd = Math.min(end, start + encoding.encode(candidateText).length);

                // Accept the break only if the next window still starts after this one.
                if (candidateEnd - start > safeOverlap || candidateEnd === end) {
                    finalText = candidateText;
                    end = candidateEnd;
                }
            }

            chunks.push(finalText.trim());
        } else {
            // Final window: everything that is left. Stop here, otherwise the
            // overlap rewind would re-emit ever-shorter suffixes of this piece.
            chunks.push(decodeTokens(tokens.slice(start)).trim());
            break;
        }

        // Rewind by the overlap, but always advance by at least one token.
        start = Math.max(start + 1, end - safeOverlap);
    }

    return chunks.filter(chunk => chunk.length > 0);
}

// Public API: only chunkText is exported; forceSplitLongText stays internal to this module.
module.exports = { chunkText };