⚡️ Speed up method SentenceSplitter._merge by 28%
#123
📄 28% (0.28x) speedup for SentenceSplitter._merge in llama-index-core/llama_index/core/node_parser/text/sentence.py

⏱️ Runtime: 45.8 milliseconds → 35.9 milliseconds (best of 57 runs)

📝 Explanation and details
The optimization achieves a 27% speedup by addressing key performance bottlenecks in the _merge method that processes text chunks.

Primary optimization - eliminating O(n) operations: replaces splits.pop(0) with index-based iteration (i += 1). The original code performed an O(n) list shift for every pop(0) call, creating quadratic behavior on large input lists. The optimized version uses a simple index increment (O(1) per split) and a single bulk deletion at the end.

Secondary optimizations:
- Caches chunks.append and self.chunk_overlap in locals to avoid repeated attribute lookups in tight loops.
- Instead of repeated cur_chunk.insert(0, ...), builds the overlap list first and inserts it with a slice assignment (cur_chunk[0:0] = overlap), reducing list-shift operations.
- Streamlines _postprocess_chunks for better performance.

Performance impact by test case: the optimization is particularly valuable for document-processing pipelines that handle large texts, where the SentenceSplitter processes hundreds or thousands of text segments. The quadratic behavior of the original implementation would become increasingly problematic as document size grows.
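To illustrate the primary change, here is a minimal standalone sketch (not the actual llama-index source) contrasting the two draining patterns described above:

# Minimal sketch (not the actual llama-index source) contrasting the two
# list-draining patterns.

def drain_with_pop(splits: list) -> list:
    # Original pattern: each pop(0) shifts every remaining element left,
    # so draining n items costs O(n^2) overall.
    out = []
    while splits:
        out.append(splits.pop(0))
    return out

def drain_with_index(splits: list) -> list:
    # Optimized pattern: advance an index (O(1) per item), then remove the
    # consumed prefix once with a single bulk slice deletion.
    out = []
    i = 0
    n = len(splits)
    while i < n:
        out.append(splits[i])
        i += 1
    del splits[:i]  # one O(n) deletion instead of n separate shifts
    return out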
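Similarly, a sketch of the overlap-insertion change (illustrative only; function names here are hypothetical, and the real method builds the overlap list from trailing splits):

# Illustrative sketch of the overlap-prepend change; names are hypothetical,
# not the library's actual code.

def prepend_one_at_a_time(cur_chunk: list, overlap: list) -> None:
    # Original pattern: each insert(0, ...) shifts the entire list right.
    for item in reversed(overlap):
        cur_chunk.insert(0, item)

def prepend_with_slice(cur_chunk: list, overlap: list) -> None:
    # Optimized pattern: splice the prepared overlap list in with one
    # slice assignment, shifting the existing elements only once.
    cur_chunk[0:0] = overlap

chunk = ["c", "d"]
prepend_with_slice(chunk, ["a", "b"])
assert chunk == ["a", "b", "c", "d"]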
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
from typing import List

# imports
import pytest
from llama_index.core.node_parser.text.sentence import SentenceSplitter

# Minimal _Split class for testing
class _Split:
    def __init__(self, text: str, token_size: int, is_sentence: bool = True):
        self.text = text
        self.token_size = token_size
        self.is_sentence = is_sentence

# ------------------- Basic Test Cases -------------------
def test_multiple_chunks_no_overlap():
# Splits require multiple chunks, no overlap
splitter = SentenceSplitter(chunk_size=10, chunk_overlap=0)
splits = [_Split("Hello ", 6), _Split("world!", 6), _Split("Bye.", 4)]
codeflash_output = splitter._merge(splits, 10); result = codeflash_output # 7.06μs -> 7.76μs (9.07% slower)
def test_multiple_chunks_with_overlap():
# Splits require multiple chunks, with overlap
splitter = SentenceSplitter(chunk_size=10, chunk_overlap=6)
splits = [_Split("Hello ", 6), _Split("world!", 6), _Split("Bye.", 4)]
codeflash_output = splitter._merge(splits, 10); result = codeflash_output # 8.05μs -> 9.56μs (15.7% slower)
def test_zero_chunk_overlap():
# Zero overlap, should not repeat any splits
splitter = SentenceSplitter(chunk_size=4, chunk_overlap=0)
splits = [_Split("A", 1), _Split("B", 1), _Split("C", 1), _Split("D", 1), _Split("E", 1)]
codeflash_output = splitter._merge(splits, 4); result = codeflash_output # 7.37μs -> 8.05μs (8.47% slower)
def test_large_number_of_splits():
# Many splits, chunking and overlap
splitter = SentenceSplitter(chunk_size=50, chunk_overlap=10)
splits = [_Split(f"Sentence{i}. ", 5) for i in range(200)] # 200 splits
codeflash_output = splitter._merge(splits.copy(), 50); result = codeflash_output # 67.6μs -> 67.9μs (0.383% slower)
# Check all sentences are present and order is preserved
all_text = "".join(result)
for i in range(200):
pass
def test_large_split_sizes():
# Splits with varying large sizes
splitter = SentenceSplitter(chunk_size=100, chunk_overlap=20)
splits = [_Split("A"*20, 20), _Split("B"*40, 40), _Split("C"*50, 50), _Split("D"*10, 10)]
codeflash_output = splitter._merge(splits.copy(), 100); result = codeflash_output # 6.34μs -> 7.14μs (11.3% slower)
# All text must be present
all_text = "".join(result)
def test_large_chunk_overlap():
# Large overlap, many splits
splitter = SentenceSplitter(chunk_size=50, chunk_overlap=45)
splits = [_Split(f"X{i}", 5) for i in range(20)]
codeflash_output = splitter._merge(splits.copy(), 50); result = codeflash_output # 28.8μs -> 30.1μs (4.43% slower)
# Check for overlapping content
for i in range(1, len(result)):
overlap = set(result[i-1].split("X")) & set(result[i].split("X"))
def test_performance_large_input():
# Test performance with 1000 splits (should not timeout)
splitter = SentenceSplitter(chunk_size=100, chunk_overlap=20)
splits = [_Split("A", 1) for _ in range(1000)]
codeflash_output = splitter._merge(splits.copy(), 100); result = codeflash_output # 275μs -> 241μs (14.5% faster)
def test_varied_is_sentence_flags_large():
# Large input, varied is_sentence flags
splitter = SentenceSplitter(chunk_size=50, chunk_overlap=10)
splits = [_Split(f"Sent{i}.", 5, is_sentence=(i%2==0)) for i in range(100)]
codeflash_output = splitter._merge(splits.copy(), 50); result = codeflash_output # 39.3μs -> 39.4μs (0.269% slower)
# All sentences present
all_text = "".join(result)
for i in range(100):
pass
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from typing import List

# imports
import pytest
from llama_index.core.node_parser.text.sentence import SentenceSplitter

# Minimal _Split class for testing
class _Split:
    def __init__(self, text: str, token_size: int, is_sentence: bool = True):
        self.text = text
        self.token_size = token_size
        self.is_sentence = is_sentence

# ----------- Unit Tests ------------
# -------------------- BASIC TEST CASES --------------------
def test_merge_basic_single_split():
# Test merging a single sentence split
splitter = SentenceSplitter(chunk_overlap=0)
splits = [_Split("Hello world.", 3, True)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 3.65μs -> 4.63μs (21.3% slower)
def test_merge_basic_multiple_splits_fit_chunk():
# Test multiple splits that all fit in one chunk
splitter = SentenceSplitter(chunk_overlap=0)
splits = [_Split("Hello ", 2, True), _Split("world.", 3, True)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 4.30μs -> 5.10μs (15.6% slower)
def test_merge_basic_multiple_chunks_no_overlap():
# Test splitting into multiple chunks, no overlap
splitter = SentenceSplitter(chunk_overlap=0)
splits = [
_Split("A.", 2, True),
_Split("B.", 2, True),
_Split("C.", 2, True),
_Split("D.", 2, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=4); result = codeflash_output # 6.58μs -> 7.25μs (9.16% slower)
def test_merge_basic_multiple_chunks_with_overlap():
# Test splitting into multiple chunks with overlap
splitter = SentenceSplitter(chunk_overlap=2)
splits = [
_Split("A.", 2, True),
_Split("B.", 2, True),
_Split("C.", 2, True),
_Split("D.", 2, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=4); result = codeflash_output # 7.80μs -> 9.00μs (13.4% slower)
def test_merge_basic_is_sentence_false():
# Test that non-sentence splits are still merged correctly
splitter = SentenceSplitter(chunk_overlap=0)
splits = [
_Split("Hello", 2, False),
_Split(" ", 1, False),
_Split("world.", 3, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=6); result = codeflash_output # 4.88μs -> 5.46μs (10.6% slower)
# -------------------- EDGE TEST CASES --------------------
def test_merge_edge_empty_splits():
# Test with empty splits list
splitter = SentenceSplitter()
splits = []
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 1.65μs -> 2.40μs (31.1% slower)
def test_merge_edge_all_whitespace_chunks():
# Test that whitespace-only splits are removed
splitter = SentenceSplitter()
splits = [
_Split(" ", 1, True),
_Split("\n", 1, True),
_Split("Hello", 2, True),
_Split(" ", 1, True),
_Split("world.", 3, True),
_Split(" ", 1, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 5.63μs -> 6.31μs (10.7% slower)
def test_merge_edge_chunk_size_smaller_than_split():
# Test for error when a split exceeds chunk size
splitter = SentenceSplitter()
splits = [_Split("This is a very long sentence.", 50, True)]
with pytest.raises(ValueError):
splitter._merge(splits.copy(), chunk_size=10) # 1.77μs -> 2.21μs (19.7% slower)
def test_merge_edge_overlap_larger_than_chunk_size():
# Test that ValueError is raised if chunk_overlap > chunk_size in __init__
with pytest.raises(ValueError):
SentenceSplitter(chunk_overlap=10)._merge([], chunk_size=5)
def test_merge_edge_overlap_exactly_chunk_size():
# Overlap equal to chunk size (should work)
splitter = SentenceSplitter(chunk_overlap=5)
splits = [
_Split("A", 2, True),
_Split("B", 3, True),
_Split("C", 2, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=5); result = codeflash_output # 7.72μs -> 9.29μs (16.9% slower)
def test_merge_edge_split_token_size_zero():
# Test split with zero token size
splitter = SentenceSplitter()
splits = [_Split("A", 0, True), _Split("B", 2, True)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=2); result = codeflash_output # 4.48μs -> 5.21μs (13.9% slower)
def test_merge_edge_split_is_sentence_false_chunk_boundary():
# Test non-sentence split at chunk boundary
splitter = SentenceSplitter(chunk_overlap=0)
splits = [
_Split("A", 2, False),
_Split("B", 2, False),
_Split("C", 2, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=4); result = codeflash_output # 6.37μs -> 7.09μs (10.1% slower)
def test_merge_edge_leading_trailing_whitespace_in_chunk():
# Test that leading/trailing whitespace is stripped from chunks
splitter = SentenceSplitter()
splits = [
_Split(" Hello", 2, True),
_Split(" world. ", 3, True),
]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 4.63μs -> 5.33μs (13.2% slower)
# -------------------- LARGE SCALE TEST CASES --------------------
def test_merge_large_many_small_splits_no_overlap():
# 1000 splits, each token_size=1, chunk_size=10, overlap=0
splitter = SentenceSplitter(chunk_overlap=0)
splits = [_Split(str(i), 1, True) for i in range(1000)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 281μs -> 247μs (13.7% faster)
for i, chunk in enumerate(result):
expected = "".join(str(j) for j in range(i * 10, (i + 1) * 10))
def test_merge_large_many_small_splits_with_overlap():
# 100 splits, each token_size=1, chunk_size=10, overlap=5
splitter = SentenceSplitter(chunk_overlap=5)
splits = [_Split(str(i), 1, True) for i in range(100)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 49.0μs -> 49.4μs (0.856% slower)
for i, chunk in enumerate(result):
start = i * 5
expected = "".join(str(j) for j in range(start, start + 10))
def test_merge_large_varied_token_sizes():
# 100 splits, alternating token sizes 1 and 3, chunk_size=20, overlap=5
splitter = SentenceSplitter(chunk_overlap=5)
splits = []
for i in range(100):
splits.append(_Split(str(i), 1 if i % 2 == 0 else 3, True))
codeflash_output = splitter._merge(splits.copy(), chunk_size=20); result = codeflash_output # 35.7μs -> 36.3μs (1.73% slower)
# Chunks should not exceed 20 tokens, overlap 5 tokens
# Check that all splits are included in order, and no chunk exceeds 20 tokens
all_text = "".join(split.text for split in splits)
merged_text = "".join(result)
for chunk in result:
pass
def test_merge_large_all_whitespace():
# 500 splits, all whitespace, should produce empty result
splitter = SentenceSplitter()
splits = [_Split(" ", 1, True) for _ in range(500)]
codeflash_output = splitter._merge(splits.copy(), chunk_size=10); result = codeflash_output # 13.7ms -> 10.8ms (26.3% faster)
def test_merge_large_mixed_whitespace_and_text():
# Mix of whitespace and real splits
splitter = SentenceSplitter()
splits = []
for i in range(500):
splits.append(_Split(" ", 1, True))
splits.append(_Split(str(i), 1, True))
codeflash_output = splitter._merge(splits.copy(), chunk_size=20); result = codeflash_output # 31.2ms -> 24.2ms (29.0% faster)
# Should only contain numbers, no whitespace
for chunk in result:
for c in chunk:
pass
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes, run git checkout codeflash/optimize-SentenceSplitter._merge-mhv7v3zs and push.