75 changes: 45 additions & 30 deletions llama-index-core/llama_index/core/node_parser/text/sentence.py
@@ -230,34 +230,44 @@ def _merge(self, splits: List[_Split], chunk_size: int) -> List[str]:
cur_chunk_len = 0
new_chunk = True

def close_chunk() -> None:
nonlocal chunks, cur_chunk, last_chunk, cur_chunk_len, new_chunk

chunks.append("".join([text for text, length in cur_chunk]))
# Bind chunks.append to a local once; avoids an attribute lookup per call
chunks_append = chunks.append

# Avoid repeated attribute lookup in tight loops
chunk_overlap = self.chunk_overlap

def close_chunk() -> None:
nonlocal cur_chunk, last_chunk, cur_chunk_len, new_chunk
chunks_append("".join(text for text, _ in cur_chunk))
last_chunk = cur_chunk
cur_chunk = []
cur_chunk_len = 0
new_chunk = True

# add overlap to the next chunk using the last one first
# there is a small issue with this logic. If the chunk directly after
# the overlap is really big, then we could go over the chunk_size, and
# in theory the correct thing to do would be to remove some/all of the
# overlap. However, it would complicate the logic further without
# much real world benefit, so it's not implemented now.
if len(last_chunk) > 0:
# Add overlap to the next chunk using the last one first
if last_chunk:
last_index = len(last_chunk) - 1
while (
last_index >= 0
and cur_chunk_len + last_chunk[last_index][1] <= self.chunk_overlap
):
text, length = last_chunk[last_index]
cur_chunk_len += length
cur_chunk.insert(0, (text, length))
overlap_len = 0
overlap = []
# Build the overlap in reverse order while scanning backwards, so it can be spliced in with a single operation at the end
while last_index >= 0 and overlap_len + last_chunk[last_index][1] <= chunk_overlap:
overlap_len += last_chunk[last_index][1]
overlap.append(last_chunk[last_index])
last_index -= 1

while len(splits) > 0:
cur_split = splits[0]
if overlap:
# Rather than insert(0, ...) per item, reverse once and splice onto the front
overlap.reverse()
cur_chunk[0:0] = overlap
cur_chunk_len += overlap_len

# Iterate by index instead of popping from the front; pop(0) is O(n) per call
i = 0
splits_len = len(splits)
while i < splits_len:
cur_split = splits[i]
if cur_split.token_size > chunk_size:
raise ValueError("Single token exceeded chunk size")
if cur_chunk_len + cur_split.token_size > chunk_size and not new_chunk:
@@ -272,16 +282,26 @@ def close_chunk() -> None:
# add split to chunk
cur_chunk_len += cur_split.token_size
cur_chunk.append((cur_split.text, cur_split.token_size))
splits.pop(0)
i += 1
new_chunk = False
else:
# close out chunk
close_chunk()

# handle the last chunk
# Drop the consumed splits in one slice deletion rather than a pop(0)
# per split; this preserves the original side effect of emptying `splits`
if i:
del splits[:i]

# Handle the last chunk
if not new_chunk:
chunk = "".join([text for text, length in cur_chunk])
chunks.append(chunk)
chunks_append("".join(text for text, _ in cur_chunk))

# Run postprocessing to strip whitespace and drop empty chunks

# run postprocessing to remove blank spaces
return self._postprocess_chunks(chunks)
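
The headline change in this hunk is replacing `splits.pop(0)` with an index cursor plus a single `del splits[:i]` at the end. `list.pop(0)` shifts every remaining element left, so draining a list from the front is quadratic in the number of splits, while scanning by index and deleting the consumed prefix once is linear. A minimal standalone sketch of the two patterns (function names are illustrative, not part of the library):

import timeit

def drain_with_pop(items):
    # O(n^2) overall: every pop(0) shifts the remaining elements left
    out = []
    while items:
        out.append(items.pop(0))
    return out

def drain_with_index(items):
    # O(n) overall: walk by index, then drop the consumed prefix once
    out = []
    i = 0
    n = len(items)
    while i < n:
        out.append(items[i])
        i += 1
    del items[:i]  # one slice deletion keeps the "consume the input" behavior
    return out

if __name__ == "__main__":
    data = list(range(20_000))
    print("pop(0):    ", timeit.timeit(lambda: drain_with_pop(list(data)), number=10))
    print("index+del: ", timeit.timeit(lambda: drain_with_index(list(data)), number=10))

Where a true FIFO queue is needed, `collections.deque.popleft()` is the other standard fix; here the slice deletion keeps `splits` a plain list and preserves the original behavior of consuming it.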
@@ -290,13 +310,8 @@ def _postprocess_chunks(self, chunks: List[str]) -> List[str]:
"""Post-process chunks.
Remove whitespace-only chunks and strip leading and trailing whitespace.
"""
new_chunks = []
for chunk in chunks:
stripped_chunk = chunk.strip()
if stripped_chunk == "":
continue
new_chunks.append(stripped_chunk)
return new_chunks
# List comprehension instead of an explicit loop; the walrus binding
# keeps strip() to a single call per chunk
return [stripped for chunk in chunks if (stripped := chunk.strip())]

def _token_size(self, text: str) -> int:
return len(self._tokenizer(text))
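
Two smaller patterns in this diff deserve a note: binding `chunks.append` to a local skips one attribute lookup per call, and building the overlap in reverse then splicing it with `cur_chunk[0:0] = overlap` replaces k front-inserts (each of which shifts the whole list) with one reverse and one splice. A rough standalone illustration of both; timings are machine-dependent and all names here are illustrative, not the library's API:

import timeit
from typing import List

def build_plain(parts: List[str]) -> str:
    out = []
    for p in parts:
        out.append(p)          # attribute lookup on every iteration
    return "".join(out)

def build_prebound(parts: List[str]) -> str:
    out = []
    out_append = out.append   # bind the method once, reuse the local
    for p in parts:
        out_append(p)
    return "".join(out)

def prepend_one_by_one(chunk: List[int], prefix_items: List[int]) -> None:
    for item in reversed(prefix_items):
        chunk.insert(0, item)  # each insert(0, ...) shifts the whole list

def prepend_with_splice(chunk: List[int], prefix_items: List[int]) -> None:
    prefix = []
    for item in reversed(prefix_items):  # scan backwards, as the diff does
        prefix.append(item)
    prefix.reverse()           # restore original order
    chunk[0:0] = prefix        # one splice instead of k front-inserts

if __name__ == "__main__":
    parts = ["x"] * 100_000
    print("append:   ", timeit.timeit(lambda: build_plain(parts), number=50))
    print("pre-bound:", timeit.timeit(lambda: build_prebound(parts), number=50))
    base, src = list(range(10_000)), list(range(200))
    print("insert(0):", timeit.timeit(lambda: prepend_one_by_one(list(base), src), number=200))
    print("splice:   ", timeit.timeit(lambda: prepend_with_splice(list(base), src), number=200))

Both are micro-optimizations; they likely pay off only in hot paths like `_merge`, which runs once per text being split.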