
Commit dc489c8

perf: implement memory optimization for file processing and content handling

1 parent: 590e55a

File tree: 3 files changed, +115 -20 lines

- src/gitingest/ingestion.py
- src/gitingest/output_formatter.py
- src/gitingest/schemas/filesystem.py

src/gitingest/ingestion.py

Lines changed: 34 additions & 3 deletions

@@ -1,4 +1,12 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Lazy loading: File content is only loaded when accessed and cached to avoid repeated reads
+- Chunked reading: Large files are read in chunks to avoid loading everything at once
+- Content cache clearing: Periodically clears content cache to free memory during processing
+- Memory limits: Skips files that would cause excessive memory usage
+- Early termination: Stops processing when limits are reached
+"""
 
 from __future__ import annotations
 
@@ -65,7 +73,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
             msg = f"File {file_node.name} has no content"
             raise ValueError(msg)
 
-        return format_node(file_node, query=query)
+        result = format_node(file_node, query=query)
+
+        # Clear content cache to free memory
+        file_node.clear_content_cache()
+
+        return result
 
     root_node = FileSystemNode(
         name=path.name,
@@ -78,7 +91,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
 
     _process_node(node=root_node, query=query, stats=stats)
 
-    return format_node(root_node, query=query)
+    result = format_node(root_node, query=query)
+
+    # Clear content cache to free memory after formatting
+    root_node.clear_content_cache()
+
+    return result
 
 
 def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None:
@@ -173,6 +191,8 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
     This function checks the file's size, increments the statistics, and reads its content.
     If the file size exceeds the maximum allowed, it raises an error.
 
+    Implements memory optimization by checking limits before processing files.
+
     Parameters
     ----------
     path : Path
@@ -194,6 +214,11 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
         print(f"Skipping file {path}: would exceed total size limit")
         return
 
+    # Skip very large files that would consume too much memory
+    if file_size > MAX_TOTAL_SIZE_BYTES / 10:  # Limit single file to 10% of total limit
+        print(f"Skipping file {path}: file is too large for memory-efficient processing")
+        return
+
     stats.total_files += 1
     stats.total_size += file_size
 
@@ -211,6 +236,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
     parent_node.size += file_size
     parent_node.file_count += 1
 
+    # If we've processed a lot of files, clear any cached content to free memory
+    if stats.total_files % 100 == 0:
+        for sibling in parent_node.children[:-10]:  # Keep the 10 most recent files cached
+            if sibling.type == FileSystemNodeType.FILE:
+                sibling.clear_content_cache()
+
 
 def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
    """Check if any of the traversal limits have been exceeded.

src/gitingest/output_formatter.py

Lines changed: 22 additions & 6 deletions

@@ -1,8 +1,14 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Generator-based processing: Uses generators to process files one at a time
+- Streaming approach: Avoids loading all file contents into memory at once
+- Works with lazy loading: Complements the lazy loading in FileSystemNode
+"""
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Generator
 
 import tiktoken
 
@@ -99,6 +105,9 @@ def _gather_file_contents(node: FileSystemNode) -> str:
     This function recursively processes a directory node and gathers the contents of all files
     under that node. It returns the concatenated content of all files as a single string.
 
+    The implementation is memory-efficient, processing one file at a time and using
+    generators to avoid loading all content into memory at once.
+
     Parameters
     ----------
     node : FileSystemNode
@@ -110,11 +119,18 @@ def _gather_file_contents(node: FileSystemNode) -> str:
         The concatenated content of all files under the given node.
 
     """
-    if node.type != FileSystemNodeType.DIRECTORY:
-        return node.content_string
 
-    # Recursively gather contents of all files under the current directory
-    return "\n".join(_gather_file_contents(child) for child in node.children)
+    def _gather_contents_generator(node: FileSystemNode) -> Generator[str]:
+        """Yield file contents one at a time."""
+        if node.type != FileSystemNodeType.DIRECTORY:
+            yield node.content_string
+        else:
+            # Process one child at a time to avoid loading all content at once
+            for child in node.children:
+                yield from _gather_contents_generator(child)
+
+    # Join the generator results with newlines, processing one file at a time
+    return "\n".join(_gather_contents_generator(node))
 
 
 def _create_tree_structure(
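For reference, the generator pattern in _gather_file_contents can be exercised standalone. The sketch below uses a simplified Node type as a stand-in for gitingest's FileSystemNode; the tree is walked lazily, yielding one file's content at a time rather than building nested intermediate strings at each directory level.

# Sketch of the generator-based gathering above; Node is a simplified
# stand-in for FileSystemNode, not the real schema.
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Generator


@dataclass
class Node:
    content: str = ""
    children: list[Node] = field(default_factory=list)


def gather(node: Node) -> Generator[str, None, None]:
    """Yield file contents one node at a time while walking the tree."""
    if not node.children:  # treat a leaf as a file
        yield node.content
    else:  # treat a node with children as a directory
        for child in node.children:
            yield from gather(child)


tree = Node(children=[Node(content="a"), Node(children=[Node(content="b")])])
print("\n".join(gather(tree)))  # -> "a\nb"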

src/gitingest/schemas/filesystem.py

Lines changed: 59 additions & 11 deletions

@@ -1,4 +1,11 @@
-"""Define the schema for the filesystem representation."""
+"""Define the schema for the filesystem representation.
+
+Memory optimization:
+- Lazy loading: File content is only loaded when the content property is accessed
+- Content caching: Content is cached to avoid repeated file reads
+- Cache clearing: The clear_content_cache method allows freeing memory when content is no longer needed
+- Chunked reading: Large files are read in chunks to avoid loading everything at once
+"""
 
 from __future__ import annotations
 
@@ -49,6 +56,7 @@ class FileSystemNode:  # pylint: disable=too-many-instance-attributes
    dir_count: int = 0
    depth: int = 0
    children: list[FileSystemNode] = field(default_factory=list)
+    _content_cache: str | None = field(default=None, repr=False)
 
    def sort_children(self) -> None:
        """Sort the children nodes of a directory according to a specific order.
@@ -83,6 +91,18 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]:
 
        self.children.sort(key=_sort_key)
 
+    def clear_content_cache(self) -> None:
+        """Clear the cached content to free up memory.
+
+        This method clears the content cache of this node and all its children recursively,
+        allowing the garbage collector to reclaim memory used by file contents.
+        """
+        self._content_cache = None
+
+        # Recursively clear cache for all children
+        for child in self.children:
+            child.clear_content_cache()
+
    @property
    def content_string(self) -> str:
        """Return the content of the node as a string, including path and content.
@@ -104,12 +124,15 @@ def content_string(self) -> str:
        return "\n".join(parts) + "\n\n"
 
    @property
-    def content(self) -> str:  # pylint: disable=too-many-return-statements
+    def content(self) -> str:  # pylint: disable=too-many-return-statements,too-many-branches  # noqa: C901,PLR0912
        """Return file content (if text / notebook) or an explanatory placeholder.
 
        Heuristically decides whether the file is text or binary by decoding a small chunk of the file
        with multiple encodings and checking for common binary markers.
 
+        Uses lazy loading to avoid loading the entire file into memory until needed,
+        and caches the result to avoid repeated file reads.
+
        Returns
        -------
        str
@@ -121,29 +144,40 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
            If the node is a directory.
 
        """
+        # Return cached content if available
+        if self._content_cache is not None:
+            return self._content_cache
+
        if self.type == FileSystemNodeType.DIRECTORY:
            msg = "Cannot read content of a directory node"
            raise ValueError(msg)
 
        if self.type == FileSystemNodeType.SYMLINK:
-            return ""  # TODO: are we including the empty content of symlinks?
+            self._content_cache = ""  # TODO: are we including the empty content of symlinks?
+            return self._content_cache
 
        if self.path.suffix == ".ipynb":  # Notebook
            try:
-                return process_notebook(self.path)
+                self._content_cache = process_notebook(self.path)
            except Exception as exc:
-                return f"Error processing notebook: {exc}"
+                self._content_cache = f"Error processing notebook: {exc}"
+            else:
+                return self._content_cache
+            return self._content_cache
 
        chunk = _read_chunk(self.path)
 
        if chunk is None:
-            return "Error reading file"
+            self._content_cache = "Error reading file"
+            return self._content_cache
 
        if chunk == b"":
-            return "[Empty file]"
+            self._content_cache = "[Empty file]"
+            return self._content_cache
 
        if not _decodes(chunk, "utf-8"):
-            return "[Binary file]"
+            self._content_cache = "[Binary file]"
+            return self._content_cache
 
        # Find the first encoding that decodes the sample
        good_enc: str | None = next(
@@ -152,10 +186,24 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
        )
 
        if good_enc is None:
-            return "Error: Unable to decode file with available encodings"
+            self._content_cache = "Error: Unable to decode file with available encodings"
+            return self._content_cache
 
        try:
+            # Read file in chunks to avoid loading large files entirely into memory
+            content_chunks = []
+            chunk_size = 1024 * 1024  # 1MB chunks
+
            with self.path.open(encoding=good_enc) as fp:
-                return fp.read()
+                while True:
+                    chunk = fp.read(chunk_size)
+                    if not chunk:
+                        break
+                    content_chunks.append(chunk)
+
+            self._content_cache = "".join(content_chunks)
        except (OSError, UnicodeDecodeError) as exc:
-            return f"Error reading file with {good_enc!r}: {exc}"
+            self._content_cache = f"Error reading file with {good_enc!r}: {exc}"
+        else:
+            return self._content_cache
+        return self._content_cache
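The three techniques in this file (lazy loading, content caching, chunked reads) compose into one small pattern. A condensed sketch follows, using a hypothetical LazyFile class in place of the full FileSystemNode and assuming UTF-8 text; the real property also handles notebooks, symlinks, binary detection, and encoding fallbacks.

# Condensed sketch of the lazy-load / cache / chunked-read pattern used by
# FileSystemNode.content above. LazyFile is a hypothetical stand-in class.
from __future__ import annotations

from pathlib import Path


class LazyFile:
    def __init__(self, path: Path) -> None:
        self.path = path
        self._content_cache: str | None = None

    @property
    def content(self) -> str:
        # Lazy loading: nothing is read until the property is first accessed.
        if self._content_cache is not None:
            return self._content_cache
        chunks: list[str] = []
        chunk_size = 1024 * 1024  # read in 1 MB chunks instead of one fp.read()
        with self.path.open(encoding="utf-8") as fp:  # assumes UTF-8 text
            while chunk := fp.read(chunk_size):
                chunks.append(chunk)
        self._content_cache = "".join(chunks)
        return self._content_cache

    def clear_content_cache(self) -> None:
        # Drop the cached text so the garbage collector can reclaim it.
        self._content_cache = None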
