
Commit dc489c8

perf: implement memory optimization for file processing and content handling

1 parent: 590e55a

File tree: 3 files changed, +115 -20 lines

- src/gitingest/ingestion.py
- src/gitingest/output_formatter.py
- src/gitingest/schemas/filesystem.py

src/gitingest/ingestion.py

Lines changed: 34 additions & 3 deletions

@@ -1,4 +1,12 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Lazy loading: File content is only loaded when accessed and cached to avoid repeated reads
+- Chunked reading: Large files are read in chunks to avoid loading everything at once
+- Content cache clearing: Periodically clears content cache to free memory during processing
+- Memory limits: Skips files that would cause excessive memory usage
+- Early termination: Stops processing when limits are reached
+"""
 
 from __future__ import annotations
 
@@ -65,7 +73,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
             msg = f"File {file_node.name} has no content"
             raise ValueError(msg)
 
-        return format_node(file_node, query=query)
+        result = format_node(file_node, query=query)
+
+        # Clear content cache to free memory
+        file_node.clear_content_cache()
+
+        return result
 
     root_node = FileSystemNode(
         name=path.name,
@@ -78,7 +91,12 @@ def ingest_query(query: IngestionQuery) -> tuple[str, str, str]:
 
     _process_node(node=root_node, query=query, stats=stats)
 
-    return format_node(root_node, query=query)
+    result = format_node(root_node, query=query)
+
+    # Clear content cache to free memory after formatting
+    root_node.clear_content_cache()
+
+    return result
 
 
 def _process_node(node: FileSystemNode, query: IngestionQuery, stats: FileSystemStats) -> None:
@@ -173,6 +191,8 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
     This function checks the file's size, increments the statistics, and reads its content.
     If the file size exceeds the maximum allowed, it raises an error.
 
+    Implements memory optimization by checking limits before processing files.
+
     Parameters
     ----------
     path : Path
@@ -194,6 +214,11 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
         print(f"Skipping file {path}: would exceed total size limit")
         return
 
+    # Skip very large files that would consume too much memory
+    if file_size > MAX_TOTAL_SIZE_BYTES / 10:  # Limit single file to 10% of total limit
+        print(f"Skipping file {path}: file is too large for memory-efficient processing")
+        return
+
     stats.total_files += 1
     stats.total_size += file_size
 
@@ -211,6 +236,12 @@ def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats) -> None:
     parent_node.size += file_size
     parent_node.file_count += 1
 
+    # If we've processed a lot of files, clear any cached content to free memory
+    if stats.total_files % 100 == 0:
+        for sibling in parent_node.children[:-10]:  # Keep the 10 most recent files cached
+            if sibling.type == FileSystemNodeType.FILE:
+                sibling.clear_content_cache()
+
 
 def limit_exceeded(stats: FileSystemStats, depth: int) -> bool:
    """Check if any of the traversal limits have been exceeded.

src/gitingest/output_formatter.py

Lines changed: 22 additions & 6 deletions

@@ -1,8 +1,14 @@
-"""Functions to ingest and analyze a codebase directory or single file."""
+"""Functions to ingest and analyze a codebase directory or single file.
+
+Memory optimization:
+- Generator-based processing: Uses generators to process files one at a time
+- Streaming approach: Avoids loading all file contents into memory at once
+- Works with lazy loading: Complements the lazy loading in FileSystemNode
+"""
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Generator
 
 import tiktoken
 
@@ -99,6 +105,9 @@ def _gather_file_contents(node: FileSystemNode) -> str:
     This function recursively processes a directory node and gathers the contents of all files
     under that node. It returns the concatenated content of all files as a single string.
 
+    The implementation is memory-efficient, processing one file at a time and using
+    generators to avoid loading all content into memory at once.
+
     Parameters
     ----------
     node : FileSystemNode
@@ -110,11 +119,18 @@ def _gather_file_contents(node: FileSystemNode) -> str:
         The concatenated content of all files under the given node.
 
     """
-    if node.type != FileSystemNodeType.DIRECTORY:
-        return node.content_string
 
-    # Recursively gather contents of all files under the current directory
-    return "\n".join(_gather_file_contents(child) for child in node.children)
+    def _gather_contents_generator(node: FileSystemNode) -> Generator[str]:
+        """Yield file contents one at a time."""
+        if node.type != FileSystemNodeType.DIRECTORY:
+            yield node.content_string
+        else:
+            # Process one child at a time to avoid loading all content at once
+            for child in node.children:
+                yield from _gather_contents_generator(child)
+
+    # Join the generator results with newlines, processing one file at a time
+    return "\n".join(_gather_contents_generator(node))
 
 
 def _create_tree_structure(
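For reference, the generator pattern in _gather_file_contents can be exercised standalone. The sketch below uses a simplified Node type as a stand-in for gitingest's FileSystemNode; the tree is walked lazily, yielding one file's content at a time rather than building nested intermediate strings at each directory level.

# Sketch of the generator-based gathering above; Node is a simplified
# stand-in for FileSystemNode, not the real schema.
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Generator


@dataclass
class Node:
    content: str = ""
    children: list[Node] = field(default_factory=list)


def gather(node: Node) -> Generator[str, None, None]:
    """Yield file contents one node at a time while walking the tree."""
    if not node.children:  # treat a leaf as a file
        yield node.content
    else:  # treat a node with children as a directory
        for child in node.children:
            yield from gather(child)


tree = Node(children=[Node(content="a"), Node(children=[Node(content="b")])])
print("\n".join(gather(tree)))  # -> "a\nb"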

src/gitingest/schemas/filesystem.py

Lines changed: 59 additions & 11 deletions

@@ -1,4 +1,11 @@
-"""Define the schema for the filesystem representation."""
+"""Define the schema for the filesystem representation.
+
+Memory optimization:
+- Lazy loading: File content is only loaded when the content property is accessed
+- Content caching: Content is cached to avoid repeated file reads
+- Cache clearing: The clear_content_cache method allows freeing memory when content is no longer needed
+- Chunked reading: Large files are read in chunks to avoid loading everything at once
+"""
 
 from __future__ import annotations
 
@@ -49,6 +56,7 @@ class FileSystemNode:  # pylint: disable=too-many-instance-attributes
    dir_count: int = 0
    depth: int = 0
    children: list[FileSystemNode] = field(default_factory=list)
+    _content_cache: str | None = field(default=None, repr=False)
 
    def sort_children(self) -> None:
        """Sort the children nodes of a directory according to a specific order.
@@ -83,6 +91,18 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]:
 
        self.children.sort(key=_sort_key)
 
+    def clear_content_cache(self) -> None:
+        """Clear the cached content to free up memory.
+
+        This method clears the content cache of this node and all its children recursively,
+        allowing the garbage collector to reclaim memory used by file contents.
+        """
+        self._content_cache = None
+
+        # Recursively clear cache for all children
+        for child in self.children:
+            child.clear_content_cache()
+
    @property
    def content_string(self) -> str:
        """Return the content of the node as a string, including path and content.
@@ -104,12 +124,15 @@ def content_string(self) -> str:
        return "\n".join(parts) + "\n\n"
 
    @property
-    def content(self) -> str:  # pylint: disable=too-many-return-statements
+    def content(self) -> str:  # pylint: disable=too-many-return-statements,too-many-branches  # noqa: C901,PLR0912
        """Return file content (if text / notebook) or an explanatory placeholder.
 
        Heuristically decides whether the file is text or binary by decoding a small chunk of the file
        with multiple encodings and checking for common binary markers.
 
+        Uses lazy loading to avoid loading the entire file into memory until needed,
+        and caches the result to avoid repeated file reads.
+
        Returns
        -------
        str
@@ -121,29 +144,40 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
            If the node is a directory.
 
        """
+        # Return cached content if available
+        if self._content_cache is not None:
+            return self._content_cache
+
        if self.type == FileSystemNodeType.DIRECTORY:
            msg = "Cannot read content of a directory node"
            raise ValueError(msg)
 
        if self.type == FileSystemNodeType.SYMLINK:
-            return ""  # TODO: are we including the empty content of symlinks?
+            self._content_cache = ""  # TODO: are we including the empty content of symlinks?
+            return self._content_cache
 
        if self.path.suffix == ".ipynb":  # Notebook
            try:
-                return process_notebook(self.path)
+                self._content_cache = process_notebook(self.path)
            except Exception as exc:
-                return f"Error processing notebook: {exc}"
+                self._content_cache = f"Error processing notebook: {exc}"
+            else:
+                return self._content_cache
+            return self._content_cache
 
        chunk = _read_chunk(self.path)
 
        if chunk is None:
-            return "Error reading file"
+            self._content_cache = "Error reading file"
+            return self._content_cache
 
        if chunk == b"":
-            return "[Empty file]"
+            self._content_cache = "[Empty file]"
+            return self._content_cache
 
        if not _decodes(chunk, "utf-8"):
-            return "[Binary file]"
+            self._content_cache = "[Binary file]"
+            return self._content_cache
 
        # Find the first encoding that decodes the sample
        good_enc: str | None = next(
@@ -152,10 +186,24 @@ def content(self) -> str:  # pylint: disable=too-many-return-statements
        )
 
        if good_enc is None:
-            return "Error: Unable to decode file with available encodings"
+            self._content_cache = "Error: Unable to decode file with available encodings"
+            return self._content_cache
 
        try:
+            # Read file in chunks to avoid loading large files entirely into memory
+            content_chunks = []
+            chunk_size = 1024 * 1024  # 1MB chunks
+
            with self.path.open(encoding=good_enc) as fp:
-                return fp.read()
+                while True:
+                    chunk = fp.read(chunk_size)
+                    if not chunk:
+                        break
+                    content_chunks.append(chunk)
+
+            self._content_cache = "".join(content_chunks)
        except (OSError, UnicodeDecodeError) as exc:
-            return f"Error reading file with {good_enc!r}: {exc}"
+            self._content_cache = f"Error reading file with {good_enc!r}: {exc}"
+        else:
+            return self._content_cache
+        return self._content_cache
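The three techniques in this file (lazy loading, content caching, chunked reads) compose into one small pattern. A condensed sketch follows, using a hypothetical LazyFile class in place of the full FileSystemNode and assuming UTF-8 text; the real property also handles notebooks, symlinks, binary detection, and encoding fallbacks.

# Condensed sketch of the lazy-load / cache / chunked-read pattern used by
# FileSystemNode.content above. LazyFile is a hypothetical stand-in class.
from __future__ import annotations

from pathlib import Path


class LazyFile:
    def __init__(self, path: Path) -> None:
        self.path = path
        self._content_cache: str | None = None

    @property
    def content(self) -> str:
        # Lazy loading: nothing is read until the property is first accessed.
        if self._content_cache is not None:
            return self._content_cache
        chunks: list[str] = []
        chunk_size = 1024 * 1024  # read in 1 MB chunks instead of one fp.read()
        with self.path.open(encoding="utf-8") as fp:  # assumes UTF-8 text
            while chunk := fp.read(chunk_size):
                chunks.append(chunk)
        self._content_cache = "".join(chunks)
        return self._content_cache

    def clear_content_cache(self) -> None:
        # Drop the cached text so the garbage collector can reclaim it.
        self._content_cache = None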
