1
- """Define the schema for the filesystem representation."""
1
+ """Define the schema for the filesystem representation.
2
+
3
+ Memory optimization:
4
+ - Lazy loading: File content is only loaded when the content property is accessed
5
+ - Content caching: Content is cached to avoid repeated file reads
6
+ - Cache clearing: The clear_content_cache method allows freeing memory when content is no longer needed
7
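+ - Chunked reading: Files are read in chunks of up to 1024 * 1024 characters; the chunks are then joined into a single cached string, so the full content is still held in memory once loaded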
8
+ """
2
9
3
10
from __future__ import annotations
4
11
@@ -49,6 +56,7 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
49
56
dir_count : int = 0
50
57
depth : int = 0
51
58
children : list [FileSystemNode ] = field (default_factory = list )
59
+ _content_cache : str | None = field (default = None , repr = False )
52
60
53
61
def sort_children (self ) -> None :
54
62
"""Sort the children nodes of a directory according to a specific order.
@@ -83,6 +91,18 @@ def _sort_key(child: FileSystemNode) -> tuple[int, str]:
83
91
84
92
self .children .sort (key = _sort_key )
85
93
94
+ def clear_content_cache (self ) -> None :
95
+ """Clear the cached content to free up memory.
96
+
97
+ This method clears the content cache of this node and all its children recursively,
98
+ allowing the garbage collector to reclaim memory used by file contents.
99
+ """
100
+ self ._content_cache = None
101
+
102
+ # Recursively clear cache for all children
103
+ for child in self .children :
104
+ child .clear_content_cache ()
105
+
86
106
@property
87
107
def content_string (self ) -> str :
88
108
"""Return the content of the node as a string, including path and content.
@@ -104,12 +124,15 @@ def content_string(self) -> str:
104
124
return "\n " .join (parts ) + "\n \n "
105
125
106
126
@property
107
- def content (self ) -> str : # pylint: disable=too-many-return-statements
127
+ def content (self ) -> str : # pylint: disable=too-many-return-statements,too-many-branches # noqa: C901,PLR0912
108
128
"""Return file content (if text / notebook) or an explanatory placeholder.
109
129
110
130
Heuristically decides whether the file is text or binary by decoding a small chunk of the file
111
131
with multiple encodings and checking for common binary markers.
112
132
133
+ The file is read lazily, on first access of this property, and the result
134
+ (including placeholder and error strings) is cached to avoid repeated file reads.
135
+
113
136
Returns
114
137
-------
115
138
str
@@ -121,29 +144,40 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
121
144
If the node is a directory.
122
145
123
146
"""
147
+ # Return cached content if available
148
+ if self ._content_cache is not None :
149
+ return self ._content_cache
150
+
124
151
if self .type == FileSystemNodeType .DIRECTORY :
125
152
msg = "Cannot read content of a directory node"
126
153
raise ValueError (msg )
127
154
128
155
if self .type == FileSystemNodeType .SYMLINK :
129
- return "" # TODO: are we including the empty content of symlinks?
156
+ self ._content_cache = "" # TODO: are we including the empty content of symlinks?
157
+ return self ._content_cache
130
158
131
159
if self .path .suffix == ".ipynb" : # Notebook
132
160
try :
133
- return process_notebook (self .path )
161
+ self . _content_cache = process_notebook (self .path )
134
162
except Exception as exc :
135
- return f"Error processing notebook: { exc } "
163
+ self ._content_cache = f"Error processing notebook: { exc } "
164
+ else :
165
+ return self ._content_cache
166
+ return self ._content_cache
136
167
137
168
chunk = _read_chunk (self .path )
138
169
139
170
if chunk is None :
140
- return "Error reading file"
171
+ self ._content_cache = "Error reading file"
172
+ return self ._content_cache
141
173
142
174
if chunk == b"" :
143
- return "[Empty file]"
175
+ self ._content_cache = "[Empty file]"
176
+ return self ._content_cache
144
177
145
178
if not _decodes (chunk , "utf-8" ):
146
- return "[Binary file]"
179
+ self ._content_cache = "[Binary file]"
180
+ return self ._content_cache
147
181
148
182
# Find the first encoding that decodes the sample
149
183
good_enc : str | None = next (
@@ -152,10 +186,24 @@ def content(self) -> str: # pylint: disable=too-many-return-statements
152
186
)
153
187
154
188
if good_enc is None :
155
- return "Error: Unable to decode file with available encodings"
189
+ self ._content_cache = "Error: Unable to decode file with available encodings"
190
+ return self ._content_cache
156
191
157
192
try :
193
+ # Read the file in fixed-size chunks; they are joined below, so the full content still ends up in memory
194
+ content_chunks = []
195
+ chunk_size = 1024 * 1024 # 1MB chunks
196
+
158
197
with self .path .open (encoding = good_enc ) as fp :
159
- return fp .read ()
198
+ while True :
199
+ chunk = fp .read (chunk_size )
200
+ if not chunk :
201
+ break
202
+ content_chunks .append (chunk )
203
+
204
+ self ._content_cache = "" .join (content_chunks )
160
205
except (OSError , UnicodeDecodeError ) as exc :
161
- return f"Error reading file with { good_enc !r} : { exc } "
206
+ self ._content_cache = f"Error reading file with { good_enc !r} : { exc } "
207
+ else :
208
+ return self ._content_cache
209
+ return self ._content_cache
0 commit comments