Skip to content

Commit 7a5b545

Browse files
committed
refactor: Return reason for ignoring files
- Modify to return a tuple indicating if a file should be ignored and the reason why. - Use this reason to provide more informative warnings to the user when files are skipped (e.g., 'File too large', 'Matches exclude pattern'). - Rename to for clarity. - Update unit and integration tests to reflect the new function signature and behavior.
1 parent 4881538 commit 7a5b545

2 files changed

Lines changed: 79 additions & 90 deletions

File tree

pack.py

Lines changed: 37 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ def is_likely_non_text(file_path: Path) -> bool:
311311

312312

313313
def should_ignore(file_path: Path, root_dir: Path, include_pattern: str,
314-
exclude_pattern: str, max_file_size_bytes: int) -> bool:
314+
exclude_pattern: str, max_file_size_bytes: int) -> tuple[bool, str]:
315315
"""
316316
Check if a file should be ignored based on defined rules:
317317
- Not a file or inaccessible
@@ -321,66 +321,66 @@ def should_ignore(file_path: Path, root_dir: Path, include_pattern: str,
321321
- Does not match the include pattern
322322
- Matches the exclude pattern
323323
- Is likely binary.
324+
325+
Returns a tuple (should_ignore, reason) where should_ignore is a boolean
326+
and reason is a string explaining why the file was ignored.
324327
"""
325328
# 1. Check if it's actually a file (resolve symlinks first)
326329
try:
327330
if not file_path.is_file():
328331
# This might happen with broken symlinks during the walk
329-
return True
332+
return True, f"{str(file_path.absolute())} is not a file"
330333
# Check file size
331334
file_size = file_path.stat().st_size
332335
if file_size > max_file_size_bytes:
333336
logging.info(
334337
f"Skipping large file {file_path.name} ({file_size} bytes > {max_file_size_bytes} bytes)",
335338
file=sys.stderr)
336-
return True
339+
return True, f"File too large ({file_size} bytes > {max_file_size_bytes} bytes)"
337340
except OSError as e:
338341
# Could be a permission error or other issue accessing the file type/stat
339342
print(f"Warning: Could not check status of {file_path}: {e}",
340343
file=sys.stderr)
341-
return True # Ignore if we can't verify it's a file or get its size
344+
return True, f"Could not check status of {file_path}: {e}" # Ignore if we can't verify it's a file or get its size
342345

343346
# Use relative path for hidden checks and pattern matching
344347
try:
345348
relative_path = file_path.relative_to(root_dir)
346349
relative_path_str = str(relative_path)
347350
except ValueError:
348351
# Should not happen if file_path is within root_dir, but handle defensively
349-
print(
350-
f"Warning: Could not get relative path for {file_path} based on {root_dir}",
351-
file=sys.stderr)
352-
return True
352+
return True, f"Could not get relative path for {file_path} based on {root_dir}"
353353

354354
# 2. Check glob patterns
355355
# First check exclude pattern (if specified)
356356
if exclude_pattern and (fnmatch.fnmatch(relative_path_str, exclude_pattern)
357357
or fnmatch.fnmatch(file_path.name,
358358
exclude_pattern)):
359-
return True
359+
return True, f"Matches exclude pattern {exclude_pattern}"
360360

361361
# Then check include pattern
362362
if include_pattern != '*' and not fnmatch.fnmatch(
363363
relative_path_str, include_pattern) and not fnmatch.fnmatch(
364364
file_path.name, include_pattern):
365-
return True
365+
return True, f"Does not match include pattern {include_pattern}"
366366

367367
# 3. Check for hidden file/directory
368368
# Check filename itself
369369
if file_path.name.startswith('.'):
370-
return True
370+
return True, "Is a hidden file"
371371
# Check any parent directory component
372372
# Use relative_path.parts to avoid checking parts outside the root_dir
373373
# Check parent parts
374374
if any(part.startswith('.') for part in relative_path.parts[:-1]):
375-
return True
375+
return True, "Is in a hidden directory"
376376

377377
# 4. Check for binary content (can be slow, do last)
378378
if is_likely_non_text(file_path):
379379
logging.info(f"Skipping likely non text file: {relative_path_str}",
380380
file=sys.stderr)
381-
return True
381+
return True, "Is likely non text"
382382

383-
return False # If none of the ignore conditions match
383+
return False, "Not ignored" # If none of the ignore conditions match
384384

385385

386386
def read_file_content(file_path: Path,
@@ -416,8 +416,8 @@ def read_file_content(file_path: Path,
416416
return None
417417

418418

419-
def read_files_parallel(files_to_process: list[tuple[Path, Path]],
420-
num_workers: int,
419+
def read_files_parallel(files_to_process: list[tuple[Path,
420+
Path]], num_workers: int,
421421
paths_only: bool) -> list[tuple[str, str]]:
422422
"""
423423
Read files in parallel using a thread pool.
@@ -470,7 +470,7 @@ def read_files_parallel(files_to_process: list[tuple[Path, Path]],
470470
return results
471471

472472

473-
def collect_results(input_paths_str: list[str], include_pattern: str,
473+
def collect_files_content(input_paths_str: list[str], include_pattern: str,
474474
exclude_pattern: str, max_file_size_bytes: int,
475475
num_workers: int, paths_only: bool,
476476
using_stdout: bool) -> list[tuple[str, str]]:
@@ -495,67 +495,34 @@ def collect_results(input_paths_str: list[str], include_pattern: str,
495495
p = Path(path_str).resolve()
496496

497497
if not p.exists():
498-
print(f"Warning: Input path not found: {path_str}",
499-
file=sys.stderr)
500-
continue
498+
raise ValueError(f"Input path not found: {path_str}")
501499

502500
if p in processed_files:
503501
# Avoid processing the same resolved path twice if listed multiple times
504502
continue
505503

506504
if p.is_file():
507-
# Check if file should be ignored (using simpler rules for explicit files)
508-
# Need to check size, binary, and if already processed
509-
try:
510-
file_size = p.stat().st_size
511-
is_too_large = file_size > max_file_size_bytes
512-
is_binary = is_likely_non_text(p)
513-
is_processed = p in processed_files
514-
515-
if not is_too_large and not is_binary and not is_processed:
516-
# Use cwd as root for relative path
517-
files_to_process_tuples.append((p, cwd))
518-
processed_files.add(p)
519-
else:
520-
reason = []
521-
if is_too_large:
522-
reason.append(
523-
f"too large ({file_size} > {max_file_size_bytes})")
524-
if is_binary:
525-
reason.append("likely binary")
526-
if is_processed:
527-
reason.append("already processed")
528-
print(
529-
f"Info: Ignoring explicitly provided file: {path_str} ({', '.join(reason)})",
530-
file=sys.stderr)
531-
except OSError as e:
532-
print(
533-
f"Warning: Could not stat explicitly provided file {path_str}: {e}",
534-
file=sys.stderr)
535-
505+
item, root_dir = p, p.parent
506+
should_ignore_result, reason = should_ignore(
507+
item, root_dir, include_pattern, exclude_pattern, max_file_size_bytes)
508+
if should_ignore_result:
509+
print(f"Warning: Ignoring file {item} because {reason}", file=sys.stderr)
510+
continue
511+
files_to_process_tuples.append((item, root_dir))
512+
processed_files.add(item)
536513
elif p.is_dir():
537514
print(f"Scanning directory: {p}", file=sys.stderr)
538-
try:
539-
# Use rglob for recursion
540-
for item in p.rglob('*'):
541-
# Combine checks for file, not processed, and not ignored
542-
if item not in processed_files and not should_ignore(
543-
item, p, include_pattern, exclude_pattern,
544-
max_file_size_bytes):
545-
files_to_process_tuples.append(
546-
(item, p)) # Use dir 'p' as root
547-
processed_files.add(item)
548-
except PermissionError as e:
549-
print(f"Warning: Permission denied during scan of {p}: {e}",
550-
file=sys.stderr)
551-
except Exception as e:
552-
print(f"Error during scan of {p}: {e}", file=sys.stderr)
553-
# Decide if we should exit or just continue
554-
# continue # Continue with next path for now
515+
# Use rglob for recursion
516+
for item in p.rglob('*'):
517+
if item in processed_files:
518+
continue
519+
should_ignore_result, reason = should_ignore(item, p, include_pattern, exclude_pattern, max_file_size_bytes)
520+
if should_ignore_result:
521+
continue
522+
files_to_process_tuples.append((item, p))
523+
processed_files.add(item)
555524
else:
556-
print(
557-
f"Warning: Input path is neither a file nor a directory: {path_str}",
558-
file=sys.stderr)
525+
raise ValueError(f"Input path is neither a file nor a directory: {path_str}")
559526

560527
# --- Filtering & Sorting (Preparation) ---
561528
print(
@@ -799,7 +766,7 @@ def main(argv: list[str]):
799766
using_stdout = True
800767

801768
try:
802-
results = collect_results(input_paths_str=input_paths_str,
769+
results = collect_files_content(input_paths_str=input_paths_str,
803770
include_pattern=include_pattern,
804771
exclude_pattern=exclude_pattern,
805772
max_file_size_bytes=max_file_size_bytes,

tests/test_pack.py

Lines changed: 42 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,25 +71,46 @@ def test_should_ignore(self):
7171

7272

7373
# Test Case 1: File larger than max_file_size_bytes
74-
self.assertTrue(pack.should_ignore(large_file, root_dir, "*", "", 1000))
75-
self.assertFalse(pack.should_ignore(large_file, root_dir, "*", "", 3000))
74+
should_ignore_result, reason = pack.should_ignore(large_file, root_dir, "*", "", 1000)
75+
self.assertTrue(should_ignore_result)
76+
self.assertIn("File too large", reason)
77+
78+
should_ignore_result, reason = pack.should_ignore(large_file, root_dir, "*", "", 3000)
79+
self.assertFalse(should_ignore_result)
80+
self.assertEqual(reason, "Not ignored")
7681

7782
# Test Case 2: Hidden file
78-
self.assertTrue(pack.should_ignore(root_dir / ".env", root_dir, "*", "", 5000))
83+
should_ignore_result, reason = pack.should_ignore(root_dir / ".env", root_dir, "*", "", 5000)
84+
self.assertTrue(should_ignore_result)
85+
self.assertEqual(reason, "Is a hidden file")
7986

8087
# Test Case 3: File within a hidden directory
81-
self.assertTrue(pack.should_ignore(root_dir / ".git" / "config", root_dir, "*", "", 5000))
88+
should_ignore_result, reason = pack.should_ignore(root_dir / ".git" / "config", root_dir, "*", "", 5000)
89+
self.assertTrue(should_ignore_result)
90+
self.assertEqual(reason, "Is in a hidden directory")
8291

8392
# Test Case 4: include_pattern
84-
self.assertFalse(pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*.py", "", 5000))
85-
self.assertTrue(pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*.txt", "", 5000))
93+
should_ignore_result, reason = pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*.py", "", 5000)
94+
self.assertFalse(should_ignore_result)
95+
self.assertEqual(reason, "Not ignored")
96+
97+
should_ignore_result, reason = pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*.txt", "", 5000)
98+
self.assertTrue(should_ignore_result)
99+
self.assertIn("Does not match include pattern", reason)
86100

87101
# Test Case 5: exclude_pattern
88-
self.assertTrue(pack.should_ignore(root_dir / "src" / "main_test.py", root_dir, "*", "*_test.py", 5000))
89-
self.assertFalse(pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*", "*_test.py", 5000))
102+
should_ignore_result, reason = pack.should_ignore(root_dir / "src" / "main_test.py", root_dir, "*", "*_test.py", 5000)
103+
self.assertTrue(should_ignore_result)
104+
self.assertIn("Matches exclude pattern", reason)
105+
106+
should_ignore_result, reason = pack.should_ignore(root_dir / "src" / "main.py", root_dir, "*", "*_test.py", 5000)
107+
self.assertFalse(should_ignore_result)
108+
self.assertEqual(reason, "Not ignored")
90109

91110
# Test Case 6: Likely binary file
92-
self.assertTrue(pack.should_ignore(binary_file, root_dir, "*", "", 5000))
111+
should_ignore_result, reason = pack.should_ignore(binary_file, root_dir, "*", "", 5000)
112+
self.assertTrue(should_ignore_result)
113+
self.assertEqual(reason, "Is likely non text")
93114

94115
class TestIntegration(unittest.TestCase):
95116
def setUp(self):
@@ -112,8 +133,8 @@ def setUp(self):
112133
def tearDown(self):
113134
self.tmpdir.cleanup()
114135

115-
def test_collect_results_default(self):
116-
results = pack.collect_results(
136+
def test_collect_files_content_default(self):
137+
results = pack.collect_files_content(
117138
input_paths_str=[str(self.project_dir)],
118139
include_pattern="*",
119140
exclude_pattern="",
@@ -137,8 +158,8 @@ def test_collect_results_default(self):
137158
readme_result = next(r for r in results if r[0] == "README.md")
138159
self.assertEqual(readme_result[1], "readme content")
139160

140-
def test_collect_results_with_patterns(self):
141-
results = pack.collect_results(
161+
def test_collect_files_content_with_patterns(self):
162+
results = pack.collect_files_content(
142163
input_paths_str=[str(self.project_dir)],
143164
include_pattern="src/*.py",
144165
exclude_pattern="*utils*",
@@ -152,21 +173,22 @@ def test_collect_results_with_patterns(self):
152173
self.assertEqual(results[0][0], "src/main.py")
153174
self.assertEqual(results[0][1], "main content")
154175

155-
def test_collect_results_mixed_input(self):
156-
results = pack.collect_results(
157-
input_paths_str=[str(self.project_dir / "src"), str(self.root_dir / "another_file.txt")],
176+
def test_collect_files_content_mixed_input(self):
177+
another_txt_file = self.root_dir / "another_file.txt"
178+
another_txt_file.write_text("another content")
179+
another_md_file = self.root_dir / "another_file.md"
180+
another_md_file.write_text("another content")
181+
results = pack.collect_files_content(
182+
input_paths_str=[str(self.project_dir / "src"), str(another_txt_file), str(another_md_file)],
158183
include_pattern="*",
159-
exclude_pattern="",
184+
exclude_pattern="*.md",
160185
max_file_size_bytes=10000,
161186
num_workers=1,
162187
paths_only=False,
163188
using_stdout=False
164189
)
165190

166191
relative_paths = sorted([r[0] for r in results])
167-
self.assertEqual(len(relative_paths), 3)
168-
# Note: relative path for another_file.txt is from CWD of the test runner.
169-
# Let's check for the file names instead.
170192
filenames = sorted([Path(p).name for p in relative_paths])
171193
self.assertEqual(filenames, ["another_file.txt", "main.py", "utils.py"])
172194

0 commit comments

Comments
 (0)