@@ -311,7 +311,7 @@ def is_likely_non_text(file_path: Path) -> bool:
311311
312312
313313def should_ignore (file_path : Path , root_dir : Path , include_pattern : str ,
314- exclude_pattern : str , max_file_size_bytes : int ) -> bool :
314+ exclude_pattern : str , max_file_size_bytes : int ) -> tuple [ bool , str ] :
315315 """
316316 Check if a file should be ignored based on defined rules:
317317 - Not a file or inaccessible
@@ -321,66 +321,66 @@ def should_ignore(file_path: Path, root_dir: Path, include_pattern: str,
321321 - Does not match the include pattern
322322 - Matches the exclude pattern
323323 - Is likely binary.
324+
325+ Returns a tuple (should_ignore, reason) where should_ignore is a boolean
326+ and reason is a string explaining why the file was ignored.
324327 """
325328 # 1. Check if it's actually a file (resolve symlinks first)
326329 try :
327330 if not file_path .is_file ():
328331 # This might happen with broken symlinks during the walk
329- return True
332+ return True , f" { str ( file_path . absolute ()) } is not a file"
330333 # Check file size
331334 file_size = file_path .stat ().st_size
332335 if file_size > max_file_size_bytes :
333336 logging .info (
334337 f"Skipping large file { file_path .name } ({ file_size } bytes > { max_file_size_bytes } bytes)" ,
335338 file = sys .stderr )
336- return True
339+ return True , f"File too large ( { file_size } bytes > { max_file_size_bytes } bytes)"
337340 except OSError as e :
338341 # Could be a permission error or other issue accessing the file type/stat
339342 print (f"Warning: Could not check status of { file_path } : { e } " ,
340343 file = sys .stderr )
341- return True # Ignore if we can't verify it's a file or get its size
344+ return True , f"Could not check status of { file_path } : { e } " # Ignore if we can't verify it's a file or get its size
342345
343346 # Use relative path for hidden checks and pattern matching
344347 try :
345348 relative_path = file_path .relative_to (root_dir )
346349 relative_path_str = str (relative_path )
347350 except ValueError :
348351 # Should not happen if file_path is within root_dir, but handle defensively
349- print (
350- f"Warning: Could not get relative path for { file_path } based on { root_dir } " ,
351- file = sys .stderr )
352- return True
352+ return True , f"Could not get relative path for { file_path } based on { root_dir } "
353353
354354 # 2. Check glob patterns
355355 # First check exclude pattern (if specified)
356356 if exclude_pattern and (fnmatch .fnmatch (relative_path_str , exclude_pattern )
357357 or fnmatch .fnmatch (file_path .name ,
358358 exclude_pattern )):
359- return True
359+ return True , f"Matches exclude pattern { exclude_pattern } "
360360
361361 # Then check include pattern
362362 if include_pattern != '*' and not fnmatch .fnmatch (
363363 relative_path_str , include_pattern ) and not fnmatch .fnmatch (
364364 file_path .name , include_pattern ):
365- return True
365+ return True , f"Does not match include pattern { include_pattern } "
366366
367367 # 3. Check for hidden file/directory
368368 # Check filename itself
369369 if file_path .name .startswith ('.' ):
370- return True
370+ return True , "Is a hidden file"
371371 # Check any parent directory component
372372 # Use relative_path.parts to avoid checking parts outside the root_dir
373373 # Check parent parts
374374 if any (part .startswith ('.' ) for part in relative_path .parts [:- 1 ]):
375- return True
375+ return True , "Is in a hidden directory"
376376
377377 # 4. Check for binary content (can be slow, do last)
378378 if is_likely_non_text (file_path ):
379379 logging .info (f"Skipping likely non text file: { relative_path_str } " ,
380380 file = sys .stderr )
381- return True
381+ return True , "Is likely non text"
382382
383- return False # If none of the ignore conditions match
383+ return False , "Not ignored" # If none of the ignore conditions match
384384
385385
386386def read_file_content (file_path : Path ,
@@ -416,8 +416,8 @@ def read_file_content(file_path: Path,
416416 return None
417417
418418
419- def read_files_parallel (files_to_process : list [tuple [Path , Path ]],
420- num_workers : int ,
419+ def read_files_parallel (files_to_process : list [tuple [Path ,
420+ Path ]], num_workers : int ,
421421 paths_only : bool ) -> list [tuple [str , str ]]:
422422 """
423423 Read files in parallel using a thread pool.
@@ -470,7 +470,7 @@ def read_files_parallel(files_to_process: list[tuple[Path, Path]],
470470 return results
471471
472472
473- def collect_results (input_paths_str : list [str ], include_pattern : str ,
473+ def collect_files_content (input_paths_str : list [str ], include_pattern : str ,
474474 exclude_pattern : str , max_file_size_bytes : int ,
475475 num_workers : int , paths_only : bool ,
476476 using_stdout : bool ) -> list [tuple [str , str ]]:
@@ -495,67 +495,34 @@ def collect_results(input_paths_str: list[str], include_pattern: str,
495495 p = Path (path_str ).resolve ()
496496
497497 if not p .exists ():
498- print (f"Warning: Input path not found: { path_str } " ,
499- file = sys .stderr )
500- continue
498+ raise ValueError (f"Input path not found: { path_str } " )
501499
502500 if p in processed_files :
503501 # Avoid processing the same resolved path twice if listed multiple times
504502 continue
505503
506504 if p .is_file ():
507- # Check if file should be ignored (using simpler rules for explicit files)
508- # Need to check size, binary, and if already processed
509- try :
510- file_size = p .stat ().st_size
511- is_too_large = file_size > max_file_size_bytes
512- is_binary = is_likely_non_text (p )
513- is_processed = p in processed_files
514-
515- if not is_too_large and not is_binary and not is_processed :
516- # Use cwd as root for relative path
517- files_to_process_tuples .append ((p , cwd ))
518- processed_files .add (p )
519- else :
520- reason = []
521- if is_too_large :
522- reason .append (
523- f"too large ({ file_size } > { max_file_size_bytes } )" )
524- if is_binary :
525- reason .append ("likely binary" )
526- if is_processed :
527- reason .append ("already processed" )
528- print (
529- f"Info: Ignoring explicitly provided file: { path_str } ({ ', ' .join (reason )} )" ,
530- file = sys .stderr )
531- except OSError as e :
532- print (
533- f"Warning: Could not stat explicitly provided file { path_str } : { e } " ,
534- file = sys .stderr )
535-
505+ item , root_dir = p , p .parent
506+ should_ignore_result , reason = should_ignore (
507+ item , root_dir , include_pattern , exclude_pattern , max_file_size_bytes )
508+ if should_ignore_result :
509+ print (f"Warning: Ignoring file { item } because { reason } " , file = sys .stderr )
510+ continue
511+ files_to_process_tuples .append ((item , root_dir ))
512+ processed_files .add (item )
536513 elif p .is_dir ():
537514 print (f"Scanning directory: { p } " , file = sys .stderr )
538- try :
539- # Use rglob for recursion
540- for item in p .rglob ('*' ):
541- # Combine checks for file, not processed, and not ignored
542- if item not in processed_files and not should_ignore (
543- item , p , include_pattern , exclude_pattern ,
544- max_file_size_bytes ):
545- files_to_process_tuples .append (
546- (item , p )) # Use dir 'p' as root
547- processed_files .add (item )
548- except PermissionError as e :
549- print (f"Warning: Permission denied during scan of { p } : { e } " ,
550- file = sys .stderr )
551- except Exception as e :
552- print (f"Error during scan of { p } : { e } " , file = sys .stderr )
553- # Decide if we should exit or just continue
554- # continue # Continue with next path for now
515+ # Use rglob for recursion
516+ for item in p .rglob ('*' ):
517+ if item in processed_files :
518+ continue
519+ should_ignore_result , reason = should_ignore (item , p , include_pattern , exclude_pattern , max_file_size_bytes )
520+ if should_ignore_result :
521+ continue
522+ files_to_process_tuples .append ((item , p ))
523+ processed_files .add (item )
555524 else :
556- print (
557- f"Warning: Input path is neither a file nor a directory: { path_str } " ,
558- file = sys .stderr )
525+ raise ValueError (f"Input path is neither a file nor a directory: { path_str } " )
559526
560527 # --- Filtering & Sorting (Preparation) ---
561528 print (
@@ -799,7 +766,7 @@ def main(argv: list[str]):
799766 using_stdout = True
800767
801768 try :
802- results = collect_results (input_paths_str = input_paths_str ,
769+ results = collect_files_content (input_paths_str = input_paths_str ,
803770 include_pattern = include_pattern ,
804771 exclude_pattern = exclude_pattern ,
805772 max_file_size_bytes = max_file_size_bytes ,
0 commit comments