diff --git a/src/gitingest/schemas/filesystem.py b/src/gitingest/schemas/filesystem.py index b5669f18..a62e0887 100644 --- a/src/gitingest/schemas/filesystem.py +++ b/src/gitingest/schemas/filesystem.py @@ -142,7 +142,7 @@ def content(self) -> str: # pylint: disable=too-many-return-statements if chunk == b"": return "[Empty file]" - if not _decodes(chunk, "utf-8"): + if is_binary_file(chunk): return "[Binary file]" # Find the first encoding that decodes the sample @@ -159,3 +159,15 @@ def content(self) -> str: # pylint: disable=too-many-return-statements return fp.read() except (OSError, UnicodeDecodeError) as exc: return f"Error reading file with {good_enc!r}: {exc}" + + +def is_binary_file(file_contents: bytes | None) -> bool: + """Return whether ``file_contents`` holds any non-text byte (NUL, most other control bytes, or DEL); the caller supplies the bytes to inspect.""" + if not file_contents: + return False # None or empty input is not treated as binary + + text_characters = bytes( + {7, 8, 9, 10, 12, 13, 27}.union(set(range(0x20, 0x100)) - {0x7F}), + ) + # If translate returns any bytes, those are non-text (binary) bytes + return bool(file_contents.translate(None, text_characters)) diff --git a/src/gitingest/utils/file_utils.py b/src/gitingest/utils/file_utils.py index 2c6ef74d..a1b9a0e2 100644 --- a/src/gitingest/utils/file_utils.py +++ b/src/gitingest/utils/file_utils.py @@ -27,9 +27,11 @@ def _get_preferred_encodings() -> list[str]: platform's default encoding followed by common fallback encodings. """ - encodings = [locale.getpreferredencoding(), "utf-8", "utf-16", "utf-16le", "utf-8-sig", "latin"] + encodings = [locale.getpreferredencoding(), "utf-8", "utf-16le", "utf-8-sig", "latin"] if platform.system() == "Windows": - encodings += ["cp1252", "iso-8859-1"] + encodings += ["utf-16be", "cp1252", "iso-8859-1"] + else: + encodings += ["utf-16"] return list(dict.fromkeys(encodings))