From 6eca4725f2f6e93a025a0b6a774c69f1d664579d Mon Sep 17 00:00:00 2001 From: polymood Date: Tue, 23 Sep 2025 18:24:22 +0200 Subject: [PATCH 1/2] Fix(utils): Robustly handle filenames with colons on Windows --- internetarchive/files.py | 4 +--- internetarchive/utils.py | 32 ++++++++++++++++++++++++++------ tests/test_utils.py | 39 ++++++++++++++++++++++++++++++--------- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/internetarchive/files.py b/internetarchive/files.py index 170a1a91..a54063eb 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -258,10 +258,8 @@ def download( # noqa: C901,PLR0911,PLR0912,PLR0915 # and relative path validation. This protects against malicious filenames # containing ../ sequences or other path manipulation attempts. try: - # Resolve both paths to handle symlinks and absolute paths - target_path = Path(file_path).resolve() base_dir = Path(destdir).resolve() if destdir else Path.cwd().resolve() - # Ensure the target path is relative to base directory + target_path = (base_dir / file_path).resolve() target_path.relative_to(base_dir) except ValueError: raise ValueError(f"Download path {file_path} is outside target directory {base_dir}") diff --git a/internetarchive/utils.py b/internetarchive/utils.py index 1a1ef407..53b35b37 100644 --- a/internetarchive/utils.py +++ b/internetarchive/utils.py @@ -480,8 +480,9 @@ def sanitize_filepath(filepath: str, avoid_colon: bool = False) -> str: Sanitizes only the filename part of a full file path, leaving the directory path intact. This is useful when you need to ensure the filename is safe for filesystem use - without modifying the directory structure. Typically used before creating files - or directories to prevent invalid filename characters. + without modifying the directory structure. This implementation robustly handles both + simple filenames and full paths by avoiding `os.path` functions that can have + unreliable, platform-specific behavior with certain characters (e.g., colons on Windows). Args: filepath (str): The full file path to sanitize. @@ -491,10 +492,29 @@ def sanitize_filepath(filepath: str, avoid_colon: bool = False) -> str: Returns: str: The sanitized file path with the filename portion percent-encoded as needed. """ - parent_dir = os.path.dirname(filepath) - filename = os.path.basename(filepath) - sanitized_filename = sanitize_filename(filename, avoid_colon) - return os.path.join(parent_dir, sanitized_filename) + # First, handle the case where the input is a simple filename (no directory path). + # This avoids issues with os.path functions misinterpreting characters like colons + # in filenames as drive separators on Windows. + if os.sep not in filepath and '/' not in filepath: + return sanitize_filename(filepath, avoid_colon) + + # For full paths, reliably separate the directory from the filename using + # string splitting, which is safer than relying on os.path functions. + # We prefer the POSIX separator for cross-platform consistency if present. + separator = '/' if '/' in filepath else os.sep + parts = filepath.rsplit(separator, 1) + + if len(parts) == 2: + # The path was successfully split into a directory and a filename. + parent_dir, filename = parts + sanitized_filename = sanitize_filename(filename, avoid_colon) + + # Rejoin the directory with the now-sanitized filename. + return parent_dir + separator + sanitized_filename + else: + # This is a fallback for unusual cases (e.g., an absolute path root '/'). + # We treat the whole string as a filename to ensure it's always sanitized. + return sanitize_filename(filepath, avoid_colon) def sanitize_filename(name: str, avoid_colon: bool = False) -> str: diff --git a/tests/test_utils.py b/tests/test_utils.py index 5867cfd2..fb7e606b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -173,15 +173,36 @@ def test_sanitize_filename(): def test_sanitize_filepath(): - # Test with colon encoding - result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', True) - assert result == '/path/to/file%3Aname.txt' + """ + Tests the sanitize_filepath function in both Windows and POSIX environments. + """ + # --- Test POSIX-like behavior (e.g., Linux, macOS) --- + with patch('internetarchive.utils.is_windows', return_value=False): + # On POSIX, colon is only encoded when avoid_colon=True + posix_path = '/path/to/file:name.txt' - # Test without colon encoding - result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', False) - assert result == '/path/to/file:name.txt' # Colon not encoded on POSIX by default + # Test with colon encoding enabled + result_avoid_colon = internetarchive.utils.sanitize_filepath(posix_path, avoid_colon=True) + assert result_avoid_colon == '/path/to/file%3Aname.txt' + + # Test with colon encoding disabled (default) + result_no_avoid_colon = internetarchive.utils.sanitize_filepath(posix_path, avoid_colon=False) + assert result_no_avoid_colon == '/path/to/file:name.txt' - # Test Windows path (mocked) + # Test another invalid char that should always be sanitized + result_other_char = internetarchive.utils.sanitize_filepath('/path/to/file/name.txt') + assert result_other_char == '/path/to/file%2Fname.txt' + + # --- Test Windows-specific behavior --- with patch('internetarchive.utils.is_windows', return_value=True): - result = internetarchive.utils.sanitize_filepath('/path/to/con.txt') - assert result == '/path/to/con.txt' # Reserved name sanitized + # On Windows, colon is ALWAYS an invalid character and must be encoded, + # regardless of the 'avoid_colon' flag. + win_path = 'C:\\path\\to\\file:name.txt' + + # Test with avoid_colon=True (should encode the colon) + result_avoid_colon_win = internetarchive.utils.sanitize_filepath(win_path, avoid_colon=True) + assert result_avoid_colon_win == 'C:\\path\\to\\file%3Aname.txt' + + # Test with avoid_colon=False (should STILL encode the colon) + result_no_avoid_colon_win = internetarchive.utils.sanitize_filepath(win_path, avoid_colon=False) + assert result_no_avoid_colon_win == 'C:\\path\\to\\file%3Aname.txt' From 4b809d7fdcffda9b0b79a83db9e14ab7a34f550a Mon Sep 17 00:00:00 2001 From: polymood Date: Tue, 23 Sep 2025 20:33:06 +0200 Subject: [PATCH 2/2] Fix: Strip path from filename to handle absolute paths in metadata --- internetarchive/files.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internetarchive/files.py b/internetarchive/files.py index a54063eb..0a9da921 100644 --- a/internetarchive/files.py +++ b/internetarchive/files.py @@ -235,6 +235,8 @@ def download( # noqa: C901,PLR0911,PLR0912,PLR0915 self.item.session.mount_http_adapter(max_retries=retries) file_path = file_path or self.name + file_path = os.path.basename(file_path) + # Critical security check: Sanitize only the filename portion of file_path to # prevent invalid characters and potential directory traversal issues. # We use `utils.sanitize_filepath` instead of `utils.sanitize_filename` because: @@ -253,6 +255,7 @@ def download( # noqa: C901,PLR0911,PLR0912,PLR0915 raise OSError(f'{destdir} is not a directory!') file_path = os.path.join(destdir, file_path) + # Critical security check: Prevent directory traversal attacks by ensuring # the download path doesn't escape the target directory using path resolution # and relative path validation. This protects against malicious filenames