Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions internetarchive/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ def download( # noqa: C901,PLR0911,PLR0912,PLR0915
self.item.session.mount_http_adapter(max_retries=retries)
file_path = file_path or self.name

file_path = os.path.basename(file_path)

# Critical security check: Sanitize only the filename portion of file_path to
# prevent invalid characters and potential directory traversal issues.
# We use `utils.sanitize_filepath` instead of `utils.sanitize_filename` because:
Expand All @@ -253,15 +255,14 @@ def download( # noqa: C901,PLR0911,PLR0912,PLR0915
raise OSError(f'{destdir} is not a directory!')
file_path = os.path.join(destdir, file_path)


# Critical security check: Prevent directory traversal attacks by ensuring
# the download path doesn't escape the target directory using path resolution
# and relative path validation. This protects against malicious filenames
# containing ../ sequences or other path manipulation attempts.
try:
# Resolve both paths to handle symlinks and absolute paths
target_path = Path(file_path).resolve()
base_dir = Path(destdir).resolve() if destdir else Path.cwd().resolve()
# Ensure the target path is relative to base directory
target_path = (base_dir / file_path).resolve()
target_path.relative_to(base_dir)
except ValueError:
raise ValueError(f"Download path {file_path} is outside target directory {base_dir}")
Expand Down
32 changes: 26 additions & 6 deletions internetarchive/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,9 @@ def sanitize_filepath(filepath: str, avoid_colon: bool = False) -> str:
Sanitizes only the filename part of a full file path, leaving the directory path intact.

This is useful when you need to ensure the filename is safe for filesystem use
without modifying the directory structure. Typically used before creating files
or directories to prevent invalid filename characters.
without modifying the directory structure. This implementation robustly handles both
simple filenames and full paths by avoiding `os.path` functions that can have
unreliable, platform-specific behavior with certain characters (e.g., colons on Windows).

Args:
filepath (str): The full file path to sanitize.
Expand All @@ -491,10 +492,29 @@ def sanitize_filepath(filepath: str, avoid_colon: bool = False) -> str:
Returns:
str: The sanitized file path with the filename portion percent-encoded as needed.
"""
parent_dir = os.path.dirname(filepath)
filename = os.path.basename(filepath)
sanitized_filename = sanitize_filename(filename, avoid_colon)
return os.path.join(parent_dir, sanitized_filename)
# First, handle the case where the input is a simple filename (no directory path).
# This avoids issues with os.path functions misinterpreting characters like colons
# in filenames as drive separators on Windows.
if os.sep not in filepath and '/' not in filepath:
return sanitize_filename(filepath, avoid_colon)

# For full paths, reliably separate the directory from the filename using
# string splitting, which is safer than relying on os.path functions.
# We prefer the POSIX separator for cross-platform consistency if present.
separator = '/' if '/' in filepath else os.sep
parts = filepath.rsplit(separator, 1)

if len(parts) == 2:
# The path was successfully split into a directory and a filename.
parent_dir, filename = parts
sanitized_filename = sanitize_filename(filename, avoid_colon)

# Rejoin the directory with the now-sanitized filename.
return parent_dir + separator + sanitized_filename
else:
# This is a fallback for unusual cases (e.g., an absolute path root '/').
# We treat the whole string as a filename to ensure it's always sanitized.
return sanitize_filename(filepath, avoid_colon)


def sanitize_filename(name: str, avoid_colon: bool = False) -> str:
Expand Down
39 changes: 30 additions & 9 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,36 @@


def test_sanitize_filepath():
# Test with colon encoding
result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', True)
assert result == '/path/to/file%3Aname.txt'
"""
Tests the sanitize_filepath function in both Windows and POSIX environments.
"""
# --- Test POSIX-like behavior (e.g., Linux, macOS) ---
with patch('internetarchive.utils.is_windows', return_value=False):
# On POSIX, colon is only encoded when avoid_colon=True
posix_path = '/path/to/file:name.txt'

# Test without colon encoding
result = internetarchive.utils.sanitize_filepath('/path/to/file:name.txt', False)
assert result == '/path/to/file:name.txt' # Colon not encoded on POSIX by default
# Test with colon encoding enabled
result_avoid_colon = internetarchive.utils.sanitize_filepath(posix_path, avoid_colon=True)
assert result_avoid_colon == '/path/to/file%3Aname.txt'

# Test with colon encoding disabled (default)
result_no_avoid_colon = internetarchive.utils.sanitize_filepath(posix_path, avoid_colon=False)
assert result_no_avoid_colon == '/path/to/file:name.txt'

# Test Windows path (mocked)
# Test another invalid char that should always be sanitized
result_other_char = internetarchive.utils.sanitize_filepath('/path/to/file/name.txt')
assert result_other_char == '/path/to/file%2Fname.txt'

# --- Test Windows-specific behavior ---
with patch('internetarchive.utils.is_windows', return_value=True):
result = internetarchive.utils.sanitize_filepath('/path/to/con.txt')
assert result == '/path/to/con.txt' # Reserved name sanitized
# On Windows, colon is ALWAYS an invalid character and must be encoded,
# regardless of the 'avoid_colon' flag.
win_path = 'C:\\path\\to\\file:name.txt'

# Test with avoid_colon=True (should encode the colon)
result_avoid_colon_win = internetarchive.utils.sanitize_filepath(win_path, avoid_colon=True)
assert result_avoid_colon_win == 'C:\\path\\to\\file%3Aname.txt'

# Test with avoid_colon=False (should STILL encode the colon)
result_no_avoid_colon_win = internetarchive.utils.sanitize_filepath(win_path, avoid_colon=False)

Check failure on line 207 in tests/test_utils.py

View workflow job for this annotation

GitHub Actions / lint_python

Ruff (E501)

tests/test_utils.py:207:103: E501 Line too long (104 > 102)
assert result_no_avoid_colon_win == 'C:\\path\\to\\file%3Aname.txt'
Loading