Improved inference of archive names #53

Open · wants to merge 7 commits into base: main
2 changes: 2 additions & 0 deletions archiver/constants.py
@@ -12,5 +12,7 @@
ENCRYPTION_ALGORITHM = "AES256"
ENV_VAR_MAPPER_MAX_CPUS = "ARCHIVER_MAX_CPUS_ENV_VAR"
DEFAULT_COMPRESSION_LEVEL = 6
ARCHIVE_SUFFIXES = [r'\.part[0-9]+', r'\.tar', r'\.md5', r'\.lz', r'\.gpg', r'\.lst', r'\.parts', r'\.txt']
ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'

MD5_LINE_REGEX = re.compile(r'(\S+)\s+(\S.*)')
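For reference, ARCHIVE_SUFFIXES_REG expands to an alternation of end-anchored suffix patterns, which the new helper in archiver/helpers.py matches against Path.suffix one suffix at a time. A minimal sketch of the pattern's behavior (the example paths are hypothetical, not taken from this PR):

    import re
    from pathlib import Path

    ARCHIVE_SUFFIXES = [r'\.part[0-9]+', r'\.tar', r'\.md5', r'\.lz', r'\.gpg', r'\.lst', r'\.parts', r'\.txt']
    ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'
    # -> r'\.part[0-9]+$|\.tar$|\.md5$|\.lz$|\.gpg$|\.lst$|\.parts$|\.txt$'

    print(bool(re.match(ARCHIVE_SUFFIXES_REG, Path('data.tar.lz').suffix)))     # True  ('.lz' is an archive suffix)
    print(bool(re.match(ARCHIVE_SUFFIXES_REG, Path('notes.2021.csv').suffix)))  # False ('.csv' is not)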
2 changes: 1 addition & 1 deletion archiver/extract.py
@@ -64,7 +64,7 @@ def extract_archive(source_path, destination_directory_path, partial_extraction_
uncompress_and_extract(archive_files, destination_directory_path, threads, partial_extraction_path=partial_extraction_path)

logging.info("Archive extracted to: " + helpers.get_absolute_path_string(destination_directory_path))
return destination_directory_path / helpers.filename_without_extensions(source_path)
return destination_directory_path / helpers.filename_without_archive_extensions(source_path)


def uncompress_and_extract(archive_file_paths, destination_directory_path, threads, partial_extraction_path=None, encrypted=False):
28 changes: 14 additions & 14 deletions archiver/helpers.py
@@ -11,7 +11,8 @@
import unicodedata

from .constants import READ_CHUNK_BYTE_SIZE, COMPRESSED_ARCHIVE_SUFFIX, \
ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX
ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX, \
ARCHIVE_SUFFIXES_REG


def get_files_with_type_in_directory_or_terminate(directory, file_type):
@@ -338,36 +339,35 @@ def file_is_valid_archive_or_terminate(file_path):
terminate_with_message(f"File {file_path.as_posix()} is not a valid archive of type {COMPRESSED_ARCHIVE_SUFFIX} or {ENCRYPTED_ARCHIVE_SUFFIX} or doesn't exist.")


def filename_without_extensions(path):
"""Removes every suffix, including .partX"""
suffixes_string = "".join(path.suffixes)
def filepath_without_archive_extensions(path:Path) -> Path:
"""Removes every archiving suffix"""
while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
path = path.with_suffix('')
return path

return path.name[:-len(suffixes_string)]

def filename_without_archive_extensions(path):
"""Removes every archiving suffix"""
return filepath_without_archive_extensions(path).name

def filepath_without_extensions(path:Path) -> Path:
"""Removes every suffix, including .partX"""
suffixes_string = "".join(path.suffixes)

return path.parent / path.name[:-len(suffixes_string)]

def infer_source_name(source_path: Path) -> Path:

if not source_path.is_dir():
return filepath_without_extensions(source_path)
return filepath_without_archive_extensions(source_path)
else:
all_files = [p for p in source_path.iterdir() if p.is_file()]
unique_names = list(set([filepath_without_extensions(f) for f in all_files]))
unique_names = list(set([filepath_without_archive_extensions(f) for f in all_files]))

if len(unique_names) == 0:
terminate_with_message('There are no archive files present')
elif len(unique_names) > 1:
terminate_with_message(f'More than one possible archive name detected: {str(unique_names)}')
terminate_with_message(f'Automatic archive name detection has failed. More than one possible archive name detected: {str(unique_names)}\nOptionally use --archive_name to specify the archive name.')

return unique_names[0]


def filename_without_archive_extensions(path):
def filename_without_archive_extensions_multipart(path):
"""Removes known archive extensions but keeps extensions like .partX"""
name = path.name

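To illustrate the intended difference between the new helpers: filepath_without_archive_extensions strips every known archive suffix, including the .partN marker, but stops at suffixes it does not recognise, while filename_without_archive_extensions_multipart (the renamed old helper) keeps the .partX part. A minimal sketch, using hypothetical file names:

    from pathlib import Path
    from archiver import helpers

    # All known archive suffixes are stripped, including the part marker:
    print(helpers.filepath_without_archive_extensions(Path('/tmp/backup.part3.tar.lz.gpg')))
    # -> /tmp/backup

    # Unknown suffixes such as a dotted dataset name are preserved, unlike the
    # removed filename_without_extensions, which stripped every suffix:
    print(helpers.filename_without_archive_extensions(Path('dataset.v2.tar.lz')))
    # -> dataset.v2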
19 changes: 11 additions & 8 deletions archiver/integrity.py
@@ -10,7 +10,7 @@
from .listing import parse_tar_listing


def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None):
def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None, archive_name=None):

archives_with_hashes = get_archives_with_hashes_from_path(source_path)
is_encrypted = helpers.path_target_is_encrypted(source_path)
@@ -20,9 +20,9 @@ def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None):
check_result = shallow_integrity_check(archives_with_hashes, workers=threads)

if source_path.is_dir():
integrity_result = check_archive_list_integrity(source_path)
integrity_result = check_archive_list_integrity(source_path, archive_name)
else:
file_path = source_path.parent / Path(helpers.filename_without_archive_extensions(source_path))
file_path = source_path.parent / Path(helpers.filename_without_archive_extensions_multipart(source_path))
integrity_result = check_archive_part_integrity(file_path)

if not integrity_result:
@@ -74,10 +74,13 @@ def check_archive_part_integrity(source_name: Path) -> bool:

return check_result

def check_archive_list_integrity(source_path: Path) -> bool:
def check_archive_list_integrity(source_path: Path, archive_name: str = None) -> bool:

parts = helpers.get_parts(source_path)
source_name = helpers.infer_source_name(source_path)
if archive_name:
source_name = source_path / Path(archive_name)
else:
source_name = helpers.infer_source_name(source_path)

logging.info(f'Found {parts} parts in archive {source_path.as_posix()}')
check_result = True
@@ -123,7 +126,7 @@ def verify_relative_symbolic_links(archives_with_hashes):
symlink_dict = {} # all symlinks found across listing
for archive in archives_with_hashes:
part_path = archive[0]
part_listing = part_path.parent / (helpers.filename_without_archive_extensions(part_path) + LISTING_SUFFIX)
part_listing = part_path.parent / (helpers.filename_without_archive_extensions_multipart(part_path) + LISTING_SUFFIX)
entries = parse_tar_listing(part_listing)

file_set.update([str(e.path).rstrip('/') for e in entries])
@@ -234,7 +237,7 @@ def get_hashes_for_archive(archive_path):
hash_file_path = archive_path.parent / (archive_path.name + ".md5")
helpers.terminate_if_path_nonexistent(hash_file_path)

hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions(archive_path) + ".md5")
hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions_multipart(archive_path) + ".md5")
helpers.terminate_if_path_nonexistent(hash_listing_path)

return [(archive_file_path, hash_file_path, hash_listing_path)]
@@ -257,7 +260,7 @@ def get_archives_with_hashes_from_directory(source_path):
hash_path = archive.parent / (archive.name + ".md5")
helpers.terminate_if_path_nonexistent(hash_path)

hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions(archive) + ".md5")
hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions_multipart(archive) + ".md5")
helpers.terminate_if_path_nonexistent(hash_listing_path)

archive_with_hash_path = (archive, hash_path, hash_listing_path)
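The integrity check now accepts an explicit archive name, which skips the inference step that terminates when a directory yields more than one candidate stem. A minimal sketch of the two call paths (directory layout and names are hypothetical):

    from pathlib import Path
    from archiver.integrity import check_archive_list_integrity

    archive_dir = Path('/archives/project')  # hypothetical split archive directory

    # Without an explicit name the stem is inferred from the files in the
    # directory; the run terminates if more than one candidate is found:
    check_archive_list_integrity(archive_dir)

    # With an explicit name, inference is skipped and the given stem is used:
    check_archive_list_integrity(archive_dir, archive_name='project')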
2 changes: 1 addition & 1 deletion archiver/listing.py
@@ -90,7 +90,7 @@ def get_listing_files_for_path(path):

# If specific file is used, maybe not all results of search path will be shown (since they could be in different file)
helpers.file_is_valid_archive_or_terminate(path)
listing_path = path.parent / (helpers.filename_without_archive_extensions(path) + ".tar.lst")
listing_path = path.parent / (helpers.filename_without_archive_extensions_multipart(path) + ".tar.lst")
helpers.terminate_if_path_nonexistent(path)

return [listing_path]
3 changes: 2 additions & 1 deletion archiver/main.py
@@ -160,6 +160,7 @@ def parse_arguments(args):
parser_check.add_argument("archive_dir", type=str, help="Select source archive directory or .tar.lz file")
parser_check.add_argument("-d", "--deep", action="store_true", help="Verify integrity by unpacking archive and hashing each file")
parser_check.add_argument("-n", "--threads", type=int, help=thread_help)
parser_check.add_argument("--archive_name", type=str, help="Provide explicit source name of the archive (if automatic detection fails")
Contributor: Should we add a test case that covers this scenario?

(A sketch of such a test appears after this file's diff below.)

parser_check.set_defaults(func=handle_check)

# Preparation checks
Expand Down Expand Up @@ -285,7 +286,7 @@ def handle_check(args):
source_path = Path(args.archive_dir)
threads = helpers.get_threads_from_args_or_environment(args.threads)

if not check_integrity(source_path, args.deep, threads, args.work_dir):
if not check_integrity(source_path, args.deep, threads, args.work_dir, args.archive_name):
# return a different error code to the default code of 1 to be able to distinguish
# general errors from a successful run of the program with an unsuccessful outcome
# not taking 2, as it usually stands for command line argument errors
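Regarding the review question above, one option is a test that calls check_integrity directly with archive_name set. A hypothetical pytest sketch; the fixture helper, its file layout, and the assumption that check_integrity reports success truthily (as handle_check relies on) are not part of this PR:

    from archiver.integrity import check_integrity

    def test_check_integrity_with_explicit_archive_name(tmp_path):
        # create_split_archive is a hypothetical stand-in for whatever fixture the
        # test suite uses to produce foo.part1.tar.lz, foo.part1.md5, foo.md5, ...
        archive_dir = create_split_archive(tmp_path, name='foo', parts=2)

        # Pass the name explicitly instead of relying on automatic inference.
        assert check_integrity(archive_dir, deep_flag=False, threads=1, archive_name='foo')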
1 change: 1 addition & 0 deletions setup.py
@@ -36,6 +36,7 @@
install_requires=requirements,
license="MIT license",
long_description=readme,
long_description_content_type='text/markdown',
include_package_data=True,
keywords=['archiving', 'data lifecycle', 'research'],
name='project-archiver',