From 7efcc0e6c5a93de35962d3545fffdc182a0982a4 Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Mon, 18 Nov 2024 15:50:20 +0100
Subject: [PATCH 1/7] highlight markdown in setup.py

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index b14977a..6e2bd32 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,7 @@
     install_requires=requirements,
     license="MIT license",
     long_description=readme,
+    long_description_content_type='text/markdown',
     include_package_data=True,
     keywords=['archiving', 'data lifecycle', 'research'],
     name='project-archiver',

From 48ce9be532b100609a79bbc7610edb5144a83e4b Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Mon, 18 Nov 2024 16:55:59 +0100
Subject: [PATCH 2/7] Improve inference of archive name

- add list of allowed file name extensions
- allow to provide archive name explicitly on the command line
---
 archiver/constants.py |  2 ++
 archiver/helpers.py   | 31 ++++++++++++++++++++++++++-----
 archiver/integrity.py | 13 ++++++++-----
 archiver/main.py      |  3 ++-
 4 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/archiver/constants.py b/archiver/constants.py
index b9b4dfc..37c3f71 100644
--- a/archiver/constants.py
+++ b/archiver/constants.py
@@ -12,5 +12,7 @@
 ENCRYPTION_ALGORITHM = "AES256"
 ENV_VAR_MAPPER_MAX_CPUS = "ARCHIVER_MAX_CPUS_ENV_VAR"
 DEFAULT_COMPRESSION_LEVEL = 6
+ALLOWED_SUFFIXES = ['.part[0-9]+', '.tar', '.md5', '.lz', '.gpg', '.lst']
+ALLOWED_SUFFIXES_REG = '(' + ')|('.join(ALLOWED_SUFFIXES) + ')'

 MD5_LINE_REGEX = re.compile(r'(\S+)\s+(\S.*)')

diff --git a/archiver/helpers.py b/archiver/helpers.py
index d83bad4..3e37c2b 100644
--- a/archiver/helpers.py
+++ b/archiver/helpers.py
@@ -11,7 +11,8 @@
 import unicodedata

 from .constants import READ_CHUNK_BYTE_SIZE, COMPRESSED_ARCHIVE_SUFFIX, \
-    ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX
+    ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX, \
+    ALLOWED_SUFFIXES_REG


 def get_files_with_type_in_directory_or_terminate(directory, file_type):
@@ -339,15 +340,35 @@ def file_is_valid_archive_or_terminate(file_path):
 def filename_without_extensions(path):
-    """Removes every suffix, including .partX"""
-    suffixes_string = "".join(path.suffixes)
+    """Removes every allowed suffix, including .partX"""
+    suffixes = path.suffixes
+    if len(suffixes) > 0:
+        allowed_suffixes = []
+        for s in suffixes[::-1]:
+            if re.match(ALLOWED_SUFFIXES_REG, s.lower()):
+                allowed_suffixes.append(s)
+            else:
+                break
+        suffixes = allowed_suffixes[::-1]
+
+    suffixes_string = "".join(suffixes)

     return path.name[:-len(suffixes_string)]


 def filepath_without_extensions(path:Path) -> Path:
     """Removes every suffix, including .partX"""
-    suffixes_string = "".join(path.suffixes)
+    suffixes = path.suffixes
+    if len(suffixes) > 0:
+        allowed_suffixes = []
+        for s in suffixes[::-1]:
+            if re.match(ALLOWED_SUFFIXES_REG, s.lower()):
+                allowed_suffixes.append(s)
+            else:
+                break
+        suffixes = allowed_suffixes[::-1]
+
+    suffixes_string = "".join(suffixes)

     return path.parent / path.name[:-len(suffixes_string)]

@@ -362,7 +383,7 @@
         if len(unique_names) == 0:
             terminate_with_message('There are no archive files present')
         elif len(unique_names) > 1:
-            terminate_with_message(f'More than one possible archive name detected: {str(unique_names)}')
+            terminate_with_message(f'Automatic archive name detection has failed. More than one possible archive name detected: {str(unique_names)}\n optionally use --archive_name to specific archive name.')

         return unique_names[0]

diff --git a/archiver/integrity.py b/archiver/integrity.py
index e765a58..7656b23 100644
--- a/archiver/integrity.py
+++ b/archiver/integrity.py
@@ -10,7 +10,7 @@
 from .listing import parse_tar_listing


-def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None):
+def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None, archive_name=None):
     archives_with_hashes = get_archives_with_hashes_from_path(source_path)
     is_encrypted = helpers.path_target_is_encrypted(source_path)
@@ -20,10 +20,10 @@ def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None):
     check_result = shallow_integrity_check(archives_with_hashes, workers=threads)

     if source_path.is_dir():
-        integrity_result = check_archive_list_integrity(source_path)
+        integrity_result = check_archive_list_integrity(source_path, archive_name)
     else:
         file_path = source_path.parent / Path(helpers.filename_without_archive_extensions(source_path))
-        integrity_result = check_archive_part_integrity(file_path)
+        integrity_result = check_archive_part_integrity(file_path, archive_name)

     if not integrity_result:
         logging.error(
@@ -74,10 +74,13 @@ def check_archive_part_integrity(source_name: Path) -> bool:
     return check_result


-def check_archive_list_integrity(source_path: Path) -> bool:
+def check_archive_list_integrity(source_path: Path, archive_name: str = None) -> bool:
     parts = helpers.get_parts(source_path)

-    source_name = helpers.infer_source_name(source_path)
+    if archive_name is None:
+        source_name = helpers.infer_source_name(source_path)
+    else:
+        source_name = source_path / Path(archive_name)

     logging.info(f'Found {parts} parts in archive {source_path.as_posix()}')

     check_result = True

diff --git a/archiver/main.py b/archiver/main.py
index 08c397b..05e4a5a 100644
--- a/archiver/main.py
+++ b/archiver/main.py
@@ -160,6 +160,7 @@
     parser_check.add_argument("archive_dir", type=str, help="Select source archive directory or .tar.lz file")
     parser_check.add_argument("-d", "--deep", action="store_true", help="Verify integrity by unpacking archive and hashing each file")
     parser_check.add_argument("-n", "--threads", type=int, help=thread_help)
+    parser_check.add_argument("--archive_name", type=str, help="Provide explicit source name of the archive (if automatic detection fails)")
     parser_check.set_defaults(func=handle_check)

     # Preparation checks
@@ -285,7 +286,7 @@ def handle_check(args):
     source_path = Path(args.archive_dir)
     threads = helpers.get_threads_from_args_or_environment(args.threads)

-    if not check_integrity(source_path, args.deep, threads, args.work_dir):
+    if not check_integrity(source_path, args.deep, threads, args.work_dir, args.archive_name):
        # return a different error code to the default code of 1 to be able to distinguish
        # general errors from a successful run of the program with an unsuccessful outcome
        # not taking 2, as it usually stands for command line argument errors

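Illustration, not part of the patch series: PATCH 2/7 makes the archive-name inference rely on a whitelist of archiving suffixes instead of stripping every suffix. The sketch below rebuilds ALLOWED_SUFFIXES_REG exactly as in archiver/constants.py above and uses a stand-in function, strip_allowed_suffixes, to show the intended behaviour of the updated filename_without_extensions; the file names are invented examples.

    # Sketch of the right-to-left suffix walk introduced in PATCH 2/7.
    import re
    from pathlib import Path

    ALLOWED_SUFFIXES = ['.part[0-9]+', '.tar', '.md5', '.lz', '.gpg', '.lst']
    ALLOWED_SUFFIXES_REG = '(' + ')|('.join(ALLOWED_SUFFIXES) + ')'

    def strip_allowed_suffixes(path: Path) -> str:
        # Walk the suffixes from right to left and drop them while they match
        # the whitelist; stop at the first suffix that belongs to the real name.
        dropped = []
        for s in reversed(path.suffixes):
            if re.match(ALLOWED_SUFFIXES_REG, s.lower()):
                dropped.append(s)
            else:
                break
        dropped_string = "".join(reversed(dropped))
        return path.name[:-len(dropped_string)] if dropped_string else path.name

    print(strip_allowed_suffixes(Path("backup.part3.tar.lz.gpg")))  # backup
    print(strip_allowed_suffixes(Path("dataset.v2.tar.lz")))        # dataset.v2

A name like dataset.v2.tar.lz now keeps its .v2 component, which the previous blanket "".join(path.suffixes) approach would have stripped as well.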
From 5f8760453985c87c7a4c881876ce00719adc0381 Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Mon, 18 Nov 2024 18:08:24 +0100
Subject: [PATCH 3/7] forgot to add parts.txt to allowed extension list

---
 archiver/constants.py | 2 +-
 archiver/integrity.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/archiver/constants.py b/archiver/constants.py
index 37c3f71..33a459f 100644
--- a/archiver/constants.py
+++ b/archiver/constants.py
@@ -12,7 +12,7 @@
 ENCRYPTION_ALGORITHM = "AES256"
 ENV_VAR_MAPPER_MAX_CPUS = "ARCHIVER_MAX_CPUS_ENV_VAR"
 DEFAULT_COMPRESSION_LEVEL = 6
-ALLOWED_SUFFIXES = ['.part[0-9]+', '.tar', '.md5', '.lz', '.gpg', '.lst']
+ALLOWED_SUFFIXES = ['.part[0-9]+', '.tar', '.md5', '.lz', '.gpg', '.lst', '.parts', '.txt']
 ALLOWED_SUFFIXES_REG = '(' + ')|('.join(ALLOWED_SUFFIXES) + ')'

 MD5_LINE_REGEX = re.compile(r'(\S+)\s+(\S.*)')

diff --git a/archiver/integrity.py b/archiver/integrity.py
index 7656b23..e6c31ac 100644
--- a/archiver/integrity.py
+++ b/archiver/integrity.py
@@ -23,7 +23,7 @@ def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None, a
         integrity_result = check_archive_list_integrity(source_path, archive_name)
     else:
         file_path = source_path.parent / Path(helpers.filename_without_archive_extensions(source_path))
-        integrity_result = check_archive_part_integrity(file_path, archive_name)
+        integrity_result = check_archive_part_integrity(file_path)

     if not integrity_result:
         logging.error(

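Illustration, not part of the patch series: PATCH 3/7 is needed because pathlib treats a parts listing such as backup.parts.txt as two separate suffixes, so both '.parts' and '.txt' must be whitelisted before the inference can strip them. The file names below are assumed examples.

    from pathlib import Path

    print(Path("backup.parts.txt").suffixes)     # ['.parts', '.txt']
    print(Path("backup.part1.tar.lz").suffixes)  # ['.part1', '.tar', '.lz']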
From c2451cb0462845599bcdd2ea92ffe7b2035f49fe Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Tue, 19 Nov 2024 11:17:12 +0100
Subject: [PATCH 4/7] Apply suggestions from code review

Co-authored-by: Oleksandr Kulkov
---
 archiver/constants.py |  4 ++--
 archiver/helpers.py   | 40 ++++++++++------------------------------
 2 files changed, 12 insertions(+), 32 deletions(-)

diff --git a/archiver/constants.py b/archiver/constants.py
index 33a459f..728c7b5 100644
--- a/archiver/constants.py
+++ b/archiver/constants.py
@@ -12,7 +12,7 @@
 ENCRYPTION_ALGORITHM = "AES256"
 ENV_VAR_MAPPER_MAX_CPUS = "ARCHIVER_MAX_CPUS_ENV_VAR"
 DEFAULT_COMPRESSION_LEVEL = 6
-ALLOWED_SUFFIXES = ['.part[0-9]+', '.tar', '.md5', '.lz', '.gpg', '.lst', '.parts', '.txt']
-ALLOWED_SUFFIXES_REG = '(' + ')|('.join(ALLOWED_SUFFIXES) + ')'
+ARCHIVE_SUFFIXES = ['\.part[0-9]+', '\.tar', '\.md5', '\.lz', '\.gpg', '\.lst', '\.parts', '\.txt']
+ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'

 MD5_LINE_REGEX = re.compile(r'(\S+)\s+(\S.*)')

diff --git a/archiver/helpers.py b/archiver/helpers.py
index 3e37c2b..103d477 100644
--- a/archiver/helpers.py
+++ b/archiver/helpers.py
@@ -12,7 +12,7 @@
 from .constants import READ_CHUNK_BYTE_SIZE, COMPRESSED_ARCHIVE_SUFFIX, \
     ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX, \
-    ALLOWED_SUFFIXES_REG
+    ARCHIVE_SUFFIXES_REG


 def get_files_with_type_in_directory_or_terminate(directory, file_type):
@@ -339,36 +339,16 @@ def file_is_valid_archive_or_terminate(file_path):
     terminate_with_message(f"File {file_path.as_posix()} is not a valid archive of type {COMPRESSED_ARCHIVE_SUFFIX} or {ENCRYPTED_ARCHIVE_SUFFIX} or doesn't exist.")


-def filename_without_extensions(path):
-    """Removes every allowed suffix, including .partX"""
-    suffixes = path.suffixes
-    if len(suffixes) > 0:
-        allowed_suffixes = []
-        for s in suffixes[::-1]:
-            if re.match(ALLOWED_SUFFIXES_REG, s.lower()):
-                allowed_suffixes.append(s)
-            else:
-                break
-        suffixes = allowed_suffixes[::-1]
-
-    suffixes_string = "".join(suffixes)
-
-    return path.name[:-len(suffixes_string)]
-
+def filename_without_archive_extensions(path):
+    """Removes every archiving suffix"""
+    return filepath_without_archive_extensions(path).name

-def filepath_without_extensions(path:Path) -> Path:
-    """Removes every suffix, including .partX"""
-    suffixes = path.suffixes
-    if len(suffixes) > 0:
-        allowed_suffixes = []
-        for s in suffixes[::-1]:
-            if re.match(ALLOWED_SUFFIXES_REG, s.lower()):
-                allowed_suffixes.append(s)
-            else:
-                break
-        suffixes = allowed_suffixes[::-1]
-
-    suffixes_string = "".join(suffixes)
+def filepath_without_archive_extensions(path:Path) -> Path:
+    """Removes every archiving suffix"""
+    while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
+        path = path.with_suffix('')
+    return path

     return path.parent / path.name[:-len(suffixes_string)]

@@ -383,7 +363,7 @@
         if len(unique_names) == 0:
             terminate_with_message('There are no archive files present')
         elif len(unique_names) > 1:
-            terminate_with_message(f'Automatic archive name detection has failed. More than one possible archive name detected: {str(unique_names)}\n optionally use --archive_name to specific archive name.')
+            terminate_with_message(f'Automatic archive name detection has failed. More than one possible archive name detected: {str(unique_names)}\noptionally use --archive_name to specify archive name.')

         return unique_names[0]

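Illustration, not part of the patch series: PATCH 4/7 collapses the two suffix-stripping helpers into a single while loop over an end-anchored regular expression. The sketch below mirrors that loop; it deliberately uses raw strings for the escapes, whereas the patch itself writes plain '\.' literals, which behave the same but emit invalid-escape warnings on recent Python versions.

    import re
    from pathlib import Path

    ARCHIVE_SUFFIXES = [r'\.part[0-9]+', r'\.tar', r'\.md5', r'\.lz', r'\.gpg',
                        r'\.lst', r'\.parts', r'\.txt']
    ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'

    def filepath_without_archive_extensions(path: Path) -> Path:
        # Keep dropping the last suffix while it is a known archiving suffix.
        # The '$' anchor on every alternative stops a suffix like '.tarball'
        # from being treated as '.tar'.
        while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
            path = path.with_suffix('')
        return path

    print(filepath_without_archive_extensions(Path("data/backup.part2.tar.lz.gpg")))  # data/backup
    print(filepath_without_archive_extensions(Path("data/notes.2024.txt")))           # data/notes.2024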
From 58a36e1337ec3ecd7765e9412e15201569799e6a Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Tue, 19 Nov 2024 11:17:40 +0100
Subject: [PATCH 5/7] Apply suggestions from code review

Co-authored-by: Oleksandr Kulkov
---
 archiver/integrity.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/archiver/integrity.py b/archiver/integrity.py
index e6c31ac..c6c79f2 100644
--- a/archiver/integrity.py
+++ b/archiver/integrity.py
@@ -77,10 +77,10 @@ def check_archive_part_integrity(source_name: Path) -> bool:
 def check_archive_list_integrity(source_path: Path, archive_name: str = None) -> bool:
     parts = helpers.get_parts(source_path)

-    if archive_name is None:
-        source_name = helpers.infer_source_name(source_path)
-    else:
+    if archive_name:
         source_name = source_path / Path(archive_name)
+    else:
+        source_name = helpers.infer_source_name(source_path)

     logging.info(f'Found {parts} parts in archive {source_path.as_posix()}')

     check_result = True

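Illustration, not part of the patch series: with PATCH 2/7 through 5/7 applied, the integrity check accepts an explicit archive name and only falls back to automatic inference when none is given. A possible call site, assuming the package is importable as archiver and using made-up paths:

    from pathlib import Path
    from archiver.integrity import check_integrity

    ok = check_integrity(
        Path("/data/archives/project_x"),  # directory holding the split archive
        deep_flag=False,                   # shallow check against the stored md5 files
        threads=4,
        archive_name="project_x",          # bypasses infer_source_name()
    )
    print("integrity check passed" if ok else "integrity check failed")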
From 169f07edce79131cd6fda7ac2db20a15958a6a89 Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Tue, 19 Nov 2024 12:32:03 +0100
Subject: [PATCH 6/7] Resolved naming conflict of filename_without_archive_extensions

- renamed the already existing filename_without_archive_extensions to
  filename_without_archive_extensions_multipart
---
 archiver/extract.py   | 2 +-
 archiver/helpers.py   | 6 +++---
 archiver/integrity.py | 8 ++++----
 archiver/listing.py   | 2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/archiver/extract.py b/archiver/extract.py
index cb3ca7b..e9658f8 100644
--- a/archiver/extract.py
+++ b/archiver/extract.py
@@ -64,7 +64,7 @@ def extract_archive(source_path, destination_directory_path, partial_extraction_
         uncompress_and_extract(archive_files, destination_directory_path, threads, partial_extraction_path=partial_extraction_path)

     logging.info("Archive extracted to: " + helpers.get_absolute_path_string(destination_directory_path))
-    return destination_directory_path / helpers.filename_without_extensions(source_path)
+    return destination_directory_path / helpers.filename_without_archive_extensions(source_path)


 def uncompress_and_extract(archive_file_paths, destination_directory_path, threads, partial_extraction_path=None, encrypted=False):

diff --git a/archiver/helpers.py b/archiver/helpers.py
index 103d477..dd5a410 100644
--- a/archiver/helpers.py
+++ b/archiver/helpers.py
@@ -355,10 +355,10 @@ def filepath_without_archive_extensions(path:Path) -> Path:
 def infer_source_name(source_path: Path) -> Path:

     if not source_path.is_dir():
-        return filepath_without_extensions(source_path)
+        return filepath_without_archive_extensions(source_path)
     else:
         all_files = [p for p in source_path.iterdir() if p.is_file()]
-        unique_names = list(set([filepath_without_extensions(f) for f in all_files]))
+        unique_names = list(set([filepath_without_archive_extensions(f) for f in all_files]))

         if len(unique_names) == 0:
             terminate_with_message('There are no archive files present')
@@ -368,7 +368,7 @@ def infer_source_name(source_path: Path) -> Path:
         return unique_names[0]


-def filename_without_archive_extensions(path):
+def filename_without_archive_extensions_multipart(path):
     """Removes known archive extensions but keeps extensions like .partX"""
     name = path.name

diff --git a/archiver/integrity.py b/archiver/integrity.py
index c6c79f2..9f66b33 100644
--- a/archiver/integrity.py
+++ b/archiver/integrity.py
@@ -22,7 +22,7 @@ def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None, a
     if source_path.is_dir():
         integrity_result = check_archive_list_integrity(source_path, archive_name)
     else:
-        file_path = source_path.parent / Path(helpers.filename_without_archive_extensions(source_path))
+        file_path = source_path.parent / Path(helpers.filename_without_archive_extensions_multipart(source_path))
         integrity_result = check_archive_part_integrity(file_path)

     if not integrity_result:
         logging.error(
@@ -126,7 +126,7 @@ def verify_relative_symbolic_links(archives_with_hashes):
     symlink_dict = {}  # all symlinks found across listing

     for archive in archives_with_hashes:
         part_path = archive[0]
-        part_listing = part_path.parent / (helpers.filename_without_archive_extensions(part_path) + LISTING_SUFFIX)
+        part_listing = part_path.parent / (helpers.filename_without_archive_extensions_multipart(part_path) + LISTING_SUFFIX)
         entries = parse_tar_listing(part_listing)

         file_set.update([str(e.path).rstrip('/') for e in entries])
@@ -237,7 +237,7 @@ def get_hashes_for_archive(archive_path):
     hash_file_path = archive_path.parent / (archive_path.name + ".md5")
     helpers.terminate_if_path_nonexistent(hash_file_path)

-    hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions(archive_path) + ".md5")
+    hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions_multipart(archive_path) + ".md5")
     helpers.terminate_if_path_nonexistent(hash_listing_path)

     return [(archive_file_path, hash_file_path, hash_listing_path)]
@@ -260,7 +260,7 @@ def get_archives_with_hashes_from_directory(source_path):
         hash_path = archive.parent / (archive.name + ".md5")
         helpers.terminate_if_path_nonexistent(hash_path)

-        hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions(archive) + ".md5")
+        hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions_multipart(archive) + ".md5")
         helpers.terminate_if_path_nonexistent(hash_listing_path)

         archive_with_hash_path = (archive, hash_path, hash_listing_path)

diff --git a/archiver/listing.py b/archiver/listing.py
index 4c04b29..25b0ca5 100644
--- a/archiver/listing.py
+++ b/archiver/listing.py
@@ -90,7 +90,7 @@ def get_listing_files_for_path(path):
        # If specific file is used, maybe not all results of search path will be shown (since they could be in different file)
        helpers.file_is_valid_archive_or_terminate(path)

-       listing_path = path.parent / (helpers.filename_without_archive_extensions(path) + ".tar.lst")
+       listing_path = path.parent / (helpers.filename_without_archive_extensions_multipart(path) + ".tar.lst")
        helpers.terminate_if_path_nonexistent(path)

        return [listing_path]

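Illustration, not part of the patch series: after PATCH 6/7 there are two deliberately different helpers. filepath_without_archive_extensions strips every archiving suffix including .partN, while the renamed filename_without_archive_extensions_multipart keeps the .partN component so that per-part sidecar files (name.partN.md5, name.partN.tar.lst) can be located. Its full body is not visible in this series, so multipart_stem below is a hypothetical stand-in written against its docstring.

    import re
    from pathlib import Path

    # Hypothetical stand-in: strip known archive suffixes but keep '.partN'.
    MULTIPART_KEEP_REG = r'\.tar$|\.md5$|\.lz$|\.gpg$|\.lst$'  # note: no .partN alternative

    def multipart_stem(path: Path) -> str:
        while re.match(MULTIPART_KEEP_REG, path.suffix):
            path = path.with_suffix('')
        return path.name

    print(multipart_stem(Path("backup.part1.tar.lz")))  # backup.part1
    print(multipart_stem(Path("backup.tar.lz")))        # backup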
From 3e8d3457f49a34cd211c6eaaea5ed384c7183b08 Mon Sep 17 00:00:00 2001
From: Andre Kahles
Date: Tue, 19 Nov 2024 14:47:55 +0100
Subject: [PATCH 7/7] addressing review comments

---
 archiver/helpers.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/archiver/helpers.py b/archiver/helpers.py
index dd5a410..f09c28a 100644
--- a/archiver/helpers.py
+++ b/archiver/helpers.py
@@ -339,18 +339,17 @@ def file_is_valid_archive_or_terminate(file_path):
     terminate_with_message(f"File {file_path.as_posix()} is not a valid archive of type {COMPRESSED_ARCHIVE_SUFFIX} or {ENCRYPTED_ARCHIVE_SUFFIX} or doesn't exist.")


-def filename_without_archive_extensions(path):
-    """Removes every archiving suffix"""
-    return filepath_without_archive_extensions(path).name
-
-
 def filepath_without_archive_extensions(path:Path) -> Path:
     """Removes every archiving suffix"""
     while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
         path = path.with_suffix('')
     return path

-    return path.parent / path.name[:-len(suffixes_string)]
+
+def filename_without_archive_extensions(path):
+    """Removes every archiving suffix"""
+    return filepath_without_archive_extensions(path).name
+

 def infer_source_name(source_path: Path) -> Path:

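Illustration, not part of the patch series: an end-state sanity check for the whole series, assuming the package is importable as archiver. The assertions only exercise behaviour that is fully visible in PATCH 4/7 and 7/7.

    from pathlib import Path
    from archiver import helpers

    assert helpers.filepath_without_archive_extensions(Path("a/backup.part1.tar.lz")) == Path("a/backup")
    assert helpers.filename_without_archive_extensions(Path("a/backup.part1.tar.lz")) == "backup"

    # infer_source_name() on a single file reuses the same stripping logic:
    assert helpers.infer_source_name(Path("a/backup.tar.lz.md5")) == Path("a/backup")
    print("helpers behave as expected")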