diff --git a/archiver/constants.py b/archiver/constants.py
index b9b4dfc..728c7b5 100644
--- a/archiver/constants.py
+++ b/archiver/constants.py
@@ -12,5 +12,7 @@
 ENCRYPTION_ALGORITHM = "AES256"
 ENV_VAR_MAPPER_MAX_CPUS = "ARCHIVER_MAX_CPUS_ENV_VAR"
 DEFAULT_COMPRESSION_LEVEL = 6
+ARCHIVE_SUFFIXES = [r'\.part[0-9]+', r'\.tar', r'\.md5', r'\.lz', r'\.gpg', r'\.lst', r'\.parts', r'\.txt']
+ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'
 
 MD5_LINE_REGEX = re.compile(r'(\S+)\s+(\S.*)')
diff --git a/archiver/extract.py b/archiver/extract.py
index cb3ca7b..e9658f8 100644
--- a/archiver/extract.py
+++ b/archiver/extract.py
@@ -64,7 +64,7 @@ def extract_archive(source_path, destination_directory_path, partial_extraction_
     uncompress_and_extract(archive_files, destination_directory_path, threads, partial_extraction_path=partial_extraction_path)
 
     logging.info("Archive extracted to: " + helpers.get_absolute_path_string(destination_directory_path))
-    return destination_directory_path / helpers.filename_without_extensions(source_path)
+    return destination_directory_path / helpers.filename_without_archive_extensions(source_path)
 
 
 def uncompress_and_extract(archive_file_paths, destination_directory_path, threads, partial_extraction_path=None, encrypted=False):
diff --git a/archiver/helpers.py b/archiver/helpers.py
index d83bad4..f09c28a 100644
--- a/archiver/helpers.py
+++ b/archiver/helpers.py
@@ -11,7 +11,8 @@
 import unicodedata
 
 from .constants import READ_CHUNK_BYTE_SIZE, COMPRESSED_ARCHIVE_SUFFIX, \
-    ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX
+    ENCRYPTED_ARCHIVE_SUFFIX, ENV_VAR_MAPPER_MAX_CPUS, MD5_LINE_REGEX, \
+    ARCHIVE_SUFFIXES_REG
 
 
 def get_files_with_type_in_directory_or_terminate(directory, file_type):
@@ -338,36 +339,35 @@ def file_is_valid_archive_or_terminate(file_path):
         terminate_with_message(f"File {file_path.as_posix()} is not a valid archive of type {COMPRESSED_ARCHIVE_SUFFIX} or {ENCRYPTED_ARCHIVE_SUFFIX} or doesn't exist.")
 
 
-def filename_without_extensions(path):
-    """Removes every suffix, including .partX"""
-    suffixes_string = "".join(path.suffixes)
+def filepath_without_archive_extensions(path: Path) -> Path:
+    """Removes every known archive suffix"""
+    while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
+        path = path.with_suffix('')
+    return path
 
-    return path.name[:-len(suffixes_string)]
 
+def filename_without_archive_extensions(path):
+    """Removes every known archive suffix and returns the file name"""
+    return filepath_without_archive_extensions(path).name
 
-def filepath_without_extensions(path:Path) -> Path:
-    """Removes every suffix, including .partX"""
-    suffixes_string = "".join(path.suffixes)
-
-    return path.parent / path.name[:-len(suffixes_string)]
 
 def infer_source_name(source_path: Path) -> Path:
     if not source_path.is_dir():
-        return filepath_without_extensions(source_path)
+        return filepath_without_archive_extensions(source_path)
     else:
         all_files = [p for p in source_path.iterdir() if p.is_file()]
-        unique_names = list(set([filepath_without_extensions(f) for f in all_files]))
+        unique_names = list(set([filepath_without_archive_extensions(f) for f in all_files]))
 
         if len(unique_names) == 0:
             terminate_with_message('There are no archive files present')
         elif len(unique_names) > 1:
-            terminate_with_message(f'More than one possible archive name detected: {str(unique_names)}')
+            terminate_with_message(f'Automatic archive name detection has failed. More than one possible archive name detected: {str(unique_names)}\nOptionally use --archive_name to specify the archive name.')
 
     return unique_names[0]
 
 
-def filename_without_archive_extensions(path):
+def filename_without_archive_extensions_multipart(path):
     """Removes known archive extensions but keeps extensions like .partX"""
     name = path.name
diff --git a/archiver/integrity.py b/archiver/integrity.py
index e765a58..9f66b33 100644
--- a/archiver/integrity.py
+++ b/archiver/integrity.py
@@ -10,7 +10,7 @@
 from .listing import parse_tar_listing
 
 
-def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None):
+def check_integrity(source_path, deep_flag=False, threads=None, work_dir=None, archive_name=None):
     archives_with_hashes = get_archives_with_hashes_from_path(source_path)
     is_encrypted = helpers.path_target_is_encrypted(source_path)
 
@@ -20,9 +20,9 @@
     check_result = shallow_integrity_check(archives_with_hashes, workers=threads)
 
     if source_path.is_dir():
-        integrity_result = check_archive_list_integrity(source_path)
+        integrity_result = check_archive_list_integrity(source_path, archive_name)
     else:
-        file_path = source_path.parent / Path(helpers.filename_without_archive_extensions(source_path))
+        file_path = source_path.parent / Path(helpers.filename_without_archive_extensions_multipart(source_path))
         integrity_result = check_archive_part_integrity(file_path)
 
     if not integrity_result:
@@ -74,10 +74,13 @@ def check_archive_part_integrity(source_name: Path) -> bool:
     return check_result
 
 
-def check_archive_list_integrity(source_path: Path) -> bool:
+def check_archive_list_integrity(source_path: Path, archive_name: str = None) -> bool:
     parts = helpers.get_parts(source_path)
-    source_name = helpers.infer_source_name(source_path)
+    if archive_name:
+        source_name = source_path / Path(archive_name)
+    else:
+        source_name = helpers.infer_source_name(source_path)
 
     logging.info(f'Found {parts} parts in archive {source_path.as_posix()}')
     check_result = True
@@ -123,7 +126,7 @@ def verify_relative_symbolic_links(archives_with_hashes):
     symlink_dict = {}  # all symlinks found across listing
     for archive in archives_with_hashes:
         part_path = archive[0]
-        part_listing = part_path.parent / (helpers.filename_without_archive_extensions(part_path) + LISTING_SUFFIX)
+        part_listing = part_path.parent / (helpers.filename_without_archive_extensions_multipart(part_path) + LISTING_SUFFIX)
         entries = parse_tar_listing(part_listing)
 
         file_set.update([str(e.path).rstrip('/') for e in entries])
@@ -234,7 +237,7 @@ def get_hashes_for_archive(archive_path):
     hash_file_path = archive_path.parent / (archive_path.name + ".md5")
     helpers.terminate_if_path_nonexistent(hash_file_path)
 
-    hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions(archive_path) + ".md5")
+    hash_listing_path = archive_path.parent / (helpers.filename_without_archive_extensions_multipart(archive_path) + ".md5")
     helpers.terminate_if_path_nonexistent(hash_listing_path)
 
     return [(archive_file_path, hash_file_path, hash_listing_path)]
@@ -257,7 +260,7 @@ def get_archives_with_hashes_from_directory(source_path):
         hash_path = archive.parent / (archive.name + ".md5")
         helpers.terminate_if_path_nonexistent(hash_path)
 
-        hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions(archive) + ".md5")
+        hash_listing_path = Path(archive.parent) / (helpers.filename_without_archive_extensions_multipart(archive) + ".md5")
         helpers.terminate_if_path_nonexistent(hash_listing_path)
 
         archive_with_hash_path = (archive, hash_path, hash_listing_path)
diff --git a/archiver/listing.py b/archiver/listing.py
index 4c04b29..25b0ca5 100644
--- a/archiver/listing.py
+++ b/archiver/listing.py
@@ -90,7 +90,7 @@ def get_listing_files_for_path(path):
         # If specific file is used, maybe not all results of search path will be shown (since they could be in different file)
         helpers.file_is_valid_archive_or_terminate(path)
 
-        listing_path = path.parent / (helpers.filename_without_archive_extensions(path) + ".tar.lst")
+        listing_path = path.parent / (helpers.filename_without_archive_extensions_multipart(path) + ".tar.lst")
         helpers.terminate_if_path_nonexistent(path)
 
         return [listing_path]
diff --git a/archiver/main.py b/archiver/main.py
index 08c397b..05e4a5a 100644
--- a/archiver/main.py
+++ b/archiver/main.py
@@ -160,6 +160,7 @@
     parser_check.add_argument("archive_dir", type=str, help="Select source archive directory or .tar.lz file")
     parser_check.add_argument("-d", "--deep", action="store_true", help="Verify integrity by unpacking archive and hashing each file")
     parser_check.add_argument("-n", "--threads", type=int, help=thread_help)
+    parser_check.add_argument("--archive_name", type=str, help="Provide the explicit source name of the archive (if automatic detection fails)")
     parser_check.set_defaults(func=handle_check)
 
     # Preparation checks
@@ -285,7 +286,7 @@ def handle_check(args):
     source_path = Path(args.archive_dir)
     threads = helpers.get_threads_from_args_or_environment(args.threads)
 
-    if not check_integrity(source_path, args.deep, threads, args.work_dir):
+    if not check_integrity(source_path, args.deep, threads, args.work_dir, args.archive_name):
         # return a different error code to the default code of 1 to be able to distinguish
         # general errors from a successful run of the program with an unsuccessful outcome
         # not taking 2, as it usually stands for command line argument errors
diff --git a/setup.py b/setup.py
index b14977a..6e2bd32 100644
--- a/setup.py
+++ b/setup.py
@@ -36,6 +36,7 @@
     install_requires=requirements,
     license="MIT license",
     long_description=readme,
+    long_description_content_type='text/markdown',
     include_package_data=True,
     keywords=['archiving', 'data lifecycle', 'research'],
     name='project-archiver',
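
Note (illustration only, not part of the patch): the new filepath_without_archive_extensions
loop peels matching suffixes off the right-hand end one at a time, so stacked archive
extensions such as .tar.lz.gpg are removed while unrelated dotted name parts survive.
A minimal, self-contained sketch of that behavior, with the two constants copied from the
constants.py hunk above:

    from pathlib import Path
    import re

    ARCHIVE_SUFFIXES = [r'\.part[0-9]+', r'\.tar', r'\.md5', r'\.lz',
                        r'\.gpg', r'\.lst', r'\.parts', r'\.txt']
    # Each alternative is anchored with '$' so only a complete suffix matches.
    ARCHIVE_SUFFIXES_REG = '$|'.join(ARCHIVE_SUFFIXES) + '$'

    def filepath_without_archive_extensions(path: Path) -> Path:
        # Strip one matching suffix per iteration, right to left.
        while re.match(ARCHIVE_SUFFIXES_REG, path.suffix):
            path = path.with_suffix('')
        return path

    # Stacked archive suffixes, including .partN, are all removed:
    assert filepath_without_archive_extensions(Path('data.part3.tar.lz.gpg')) == Path('data')
    # A non-archive suffix stops the stripping, so dotted base names survive:
    assert filepath_without_archive_extensions(Path('report.v2.tar')) == Path('report.v2')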
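
Usage note (hypothetical invocation; the installed console-script name is not shown in this
patch and is assumed here to be archiver): with the new flag, a check whose automatic name
detection fails can be rerun with an explicit name, e.g.

    archiver check --archive_name data /path/to/archive_dir

where 'data' is the bare archive name, i.e. the part file names with their archive
suffixes stripped.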