From efc1d336f13e71b353e8c5de08487d1869e72705 Mon Sep 17 00:00:00 2001 From: mikedarcy Date: Fri, 10 May 2024 12:36:16 -0700 Subject: [PATCH] Do _not_ revert bag when updating a bag with `strict` enabled. Update docs and unit tests. --- CHANGELOG.md | 5 +++-- bdbag/bdbag_api.py | 8 +++++--- bdbag/bdbag_cli.py | 5 +++-- doc/api.md | 28 ++++++++++++++-------------- doc/cli.md | 7 ++++--- test/test_api.py | 12 ++++++++++++ 6 files changed, 41 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f9f540..2318495 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,9 @@ encoded for the `filename` field and that whitespace (` ` and `\t`) should _only * NOTE: As a best practice, applications should always pre-encode URLs that are added to `fetch.txt` and not rely on `bdbag` to do so, since only whitespace will be encoded. * Added a new option `strict` to the `make_bag` API function, along with a corresponding CLI argument. If `strict` is enabled, `make_bag` will automatically validate a newly created or updated bag for structural validity and fail if the resultant bag is invalid. -This can be used to ensure that a bag is not persisted without payload file manifests. Additionally, if the created or -updated output bag is not structurally valid, the bag will subsequently be reverted back to a normal directory and a BagValidationError exception will be thrown. +This can be used to ensure that a bag is not persisted without payload file manifests. Additionally, if a created output +bag is not structurally valid, the bag will subsequently be reverted back to a normal directory. An updated bag will _not_ be reverted. +In either case, a BagValidationError exception will be thrown. ## 1.7.2 diff --git a/bdbag/bdbag_api.py b/bdbag/bdbag_api.py index d4e0866..7f10ef2 100644 --- a/bdbag/bdbag_api.py +++ b/bdbag/bdbag_api.py @@ -339,10 +339,12 @@ def make_bag(bag_path, try: bag._validate_structure() except bdbagit.BagValidationError as e: - error = ("The newly created/updated bag is not structurally valid and strict checking has been requested. " - "The bag will be reverted back to a normal directory. Exception: %s\n") % get_typed_exception(e) + error = ("The newly created/updated bag is not structurally valid and strict checking has been requested.%s" + " Exception: %s\n" % (" The bag will be reverted back to a normal directory." if not update else "", + get_typed_exception(e))) logger.error(error) - revert_bag(bag_path) + if not update: + revert_bag(bag_path) raise bdbagit.BagValidationError(error) return bag diff --git a/bdbag/bdbag_cli.py b/bdbag/bdbag_cli.py index 6e8aaf5..7f57537 100644 --- a/bdbag/bdbag_cli.py +++ b/bdbag/bdbag_cli.py @@ -87,8 +87,9 @@ def parse_cli(): strict_arg, action="store_true", help="Automatically validate a newly created or updated bag for structural validity and fail if the resultant " "bag is invalid. This can be used to ensure that a bag is not persisted without payload file manifests. " - "If this flag is set and the created or updated output bag is not structurally valid, the bag will " - "subsequently be reverted back to a normal directory and an error returned.") + "If this flag is set and a created output bag is not structurally valid, the bag will " + "subsequently be reverted back to a normal directory. An updated bag will not be reverted. " + "In either case, an error is returned.") revert_arg = "--revert" standard_args.add_argument( diff --git a/doc/api.md b/doc/api.md index 3900c72..52e30f7 100644 --- a/doc/api.md +++ b/doc/api.md @@ -195,21 +195,21 @@ make_bag(bag_path, Creates or updates the bag denoted by the `bag_path` argument. ##### Parameters -| Param | Type | Description | -|----------------------|-----------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| bag_path | `string` | A normalized, absolute path to a bag directory. | -| algs | `list` | A list of checksum algorithms to use for calculating file fixities. When creating a bag, only the checksums present in this variable will be used. When updating a bag, this function will take the union of any existing bag algorithms and what is specified by this parameter, ***except*** when the `prune_manifests` parameter is specified, in which case then only the algorithms specifed by this parameter will be used. | -| update | `boolean` | If `bag_path` represents an existing bag, update it. If this parameter is not specified when invoking this function on an existing bag, the function is essentially a NOOP and will emit a logging message to that effect. | -| save_manifests | `boolean` | Defaults to `True`. If true, saves all manifests, recalculating all checksums and regenerating `fetch.txt`. If false, only tagfile manifest checksums are recalculated. Use this flag as an optimization (to avoid recalculating payload file checksums) when only the bag metadata has been changed. This parameter is only meaningful during update operations, otherwise it is ignored. | -| prune_manifests | `boolean` | Removes any file and tagfile manifests for checksums that are not listed in the `algs` variable. This parameter is only meaningful during update operations, otherwise it is ignored. | -| metadata | `dict` | A dictionary of key-value pairs that will be written directly to the bag's 'bag-info.txt' file. | -| metadata_file | `string` | A JSON file representation of metadata that will be written directly to the bag's 'bag-info.txt' file. The format of this metadata is described [here](./config.md#metadata). | -| remote_file_manifest | `string` | A path to a JSON file representation of remote file entries that will be used to add remote files to the bag file manifest(s) and used to create the bag's `fetch.txt`. The format of this file is described [here](./config.md/#remote-file-manifest). | -| config_file | `string` | A JSON file representation of configuration data that is used during bag creation and update. The format of this file is described [here](./config.md#bdbag.json). | -| ro_metadata | `dict` | A dictionary that will be used to serialize data into one or more JSON files into the bag's `metadata` directory. The format of this metadata is described [here](./config.md#ro_metadata). | -| ro_metadata_file | `string` | A path to a JSON file representation of RO metadata that will be used to serialize data into one or more JSON files into the bag's `metadata` directory. The format of this metadata is described [here](./config.md#ro_metadata). | +| Param | Type | Description | +|----------------------|-----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| bag_path | `string` | A normalized, absolute path to a bag directory. | +| algs | `list` | A list of checksum algorithms to use for calculating file fixities. When creating a bag, only the checksums present in this variable will be used. When updating a bag, this function will take the union of any existing bag algorithms and what is specified by this parameter, ***except*** when the `prune_manifests` parameter is specified, in which case then only the algorithms specifed by this parameter will be used. | +| update | `boolean` | If `bag_path` represents an existing bag, update it. If this parameter is not specified when invoking this function on an existing bag, the function is essentially a NOOP and will emit a logging message to that effect. | +| save_manifests | `boolean` | Defaults to `True`. If true, saves all manifests, recalculating all checksums and regenerating `fetch.txt`. If false, only tagfile manifest checksums are recalculated. Use this flag as an optimization (to avoid recalculating payload file checksums) when only the bag metadata has been changed. This parameter is only meaningful during update operations, otherwise it is ignored. | +| prune_manifests | `boolean` | Removes any file and tagfile manifests for checksums that are not listed in the `algs` variable. This parameter is only meaningful during update operations, otherwise it is ignored. | +| metadata | `dict` | A dictionary of key-value pairs that will be written directly to the bag's 'bag-info.txt' file. | +| metadata_file | `string` | A JSON file representation of metadata that will be written directly to the bag's 'bag-info.txt' file. The format of this metadata is described [here](./config.md#metadata). | +| remote_file_manifest | `string` | A path to a JSON file representation of remote file entries that will be used to add remote files to the bag file manifest(s) and used to create the bag's `fetch.txt`. The format of this file is described [here](./config.md/#remote-file-manifest). | +| config_file | `string` | A JSON file representation of configuration data that is used during bag creation and update. The format of this file is described [here](./config.md#bdbag.json). | +| ro_metadata | `dict` | A dictionary that will be used to serialize data into one or more JSON files into the bag's `metadata` directory. The format of this metadata is described [here](./config.md#ro_metadata). | +| ro_metadata_file | `string` | A path to a JSON file representation of RO metadata that will be used to serialize data into one or more JSON files into the bag's `metadata` directory. The format of this metadata is described [here](./config.md#ro_metadata). | | idempotent | `boolean` | If `True`, date and time specific metadata such as `Bagging-Date` and `Bagging-Time` will be _removed_ (if present) from `bag-info.txt`. This value defaults to `False` if not passed via argument. However, a global override default value of `True` can be enabled in the [config file](./config.md). NOTE: use of `ro_metadata` and `ro_metadata_file` in conjunction with `idempotent` is not recommended at this time due to the generated RO Metadata not being compatible with bag idempotency. | -| strict | `boolean` | If `True`, automatically validate a newly created or updated bag for structural validity and fail if the resultant bag is invalid. This can be used to ensure that a bag is not persisted without payload file manifests. Furthermore, if this argument is `True` and the created or updated output bag is not structurally valid, the bag will subsequently be reverted back to a normal directory and a BagValidationError exception is thrown. | +| strict | `boolean` | If `True`, automatically validate a newly created or updated bag for structural validity and fail if the resultant bag is invalid. This can be used to ensure that a bag is not persisted without payload file manifests. Furthermore, if this argument is `True` and a created output bag is not structurally valid, the bag will subsequently be reverted back to a normal directory. An updated bag will not be reverted. In either case, a BagValidationError exception is thrown. | **Returns**: `bag` - An instantiated [bagit-python](https://github.com/LibraryOfCongress/bagit-python/blob/master/bagit.py) `bag` compatible class object. diff --git a/doc/cli.md b/doc/cli.md index 8bfa13c..a8ee4d3 100644 --- a/doc/cli.md +++ b/doc/cli.md @@ -88,9 +88,9 @@ Update an existing bag dir, recalculating tag-manifest checksums and regeneratin ---- #### `--strict` Automatically validate a newly created or updated bag for structural validity and fail if the resultant bag is invalid. -This can be used to ensure that a bag is not persisted without payload file manifests. If this flag is set and the -created or updated output bag is not structurally valid, the bag will subsequently be reverted back to a normal directory -and an error returned. +This can be used to ensure that a bag is not persisted without payload file manifests. If this flag is set and a +created output bag is not structurally valid, the bag will subsequently be reverted back to a normal directory. +An updated bag will _not_ be reverted. In either case, an error is returned. ---- #### `--revert` @@ -260,6 +260,7 @@ This following table enumerates the various arguments and compatibility modes. | `` | all | Required argument. When no other options are specified, creates a bag from the target path if that path is a directory and not already a bag; otherwise, if the path represents an archive file in a supported format, the file is extracted. | | `--output-path` | bag archive only | For a certain set of functions that may extract bag archive files (currently only `materialize`, `extract`, or `validate`), this argument dictates the directory where the output results of the extraction should be placed. | | `--update` | bag dir only | An existing bag archive cannot be updated in-place. The bag must first be extracted and then updated. | +| `--strict` | regular dir or bag dir only, create or update only | Strict checking is valid only when creating a new bag from a regular directory or updating an existing bag directory. | | `--revert` | bag dir only | Only a bag directory may be reverted to a non-bag directory. | | `--archiver` | bag dir only | A bag archive cannot be created from an existing bag archive. | | `--checksum` | bag dir only | A checksum manifest cannot be added to an existing bag archive. The bag must be extracted, updated, and re-archived. | diff --git a/test/test_api.py b/test/test_api.py index 7a47c56..67a9e7c 100644 --- a/test/test_api.py +++ b/test/test_api.py @@ -228,6 +228,18 @@ def test_create_bag_strict(self): except Exception as e: self.fail(get_typed_exception(e)) + def test_update_bag_strict(self): + logger.info(self.getTestHeader('update bag strict')) + try: + os.mkdir(self.test_data_dir_empty) + bdb.make_bag(self.test_data_dir_empty) + with self.assertRaises(bdbagit.BagValidationError) as ar: + bdb.make_bag(self.test_data_dir_empty, update=True, strict=True) + self.assertTrue(bdb.is_bag(self.test_data_dir_empty)) + logger.error(get_typed_exception(ar.exception)) + except Exception as e: + self.fail(get_typed_exception(e)) + def test_create_bag_idempotent(self): logger.info(self.getTestHeader('create bag idempotent')) try: