Skip to content

Commit e073a10

Browse files
committed
Resolutions for #13 and #21
- Resolve #21 with a new driver program `aipsip` that generates the AIP and then uses it to make the SIP as well, leaving all of the output in the current working directory (along with two—count 'em, *two*—PDS labels for the price of one!). - Update the Python `setuptools` metadata to generate the new `aipsip` (helps with #21). - Refactor logging and command-line argument setup (also for #21). - Unify logging between `aipgen` and `sipgen` with the new `aipsip` so that there are `--debug` and `--quiet` options; without either you get a nominal amount of "hand-holding" of output. - Resolve #13 so that instead of billions of redundant XML parses and XPath lookups we use a local `sqlite3` database and LRU caching. - Factor out XML parsing from `aipgen` and `sipgen` so we can apply caching. - Clear up logging messages so we can know what's calling what. - Create a temp DB in `sipgen` and populate it with mappings from lidvids to XML files for rapid lookups. - But see also #25 for other uses of that DB. - Add standardized `--version` arguments for all three programs. With these changes, running `sipgen` on my Mac¹ can process a 272GiB `insight_cameras` export in 1:03. On `pdsimg-int1`², it handles the 1.5TiB `insight_cameras` dataset in under 4 hours. Footnotes: - ¹2.4 GHz 8-core Intel Core i9, SSD - ²2.3 GHz 8-core Intel Xeon Gold 6140, unknown drive
1 parent cb41ecd commit e073a10

File tree

6 files changed

+275
-140
lines changed

6 files changed

+275
-140
lines changed

setup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@
6464
entry_points={
6565
'console_scripts': [
6666
'sipgen=pds.aipgen.sip:main',
67-
'aipgen=pds.aipgen.aip:main'
67+
'aipgen=pds.aipgen.aip:main',
68+
'aipsip=pds.aipgen.main:main'
6869
]
6970
},
7071
namespace_packages=['pds'],

src/pds/aipgen/aip.py

+15-20
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131

3232

3333
from .constants import PDS_NS_URI, XML_SCHEMA_INSTANCE_NS_URI, PDS_SCHEMA_URL, XML_MODEL_PI, INFORMATION_MODEL_VERSION
34-
from .utils import getPrimariesAndOtherInfo, getMD5
34+
from .utils import getPrimariesAndOtherInfo, getMD5, parseXML, addLoggingArguments
3535
from datetime import datetime
3636
from lxml import etree
3737
import argparse, logging, sys, os, os.path, hashlib
@@ -41,6 +41,7 @@
4141
# ---------
4242

4343
# For ``--help``:
44+
_version = '0.0.0'
4445
_description = '''Generate an Archive Information Package or AIP. An AIP consists of three files:
4546
➀ a "checksum manifest" which contains MD5 hashes of *all* files in a product;
4647
➁ a "transfer manifest" which lists the "lidvids" for files within each XML label mentioned in a product; and
@@ -53,12 +54,8 @@
5354
# Comment to insert near the top of an AIP XML label
5455
_iaComment = 'Parse name from bundle logical_identifier, e.g. urn:nasa:pds:ladee_mission_bundle would be ladee_mission_bundle'
5556

56-
57-
# Logging
58-
# -------
59-
57+
# Logging:
6058
_logger = logging.getLogger(__name__)
61-
logging.basicConfig(format='%(levelname)s %(message)s', level=logging.INFO)
6259

6360

6461
# Functions
@@ -94,7 +91,7 @@ def _getLIDVIDandFileInventory(xmlFile):
9491
identifier, return None and None.
9592
'''
9693
_logger.debug('📜 Analyzing XML in %s', xmlFile)
97-
tree = etree.parse(xmlFile)
94+
tree = parseXML(xmlFile)
9895
root = tree.getroot()
9996
matches = root.findall(f'./{{{PDS_NS_URI}}}Identification_Area/{{{PDS_NS_URI}}}logical_identifier')
10097
if not matches:
@@ -291,10 +288,10 @@ def _writeLabel(
291288
tree.write(labelOutputFile, encoding='utf-8', xml_declaration=True, pretty_print=True)
292289

293290

294-
def _process(bundle):
291+
def process(bundle):
295292
'''Generate a "checksum manifest", a "transfer manifest", and a PDS label from the given
296293
``bundle``, which is an open file stream (with a ``name`` attribute) on the local
297-
filesystem.
294+
filesystem. Return the name of the generated checksum manifest file.
298295
'''
299296
_logger.info('🏃‍♀️ Starting AIP generation for %s', bundle.name)
300297
d = os.path.dirname(os.path.abspath(bundle.name))
@@ -304,7 +301,7 @@ def _process(bundle):
304301
strippedLogicalID = bundleLID.split(':')[-1]
305302

306303
# Easy one: the checksum† manifest
307-
# †It's actually an MD5 hash, not a checksum 😅
304+
# †It's actually an MD5 *hash*, not a checksum 😅
308305
chksumFN = strippedLogicalID + '_checksum_manifest_v' + bundleVID + '.tab'
309306
chksumMD5, chksumSize, chksumNum = _writeChecksumManifest(chksumFN, d)
310307

@@ -328,27 +325,25 @@ def _process(bundle):
328325
xferSize,
329326
xferNum
330327
)
331-
_logger.info('🎉 Success! All done, files generated:')
328+
_logger.info('🎉 Success! AIP done, files generated:')
332329
_logger.info('• Checksum manifest: %s', chksumFN)
333330
_logger.info('• Transfer manifest: %s', xferFN)
334-
_logger.info('• XML label: %s', labelFN)
331+
_logger.info('• XML label for them both: %s', labelFN)
332+
return chksumFN
335333

336334

337335
def main():
338-
'''Check the command-line for options and create a SIP from the given bundle XML'''
336+
'''Check the command-line for options and create an AIP from the given bundle XML'''
339337
parser = argparse.ArgumentParser(description=_description)
338+
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
339+
addLoggingArguments(parser)
340340
parser.add_argument(
341341
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Root bundle XML file to read'
342342
)
343-
parser.add_argument(
344-
'-v', '--verbose', default=False, action='store_true',
345-
help='Verbose logging; defaults %(default)s'
346-
)
347343
args = parser.parse_args()
348-
if args.verbose:
349-
_logger.setLevel(logging.DEBUG)
344+
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
350345
_logger.debug('⚙️ command line args = %r', args)
351-
_process(args.bundle)
346+
process(args.bundle)
352347
_logger.info('👋 Thanks for using this program! Bye!')
353348
sys.exit(0)
354349

src/pds/aipgen/constants.py

+10
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,13 @@
5252

5353
# Filename extension to use with PDS labels
5454
PDS_LABEL_FILENAME_EXTENSION = '.xml'
55+
56+
# Command-line names for hash algorithms mapped to Python *implementation*
57+
# names, which are standardized (as lower case, no less) in ``hashlib``.
58+
# There are a lot more possible message digest algorithms, but we choose
59+
# to support just three.
60+
HASH_ALGORITHMS = {
61+
'MD5': 'md5',
62+
'SHA-1': 'sha1',
63+
'SHA-256': 'sha256',
64+
}

src/pds/aipgen/main.py

+97
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# encoding: utf-8
2+
#
3+
# Copyright © 2020 California Institute of Technology ("Caltech").
4+
# ALL RIGHTS RESERVED. U.S. Government sponsorship acknowledged.
5+
#
6+
# Redistribution and use in source and binary forms, with or without
7+
# modification, are permitted provided that the following conditions are met:
8+
#
9+
# • Redistributions of source code must retain the above copyright notice,
10+
# this list of conditions and the following disclaimer.
11+
# • Redistributions must reproduce the above copyright notice, this list of
12+
# conditions and the following disclaimer in the documentation and/or other
13+
# materials provided with the distribution.
14+
# • Neither the name of Caltech nor its operating division, the Jet Propulsion
15+
# Laboratory, nor the names of its contributors may be used to endorse or
16+
# promote products derived from this software without specific prior written
17+
# permission.
18+
#
19+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29+
# POSSIBILITY OF SUCH DAMAGE.
30+
31+
32+
u'''AIP and SIP generation'''
33+
34+
from .aip import process as aipProcess
35+
from .constants import HASH_ALGORITHMS
36+
from .sip import addSIParguments
37+
from .sip import produce as sipProcess
38+
from .utils import addLoggingArguments
39+
import argparse, sys, logging
40+
41+
42+
# Constants
43+
# ---------
44+
45+
# For ``--help``:
46+
_version = '0.0.0'
47+
_description = '''
48+
Generate an Archive Information Package (AIP) and a Submission Information
49+
Package (SIP). This creates three files for the AIP in the current directory
50+
(overwriting them if they already exist): ➀ a "checksum manifest" which
51+
contains MD5 hashes of *all* files in a product; ➁ a "transfer manifest" which
52+
lists the "lidvids" for files within each XML label mentioned in a product;
53+
and ➂ an XML label for these two files. It also creates two files for the SIP
54+
(also overwriting them if they exist): ➀ A "SIP manifest" file; and an XML
55+
label of that file too. The names of the generated files are based on the
56+
logical identifier found in the bundle file, and any existing files are
57+
overwritten. The names of the generated files are printed upon successful
58+
completion.
59+
'''
60+
61+
# Logging:
62+
_logger = logging.getLogger(__name__)
63+
64+
65+
# Functions
66+
# ---------
67+
68+
def main():
69+
'''Make an AIP and a SIP'''
70+
parser = argparse.ArgumentParser(description=_description)
71+
parser.add_argument('--version', action='version', version=f'%(prog)s {_version}')
72+
addSIParguments(parser)
73+
addLoggingArguments(parser)
74+
parser.add_argument(
75+
'bundle', type=argparse.FileType('rb'), metavar='IN-BUNDLE.XML', help='Bundle XML file to read'
76+
)
77+
args = parser.parse_args()
78+
logging.basicConfig(level=args.loglevel, format='%(levelname)s %(message)s')
79+
_logger.debug('⚙️ command line args = %r', args)
80+
chksumFN = aipProcess(args.bundle)
81+
with open(chksumFN, 'rb') as chksumStream:
82+
sipProcess(
83+
args.bundle,
84+
HASH_ALGORITHMS[args.algorithm],
85+
args.url,
86+
args.insecure,
87+
args.site,
88+
args.offline,
89+
args.bundle_base_url,
90+
chksumStream
91+
)
92+
_logger.info("👋 That's it! Thanks for making an AIP and SIP with us today. Bye!")
93+
sys.exit(0)
94+
95+
96+
if __name__ == '__main__':
97+
main()

0 commit comments

Comments
 (0)