diff --git a/perma_web/api/tests/test_link_authorization.py b/perma_web/api/tests/test_link_authorization.py
index e162122d1..ff29a36a1 100644
--- a/perma_web/api/tests/test_link_authorization.py
+++ b/perma_web/api/tests/test_link_authorization.py
@@ -147,14 +147,10 @@ def test_should_allow_user_to_patch_with_file(self):
         # capture were properly associated with actual web archive files, which is always
         # the case outside of tests
         self.link.archive_timestamp = timezone.now() + timedelta(1)
-        self.link.warc_size = 1
         self.link.wacz_size = 1
         self.link.save()
 
-        # This link has a warc and a wacz
         self.link.refresh_from_db()
-        self.assertTrue(self.link.warc_size)
-        self.assertTrue(self.link.wacz_size)
 
         old_primary_capture = self.link.primary_capture
 
@@ -167,11 +163,12 @@ def test_should_allow_user_to_patch_with_file(self):
                           data={'file':file_content})
 
         self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='primary').exclude(pk=old_primary_capture.pk).exists())
+        self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='provenance_summary').exists())
 
-        # This link now only has a warc, but not a wacz
         self.link.refresh_from_db()
-        self.assertTrue(self.link.warc_size)
-        self.assertFalse(self.link.wacz_size)
+
+        self.assertTrue(self.link.wacz_size)
+        self.assertNotEqual(self.link.wacz_size, 1)
 
     def test_should_reject_patch_with_file_for_out_of_window_link(self):
diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py
index e08313ec0..6c2c4e386 100644
--- a/perma_web/api/tests/test_link_resource.py
+++ b/perma_web/api/tests/test_link_resource.py
@@ -94,7 +94,7 @@ def setUp(self):
             'private_reason',
         ]
 
-    def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False, filetype='wacz'):
+    def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False):
 
         def find_recording_in_warc(index, capture_url, content_type):
             warc_content_type = "application/http; msgtype=response"
@@ -129,8 +129,7 @@ def find_attachment_in_warc(index, capture_url):
         self.assertTrue(link.primary_capture.content_type, "Capture is missing a content type.")
 
         # create an index of the warc
-        extract = filetype == 'wacz'
-        with link.get_warc(extract) as warc_file:
+        with link.get_warc() as warc_file:
             index = index_warc_file(warc_file)
 
         # see if the index reports the content is in the warc
@@ -655,7 +654,7 @@ def test_should_create_archive_from_pdf_file(self):
                                      user=self.org_user)
 
         link = Link.objects.get(guid=obj['guid'])
-        self.assertRecordsInArchive(link, upload=True, filetype='warc')
+        self.assertRecordsInArchive(link, upload=True)
         self.assertEqual(link.primary_capture.user_upload, True)
 
     def test_should_create_archive_from_jpg_file(self):
@@ -666,7 +665,7 @@
                                      user=self.org_user)
 
         link = Link.objects.get(guid=obj['guid'])
-        self.assertRecordsInArchive(link, upload=True, filetype='warc')
+        self.assertRecordsInArchive(link, upload=True)
         self.assertEqual(link.primary_capture.user_upload, True)
 
     def test_should_reject_jpg_file_with_invalid_url(self):
@@ -687,7 +686,7 @@ def test_should_should_create_archive_from_jpg_file_with_nonloading_url(self):
 
         link = Link.objects.get(guid=obj['guid'])
         self.assertEqual(link.submitted_url, 'http://asdf.asdf')
-        self.assertRecordsInArchive(link, upload=True, filetype='warc')
+        self.assertRecordsInArchive(link, upload=True)
         self.assertEqual(link.primary_capture.user_upload, True)
 
     def test_should_reject_invalid_file(self):
diff --git a/perma_web/api/views.py b/perma_web/api/views.py
index 38c602bf1..ef25f701d 100644
--- a/perma_web/api/views.py
+++ b/perma_web/api/views.py
@@ -620,7 +620,7 @@ def patch(self, request, guid, format=None):
             link.mark_capturejob_superseded()
 
             # write new warc and capture
-            link.write_uploaded_file(uploaded_file, cache_break=True)
+            link.write_uploaded_file(uploaded_file)
 
         # update internet archive if privacy changes
         if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_permanent():
diff --git a/perma_web/perma/models.py b/perma_web/perma/models.py
index 29e041b43..651a1bf21 100755
--- a/perma_web/perma/models.py
+++ b/perma_web/perma/models.py
@@ -49,13 +49,12 @@
     first_day_of_next_month,
     pp_date_from_post,
     prep_for_perma_payments,
-    preserve_perma_warc,
+    preserve_perma_wacz,
     process_perma_payments_transmission,
     protocol,
     remove_control_characters,
     today_next_year,
     tz_datetime,
-    write_resource_record_from_asset,
 )
 
 logger = logging.getLogger(__name__)
@@ -1974,9 +1973,9 @@ def get_pages_jsonl(self):
         )
         return "\n".join([json.dumps(row) for row in jsonl_rows])
 
-    def write_uploaded_file(self, uploaded_file, cache_break=False):
+    def write_uploaded_file(self, uploaded_file):
         """
-        Given a file uploaded by a user, create a Capture record and warc.
+        Given a file uploaded by a user, create a Capture record and WACZ.
         """
         from api.utils import get_mime_type, mime_type_lookup # local import to avoid circular import
 
@@ -1985,27 +1984,44 @@ def write_uploaded_file(self, uploaded_file):
         file_name = f'upload.{mime_type_lookup[mime_type]["new_extension"]}'
         warc_url = f"file:///{self.guid}/{file_name}"
 
-        # append a random number to warc_url if we're replacing a file, to avoid browser cache
-        if cache_break:
-            r = random.SystemRandom()
-            warc_url += f"?version={str(r.random()).replace('.', '')}"
-
-        capture = Capture(link=self,
-                          role='primary',
-                          status='success',
-                          record_type='resource',
-                          user_upload='True',
-                          content_type=mime_type,
-                          url=warc_url)
-        warc_size = [] # pass a mutable container to the context manager, so that it can populate it with the size of the finished warc
-        with preserve_perma_warc(self.guid, self.creation_timestamp, self.warc_storage_file(), warc_size) as warc:
-            uploaded_file.file.seek(0)
-            write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, warc)
+        upload_capture = Capture(
+            link=self,
+            role='primary',
+            status='success',
+            record_type='resource',
+            user_upload=True,
+            content_type=mime_type,
+            url=warc_url
+        )
+
+        provenance_capture = Capture(
+            link=self,
+            role='provenance_summary',
+            status='success',
+            record_type='resource',
+            user_upload=False,
+            content_type='text/html',
+            url='file:///provenance-summary.html'
+        )
+
+        # make the WACZ
+        self.wacz_size = preserve_perma_wacz(
+            uploaded_file,
+            warc_url,
+            mime_type,
+            self.guid,
+            self.submitted_url,
+            self.submitted_title,
+            self.creation_timestamp,
+            self.wacz_storage_file()
+        )
+        self.warc_size = 0  # no WARC is stored for uploads anymore, so zero out any stale size
+
         self.captured_by_software = 'upload'
         self.captured_by_browser = None
-        self.warc_size = warc_size[0]
-        self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size'])
-        capture.save()
+        self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size', 'wacz_size'])
+        upload_capture.save()
+        provenance_capture.save()
 
     def safe_delete_warc(self):
         old_name = self.warc_storage_file()
diff --git a/perma_web/perma/settings/deployments/settings_common.py b/perma_web/perma/settings/deployments/settings_common.py
index f53054022..6e2aff991 100644
--- a/perma_web/perma/settings/deployments/settings_common.py
+++ b/perma_web/perma/settings/deployments/settings_common.py
@@ -625,6 +625,10 @@
 # Before deployment, we suppress the addition of new capture jobs when this file is present
 DEPLOYMENT_SENTINEL = '/tmp/perma-deployment-pending'
 
+# for inclusion in datapackage.json for user uploads; to be replaced with a
+# short commit hash in deployments
+PERMA_VERSION = 'dev'
+
 # Which settings should be available in all Django templates,
 # without needing to explicitly pass them via the view?
 TEMPLATE_VISIBLE_SETTINGS = (
diff --git a/perma_web/perma/templates/provenance-summary.html b/perma_web/perma/templates/provenance-summary.html
new file mode 100644
index 000000000..e3d529f92
--- /dev/null
+++ b/perma_web/perma/templates/provenance-summary.html
@@ -0,0 +1,103 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <title>Provenance summary for user upload for {{url}}</title>
+</head>
+<body>
+  <div>
+    <h1>Provenance Summary</h1>
+    <p>
+      The data present in this capture, with MIME type {{ mime_type }}, were uploaded by a Perma user at {{ now }} to replace a failed or unsatisfactory capture of {{ url }} at {{ creation_timestamp }}.
+    </p>
+  </div>
+</body>
+</html>
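
A note for reviewers: the sketch below renders the same paragraph with the context keys that preserve_perma_wacz() passes to this template (url, mime_type, now, creation_timestamp). It is self-contained rather than Perma's actual code path — it configures a bare template engine by hand and uses made-up values — whereas the real code goes through loader.get_template("provenance-summary.html") under the application settings.

import django
from django.conf import settings
from django.template import engines

# stand up a bare template engine; in Perma the application settings provide this
settings.configure(TEMPLATES=[{"BACKEND": "django.template.backends.django.DjangoTemplates"}])
django.setup()

# same context keys as preserve_perma_wacz(); these values are hypothetical
context = {
    "url": "https://example.com/page",
    "mime_type": "application/pdf",
    "now": "2024-05-01T12:00:00Z",
    "creation_timestamp": "2024-04-30T23:59:59Z",
}

# one-line stand-in for the template body above
template = engines["django"].from_string(
    "The data present in this capture, with MIME type {{ mime_type }}, "
    "were uploaded by a Perma user at {{ now }} to replace a failed or "
    "unsatisfactory capture of {{ url }} at {{ creation_timestamp }}."
)
print(template.render(context))
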
diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py
index ba31a4255..64979c83c 100644
--- a/perma_web/perma/utils.py
+++ b/perma_web/perma/utils.py
@@ -1,20 +1,24 @@
 from collections import OrderedDict
-from contextlib import contextmanager
+from contextlib import contextmanager, redirect_stdout
 import csv
 from datetime import datetime, timedelta
 from datetime import timezone as tz
 from functools import reduce, wraps
 import hashlib
+import io
 import itertools
 import json
 import logging
 import operator
 import os
+import shutil
 import string
 import tempfile
 from typing import Literal, TypeVar
+import uuid
 import unicodedata
 from wsgiref.util import FileWrapper
+import zipfile
 
 from dateutil.relativedelta import relativedelta
 from django.conf import settings
@@ -33,6 +37,7 @@
     JsonResponse,
     StreamingHttpResponse,
 )
+from django.template import loader
 from django.urls import reverse
 from django.utils import timezone
 from django.views.decorators.debug import sensitive_variables
@@ -43,6 +48,7 @@
 import surt
 import tempdir
 from ua_parser import user_agent_parser
+from warcio.indexer import Indexer
 from warcio.warcwriter import BufferWARCWriter
 
 from perma.exceptions import (
@@ -498,26 +504,215 @@ def decrypt_from_perma_payments(ciphertext, encoder=encoding.Base64Encoder):
     return box.decrypt(ciphertext, encoder=encoder)
 
 #
-# warc writing
+# wacz writing
 #
 
-@contextmanager
-def preserve_perma_warc(guid, timestamp, destination, warc_size):
+def now():
+    """Returns the current UTC time as a 6-tuple (year, month, day, hour, minute, second), the form zipfile.ZipInfo expects for date_time"""
+    return tuple(datetime.utcnow().timetuple()[:6])
+
+
+def parse_warc(warc_file, warc_url):
+    """ Gets the record length, block digest, and byte offset within the WARC for the uploaded file and the provenance summary """
+    targets = [warc_url, "file:///provenance-summary.html"]
+    response = {target: {key: None for key in ["length", "digest", "offset"]} for target in targets}
+    f = io.StringIO()
+    with redirect_stdout(f):
+        indexer = Indexer(fields=["offset", "length", "warc-target-uri", "warc-block-digest"], inputs=[warc_file], output='-')
+        indexer.process_all()
+    out = f.getvalue()
+    index = [json.loads(o) for o in out.split("\n") if o]
+    for entry in index:
+        if "warc-target-uri" in entry:
+            for target in targets:
+                if entry["warc-target-uri"] == target:
+                    response[target]["length"] = entry["length"]
+                    response[target]["offset"] = entry["offset"]
+                    response[target]["digest"] = entry["warc-block-digest"]
+    return response
+
+
+def sha256(input_file, buf_size=65536):
+    """ Returns the SHA256 hexdigest of a file, read in buf_size chunks """
+    sha256 = hashlib.sha256()
+    with open(input_file, 'rb') as f:
+        while True:
+            data = f.read(buf_size)
+            if not data:
+                break
+            sha256.update(data)
+    return sha256.hexdigest()
+
+
+def preserve_perma_wacz(uploaded_file, warc_url, mime_type, guid, url, title, creation_timestamp, wacz_destination):
     """
-    Context manager for opening a perma warc, ready to receive warc records.
-    Safely closes and saves the file to storage when context is exited.
+    Creates and stores a Perma WACZ for a user upload, returning the WACZ size.
+    This necessarily creates a WARC, but we no longer save the WARC itself.
""" - # mode set to 'ab+' as a workaround for https://bugs.python.org/issue25341 - out = tempfile.TemporaryFile('ab+') - write_perma_warc_header(out, guid, timestamp) - try: - yield out - finally: - out.flush() - warc_size.append(out.tell()) - out.seek(0) - storages[settings.WARC_STORAGE].store_file(out, destination, overwrite=True) - out.close() + timestamp = datetime.utcnow() + ts_string = timestamp.isoformat() + "Z" + # Link's creation_timestamp has "+00:00" at the end, so + creation_ts_string = creation_timestamp.isoformat().partition("+")[0] + "Z" + + with tempfile.TemporaryDirectory() as tmpdir: + # prepare WARC... + warc_file = f"{tmpdir}/data.warc.gz" + warc = open(warc_file, 'ab+') + write_perma_warc_header(warc, guid, timestamp) + + uploaded_file.file.seek(0) + write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, ts_string, warc) + + # create provenance summary and add it to the WARC + provenance = loader.get_template("provenance-summary.html") + context = { + "url": url, + "now": ts_string, + "mime_type": mime_type, + "creation_timestamp": creation_ts_string + } + write_resource_record_from_asset( + provenance.render(context).encode(), + "file:///provenance-summary.html", + "text/html", + ts_string, + warc + ) + warc.close() + + # ...set up pages.jsonl... + pages = [ + {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"}, + { + "id": f"{uuid.uuid4()}", + "url": warc_url, + "title": f"User-uploaded file replacing failed capture of {url}", + "ts": ts_string + }, + { + "id": f"{uuid.uuid4()}", + "url": "file:///provenance-summary.html", + "title": "Provenance Summary", + "ts": ts_string + } + ] + pages_bytes = "\n".join([json.dumps(page) for page in pages]).encode("utf-8") + + # ...CDXJ index... + targets = ["file:///provenance-summary.html", warc_url] + selected_warc_headers = parse_warc(warc_file, warc_url) + cdxj = { + target: json.dumps({ + "url": target, + "mime": "text/html" if target.endswith(".html") else mime_type, + "status": 200, + "digest": selected_warc_headers[target]["digest"], + "length": selected_warc_headers[target]["length"], + "offset": selected_warc_headers[target]["offset"], + "filename":"data.warc.gz" + }).replace(" ", "") for target in targets + } + ts = timestamp.strftime("%Y%m%d%H%M%S") + index = "\n".join( + [ + f"{target} {ts} {cdxj[target]}" + for target in targets + ] + ) + + # ...datapackage... + datapackage = { + "profile": "data-package", + "wacz_version": "1.1.1", + "title": title, + "description": f"User upload for {url}", + "mainPageURL": warc_url, + "created": ts_string, + "software": f"Perma.cc {settings.PERMA_VERSION}", + "resources": [ + { + "name": "pages.jsonl", + "path": "pages/pages.jsonl", + "hash": "sha256:" + hashlib.sha256(pages_bytes).hexdigest(), + "bytes": len(pages_bytes) + }, + { + "name": "index.cdx", + "path": "indexes/index.cdx", + "hash": "sha256:" + hashlib.sha256(index.encode()).hexdigest(), + "bytes": len(index) + }, + { + "name": "data.warc.gz", + "path": "archive/data.warc.gz", + "hash": "sha256:" + sha256(warc_file), + "bytes": os.stat(warc_file).st_size + } + ], + } + # ...and datapackage digest + datapackage_digest = { + "path": "datapackage.json", + "hash": hashlib.sha256(json.dumps(datapackage).encode()).hexdigest() + } + + # Now we can create the WACZ file... 
+ wacz_file = f"{tmpdir}/{guid}.wacz" + wacz = zipfile.ZipFile(wacz_file, "w") + + # add index + index_file = zipfile.ZipInfo("indexes/index.cdx", now()) + index_file.compress_type = zipfile.ZIP_DEFLATED + wacz.writestr(index_file, index.encode("utf-8")) + + # add pages.jsonl + pages_file = zipfile.ZipInfo("pages/pages.jsonl", now()) + pages_file.compress_type = zipfile.ZIP_DEFLATED + wacz.writestr(pages_file, pages_bytes) + + # add WARC file + archive_file = zipfile.ZipInfo.from_file( + warc_file, "archive/data.warc.gz" + ) + with wacz.open(archive_file, "w") as out_fh: + with open(warc_file, "rb") as in_fh: + shutil.copyfileobj(in_fh, out_fh) + + # add datapackage + datapackage_file = zipfile.ZipInfo("datapackage.json", now()) + datapackage_file.compress_type = zipfile.ZIP_DEFLATED + wacz.writestr( + datapackage_file, + json.dumps(datapackage).encode("utf-8") + ) + + # and datapackage digest + datapackage_digest_file = zipfile.ZipInfo( + "datapackage-digest.json", now() + ) + datapackage_digest_file.compress_type = zipfile.ZIP_DEFLATED + wacz.writestr( + datapackage_digest_file, + json.dumps(datapackage_digest).encode("utf-8") + ) + + # and close the file + wacz.close() + + # now store it + with open(wacz_file, "rb") as f: + storages[settings.WACZ_STORAGE].store_file(f, wacz_destination, overwrite=True) + + wacz_size = os.stat(wacz_file).st_size + + # (no need to clean up, because the context manager will do it) + + # ...and return the size + return wacz_size + +# +# warc writing +# def write_perma_warc_header(out_file, guid, timestamp): # build warcinfo header @@ -528,7 +723,7 @@ def write_perma_warc_header(out_file, guid, timestamp): ] warcinfo_fields = [ b'operator: Perma.cc', - b'format: WARC File Format 1.0', + b'format: WARC file version 1.0', bytes(f'Perma-GUID: {guid}', 'utf-8') ] data = b'\r\n'.join(warcinfo_fields) + b'\r\n' @@ -553,7 +748,7 @@ def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pag writer = BufferWARCWriter(gzip=True) params = OrderedDict([('operator', 'Perma.cc download'), ('Perma-GUID', guid), - ('format', 'WARC File Format 1.0'), + ('format', 'WARC file version 1.0'), ('json-metadata', json.dumps(coll_metadata))]) record = writer.create_warcinfo_record(filename, params) @@ -568,18 +763,17 @@ def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pag return writer.get_contents() -def write_resource_record_from_asset(data, url, content_type, out_file, extra_headers=None): +def write_resource_record_from_asset(data, url, content_type, warc_date, out_file, extra_headers=None): """ Constructs a single WARC resource record from an asset (screenshot, uploaded file, etc.) and writes to out_file. 
""" - warc_date = warctools.warc.warc_datetime_str(timezone.now()).replace(b'+00:00Z', b'Z') headers = [ (warctools.WarcRecord.TYPE, warctools.WarcRecord.RESOURCE), (warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()), - (warctools.WarcRecord.DATE, warc_date), + (warctools.WarcRecord.DATE, bytes(warc_date, 'utf-8')), (warctools.WarcRecord.URL, bytes(url, 'utf-8')), - (warctools.WarcRecord.BLOCK_DIGEST, bytes(f'sha1:{hashlib.sha1(data).hexdigest()}', 'utf-8')) + (warctools.WarcRecord.BLOCK_DIGEST, bytes(f'sha256:{hashlib.sha256(data).hexdigest()}', 'utf-8')) ] if extra_headers: headers.extend(extra_headers) diff --git a/perma_web/requirements.txt b/perma_web/requirements.txt index bb2840f2f..77058783b 100644 --- a/perma_web/requirements.txt +++ b/perma_web/requirements.txt @@ -419,9 +419,9 @@ hypothesis==6.98.17 \ --hash=sha256:313f64b9f9f95e12c8b5342466bef7f352d2608afeeb434817c039602b45f0c4 \ --hash=sha256:bbd227000cc21a9686a00867f031479c3812d8ab076e4af1c813f6b3a50c98f5 # via -r requirements.in -idna==3.7 \ - --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \ - --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0 +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 # via # requests # tldextract