diff --git a/perma_web/api/tests/test_link_authorization.py b/perma_web/api/tests/test_link_authorization.py
index e162122d1..ff29a36a1 100644
--- a/perma_web/api/tests/test_link_authorization.py
+++ b/perma_web/api/tests/test_link_authorization.py
@@ -147,14 +147,10 @@ def test_should_allow_user_to_patch_with_file(self):
# capture were properly associated with actual web archive files, which is always
# the case outside of tests
self.link.archive_timestamp = timezone.now() + timedelta(1)
- self.link.warc_size = 1
self.link.wacz_size = 1
self.link.save()
- # This link has a warc and a wacz
self.link.refresh_from_db()
- self.assertTrue(self.link.warc_size)
- self.assertTrue(self.link.wacz_size)
old_primary_capture = self.link.primary_capture
@@ -167,11 +163,12 @@ def test_should_allow_user_to_patch_with_file(self):
data={'file':file_content})
self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='primary').exclude(pk=old_primary_capture.pk).exists())
+ self.assertTrue(Capture.objects.filter(link_id=self.link.pk, role='provenance_summary').exists())
- # This link now only has a warc, but not a wacz
self.link.refresh_from_db()
- self.assertTrue(self.link.warc_size)
- self.assertFalse(self.link.wacz_size)
+
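+        # the placeholder wacz_size of 1 set during test setup should have been
+        # replaced with the real size of the newly written WACZ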
+ self.assertTrue(self.link.wacz_size)
+        self.assertNotEqual(self.link.wacz_size, 1)
def test_should_reject_patch_with_file_for_out_of_window_link(self):
diff --git a/perma_web/api/tests/test_link_resource.py b/perma_web/api/tests/test_link_resource.py
index e08313ec0..6c2c4e386 100644
--- a/perma_web/api/tests/test_link_resource.py
+++ b/perma_web/api/tests/test_link_resource.py
@@ -94,7 +94,7 @@ def setUp(self):
'private_reason',
]
- def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False, filetype='wacz'):
+ def assertRecordsInArchive(self, link, upload=False, expected_records=None, check_screenshot=False, check_provenance_summary=False):
def find_recording_in_warc(index, capture_url, content_type):
warc_content_type = "application/http; msgtype=response"
@@ -129,8 +129,7 @@ def find_attachment_in_warc(index, capture_url):
self.assertTrue(link.primary_capture.content_type, "Capture is missing a content type.")
# create an index of the warc
- extract = filetype == 'wacz'
- with link.get_warc(extract) as warc_file:
+ with link.get_warc() as warc_file:
index = index_warc_file(warc_file)
# see if the index reports the content is in the warc
@@ -655,7 +654,7 @@ def test_should_create_archive_from_pdf_file(self):
user=self.org_user)
link = Link.objects.get(guid=obj['guid'])
- self.assertRecordsInArchive(link, upload=True, filetype='warc')
+ self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)
def test_should_create_archive_from_jpg_file(self):
@@ -666,7 +665,7 @@ def test_should_create_archive_from_jpg_file(self):
user=self.org_user)
link = Link.objects.get(guid=obj['guid'])
- self.assertRecordsInArchive(link, upload=True, filetype='warc')
+ self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)
def test_should_reject_jpg_file_with_invalid_url(self):
@@ -687,7 +686,7 @@ def test_should_should_create_archive_from_jpg_file_with_nonloading_url(self):
link = Link.objects.get(guid=obj['guid'])
self.assertEqual(link.submitted_url, 'http://asdf.asdf')
- self.assertRecordsInArchive(link, upload=True, filetype='warc')
+ self.assertRecordsInArchive(link, upload=True)
self.assertEqual(link.primary_capture.user_upload, True)
def test_should_reject_invalid_file(self):
diff --git a/perma_web/api/views.py b/perma_web/api/views.py
index 38c602bf1..ef25f701d 100644
--- a/perma_web/api/views.py
+++ b/perma_web/api/views.py
@@ -620,7 +620,7 @@ def patch(self, request, guid, format=None):
link.mark_capturejob_superseded()
# write new warc and capture
- link.write_uploaded_file(uploaded_file, cache_break=True)
+ link.write_uploaded_file(uploaded_file)
# update internet archive if privacy changes
if 'is_private' in data and was_private != bool(data.get("is_private")) and link.is_permanent():
diff --git a/perma_web/perma/models.py b/perma_web/perma/models.py
index 29e041b43..651a1bf21 100755
--- a/perma_web/perma/models.py
+++ b/perma_web/perma/models.py
@@ -49,13 +49,12 @@
first_day_of_next_month,
pp_date_from_post,
prep_for_perma_payments,
- preserve_perma_warc,
+ preserve_perma_wacz,
process_perma_payments_transmission,
protocol,
remove_control_characters,
today_next_year,
tz_datetime,
- write_resource_record_from_asset,
)
logger = logging.getLogger(__name__)
@@ -1974,9 +1973,9 @@ def get_pages_jsonl(self):
)
return "\n".join([json.dumps(row) for row in jsonl_rows])
- def write_uploaded_file(self, uploaded_file, cache_break=False):
+ def write_uploaded_file(self, uploaded_file):
"""
- Given a file uploaded by a user, create a Capture record and warc.
+ Given a file uploaded by a user, create a Capture record and WACZ.
"""
from api.utils import get_mime_type, mime_type_lookup # local import to avoid circular import
@@ -1985,27 +1984,44 @@ def write_uploaded_file(self, uploaded_file, cache_break=False):
file_name = f'upload.{mime_type_lookup[mime_type]["new_extension"]}'
warc_url = f"file:///{self.guid}/{file_name}"
- # append a random number to warc_url if we're replacing a file, to avoid browser cache
- if cache_break:
- r = random.SystemRandom()
- warc_url += f"?version={str(r.random()).replace('.', '')}"
-
- capture = Capture(link=self,
- role='primary',
- status='success',
- record_type='resource',
- user_upload='True',
- content_type=mime_type,
- url=warc_url)
- warc_size = [] # pass a mutable container to the context manager, so that it can populate it with the size of the finished warc
- with preserve_perma_warc(self.guid, self.creation_timestamp, self.warc_storage_file(), warc_size) as warc:
- uploaded_file.file.seek(0)
- write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, warc)
+ upload_capture = Capture(
+ link=self,
+ role='primary',
+ status='success',
+ record_type='resource',
+ user_upload=True,
+ content_type=mime_type,
+ url=warc_url
+ )
+
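+    # a second capture records the generated provenance summary page, which is
+    # packaged into the WACZ alongside the uploaded file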
+ provenance_capture = Capture(
+ link=self,
+ role='provenance_summary',
+ status='success',
+ record_type='resource',
+ user_upload=False,
+ content_type='text/html',
+ url='file:///provenance-summary.html'
+ )
+
+ # make the WACZ
+ self.wacz_size = preserve_perma_wacz(
+ uploaded_file,
+ warc_url,
+ mime_type,
+ self.guid,
+ self.submitted_url,
+ self.submitted_title,
+ self.creation_timestamp,
+ self.wacz_storage_file()
+ )
+    self.warc_size = 0  # the upload flow no longer saves a standalone WARC
+
self.captured_by_software = 'upload'
self.captured_by_browser = None
- self.warc_size = warc_size[0]
- self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size'])
- capture.save()
+ self.save(update_fields=['captured_by_software', 'captured_by_browser', 'warc_size', 'wacz_size'])
+ upload_capture.save()
+ provenance_capture.save()
def safe_delete_warc(self):
old_name = self.warc_storage_file()
diff --git a/perma_web/perma/settings/deployments/settings_common.py b/perma_web/perma/settings/deployments/settings_common.py
index f53054022..6e2aff991 100644
--- a/perma_web/perma/settings/deployments/settings_common.py
+++ b/perma_web/perma/settings/deployments/settings_common.py
@@ -625,6 +625,10 @@
# Before deployment, we suppress the addition of new capture jobs when this file is present
DEPLOYMENT_SENTINEL = '/tmp/perma-deployment-pending'
+# for inclusion in datapackage.json for user uploads; to be replaced with a
+# short commit hash in deployments
+PERMA_VERSION = 'dev'
+
# Which settings should be available in all Django templates,
# without needing to explicitly pass them via the view?
TEMPLATE_VISIBLE_SETTINGS = (
diff --git a/perma_web/perma/templates/provenance-summary.html b/perma_web/perma/templates/provenance-summary.html
new file mode 100644
index 000000000..e3d529f92
--- /dev/null
+++ b/perma_web/perma/templates/provenance-summary.html
@@ -0,0 +1,103 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <title>Provenance summary for user upload for {{ url }}</title>
+</head>
+<body>
+    <main>
+        <h1>Provenance Summary</h1>
+        <p>The data present in this capture, with MIME type <code>{{ mime_type }}</code>, were uploaded by a Perma user at {{ now }} to replace a failed or unsatisfactory capture of {{ url }} at {{ creation_timestamp }}.</p>
+    </main>
+</body>
+</html>
diff --git a/perma_web/perma/utils.py b/perma_web/perma/utils.py
index ba31a4255..64979c83c 100644
--- a/perma_web/perma/utils.py
+++ b/perma_web/perma/utils.py
@@ -1,20 +1,24 @@
from collections import OrderedDict
-from contextlib import contextmanager
+from contextlib import contextmanager, redirect_stdout
import csv
from datetime import datetime, timedelta
from datetime import timezone as tz
from functools import reduce, wraps
import hashlib
+import io
import itertools
import json
import logging
import operator
import os
+import shutil
import string
import tempfile
from typing import Literal, TypeVar
+import uuid
import unicodedata
from wsgiref.util import FileWrapper
+import zipfile
from dateutil.relativedelta import relativedelta
from django.conf import settings
@@ -33,6 +37,7 @@
JsonResponse,
StreamingHttpResponse,
)
+from django.template import loader
from django.urls import reverse
from django.utils import timezone
from django.views.decorators.debug import sensitive_variables
@@ -43,6 +48,7 @@
import surt
import tempdir
from ua_parser import user_agent_parser
+from warcio.indexer import Indexer
from warcio.warcwriter import BufferWARCWriter
from perma.exceptions import (
@@ -498,26 +504,215 @@ def decrypt_from_perma_payments(ciphertext, encoder=encoding.Base64Encoder):
return box.decrypt(ciphertext, encoder=encoder)
#
-# warc writing
+# wacz writing
#
-@contextmanager
-def preserve_perma_warc(guid, timestamp, destination, warc_size):
+def now():
+    """Returns the current UTC time as a (year, month, day, hour, minute, second) tuple, as expected by zipfile.ZipInfo's date_time argument"""
+    return tuple(datetime.utcnow().timetuple()[:6])
+
+
+def parse_warc(warc_file, warc_url):
+ """ Gets length, digest, and offset for uploaded file as well as provenance file """
+ targets = [warc_url, "file:///provenance-summary.html"]
+ response = {target: {key: None for key in ["length", "digest", "offset"]} for target in targets}
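+    # warcio's Indexer writes one JSON line per record to its output stream;
+    # with output='-' that stream is stdout, so capture it in an in-memory buffer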
+ f = io.StringIO()
+ with redirect_stdout(f):
+ indexer = Indexer(fields=["offset", "length", "warc-target-uri", "warc-block-digest"], inputs=[warc_file], output='-')
+ indexer.process_all()
+ out = f.getvalue()
+ index = [json.loads(o) for o in out.split("\n") if o]
+ for entry in index:
+ if "warc-target-uri" in entry:
+ for target in targets:
+ if entry["warc-target-uri"] == target:
+ response[target]["length"] = entry["length"]
+ response[target]["offset"] = entry["offset"]
+ response[target]["digest"] = entry["warc-block-digest"]
+ return response
+
+
+def sha256(input_file, buf_size=65536):
+ """ Returns the SHA256 hexdigest of a file """
+ sha256 = hashlib.sha256()
+ with open(input_file, 'rb') as f:
+ while True:
+ data = f.read(buf_size)
+ if not data:
+ break
+ sha256.update(data)
+ return sha256.hexdigest()
+
+
+def preserve_perma_wacz(uploaded_file, warc_url, mime_type, guid, url, title, creation_timestamp, wacz_destination):
"""
- Context manager for opening a perma warc, ready to receive warc records.
- Safely closes and saves the file to storage when context is exited.
+ Creates and writes a perma WACZ for a user upload, returning the WACZ size.
+ This necessarily creates a WARC, but we no longer save it.
"""
- # mode set to 'ab+' as a workaround for https://bugs.python.org/issue25341
- out = tempfile.TemporaryFile('ab+')
- write_perma_warc_header(out, guid, timestamp)
- try:
- yield out
- finally:
- out.flush()
- warc_size.append(out.tell())
- out.seek(0)
- storages[settings.WARC_STORAGE].store_file(out, destination, overwrite=True)
- out.close()
+ timestamp = datetime.utcnow()
+ ts_string = timestamp.isoformat() + "Z"
+    # Link's creation_timestamp is timezone-aware and renders with a trailing
+    # "+00:00", so strip the offset and append "Z" instead
+    creation_ts_string = creation_timestamp.isoformat().partition("+")[0] + "Z"
+
+ with tempfile.TemporaryDirectory() as tmpdir:
+ # prepare WARC...
+ warc_file = f"{tmpdir}/data.warc.gz"
+ warc = open(warc_file, 'ab+')
+ write_perma_warc_header(warc, guid, timestamp)
+
+ uploaded_file.file.seek(0)
+ write_resource_record_from_asset(uploaded_file.file.read(), warc_url, mime_type, ts_string, warc)
+
+ # create provenance summary and add it to the WARC
+ provenance = loader.get_template("provenance-summary.html")
+ context = {
+ "url": url,
+ "now": ts_string,
+ "mime_type": mime_type,
+ "creation_timestamp": creation_ts_string
+ }
+ write_resource_record_from_asset(
+ provenance.render(context).encode(),
+ "file:///provenance-summary.html",
+ "text/html",
+ ts_string,
+ warc
+ )
+ warc.close()
+
+ # ...set up pages.jsonl...
+ pages = [
+ {"format": "json-pages-1.0", "id": "pages", "title": "All Pages"},
+ {
+ "id": f"{uuid.uuid4()}",
+ "url": warc_url,
+ "title": f"User-uploaded file replacing failed capture of {url}",
+ "ts": ts_string
+ },
+ {
+ "id": f"{uuid.uuid4()}",
+ "url": "file:///provenance-summary.html",
+ "title": "Provenance Summary",
+ "ts": ts_string
+ }
+ ]
+ pages_bytes = "\n".join([json.dumps(page) for page in pages]).encode("utf-8")
+
+ # ...CDXJ index...
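+    # each CDXJ line is "<url> <14-digit timestamp> <compact json>", telling
+    # replay tools where the record lives inside data.warc.gz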
+ targets = ["file:///provenance-summary.html", warc_url]
+ selected_warc_headers = parse_warc(warc_file, warc_url)
+ cdxj = {
+ target: json.dumps({
+ "url": target,
+ "mime": "text/html" if target.endswith(".html") else mime_type,
+ "status": 200,
+ "digest": selected_warc_headers[target]["digest"],
+ "length": selected_warc_headers[target]["length"],
+ "offset": selected_warc_headers[target]["offset"],
+ "filename":"data.warc.gz"
+ }).replace(" ", "") for target in targets
+ }
+ ts = timestamp.strftime("%Y%m%d%H%M%S")
+ index = "\n".join(
+ [
+ f"{target} {ts} {cdxj[target]}"
+ for target in targets
+ ]
+ )
+
+ # ...datapackage...
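+    # datapackage.json is the WACZ manifest (a frictionless data package):
+    # it lists every file in the zip with its sha256 hash and size in bytes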
+ datapackage = {
+ "profile": "data-package",
+ "wacz_version": "1.1.1",
+ "title": title,
+ "description": f"User upload for {url}",
+ "mainPageURL": warc_url,
+ "created": ts_string,
+ "software": f"Perma.cc {settings.PERMA_VERSION}",
+ "resources": [
+ {
+ "name": "pages.jsonl",
+ "path": "pages/pages.jsonl",
+ "hash": "sha256:" + hashlib.sha256(pages_bytes).hexdigest(),
+ "bytes": len(pages_bytes)
+ },
+ {
+ "name": "index.cdx",
+ "path": "indexes/index.cdx",
+ "hash": "sha256:" + hashlib.sha256(index.encode()).hexdigest(),
+ "bytes": len(index)
+ },
+ {
+ "name": "data.warc.gz",
+ "path": "archive/data.warc.gz",
+ "hash": "sha256:" + sha256(warc_file),
+ "bytes": os.stat(warc_file).st_size
+ }
+ ],
+ }
+ # ...and datapackage digest
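+    # (so consumers can verify the manifest itself before trusting its hashes)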
+ datapackage_digest = {
+ "path": "datapackage.json",
+ "hash": hashlib.sha256(json.dumps(datapackage).encode()).hexdigest()
+ }
+
+ # Now we can create the WACZ file...
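+    # a WACZ is an ordinary zip with a fixed layout: pages/, indexes/, and
+    # archive/ directories, plus datapackage.json and its digest at the root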
+ wacz_file = f"{tmpdir}/{guid}.wacz"
+ wacz = zipfile.ZipFile(wacz_file, "w")
+
+ # add index
+ index_file = zipfile.ZipInfo("indexes/index.cdx", now())
+ index_file.compress_type = zipfile.ZIP_DEFLATED
+ wacz.writestr(index_file, index.encode("utf-8"))
+
+ # add pages.jsonl
+ pages_file = zipfile.ZipInfo("pages/pages.jsonl", now())
+ pages_file.compress_type = zipfile.ZIP_DEFLATED
+ wacz.writestr(pages_file, pages_bytes)
+
+ # add WARC file
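+    # stored rather than deflated: ZipInfo.from_file defaults to ZIP_STORED,
+    # which suits the already-gzipped WARC and permits range reads on replay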
+ archive_file = zipfile.ZipInfo.from_file(
+ warc_file, "archive/data.warc.gz"
+ )
+ with wacz.open(archive_file, "w") as out_fh:
+ with open(warc_file, "rb") as in_fh:
+ shutil.copyfileobj(in_fh, out_fh)
+
+ # add datapackage
+ datapackage_file = zipfile.ZipInfo("datapackage.json", now())
+ datapackage_file.compress_type = zipfile.ZIP_DEFLATED
+ wacz.writestr(
+ datapackage_file,
+ json.dumps(datapackage).encode("utf-8")
+ )
+
+ # and datapackage digest
+ datapackage_digest_file = zipfile.ZipInfo(
+ "datapackage-digest.json", now()
+ )
+ datapackage_digest_file.compress_type = zipfile.ZIP_DEFLATED
+ wacz.writestr(
+ datapackage_digest_file,
+ json.dumps(datapackage_digest).encode("utf-8")
+ )
+
+ # and close the file
+ wacz.close()
+
+ # now store it
+ with open(wacz_file, "rb") as f:
+ storages[settings.WACZ_STORAGE].store_file(f, wacz_destination, overwrite=True)
+
+ wacz_size = os.stat(wacz_file).st_size
+
+    # (no cleanup needed: the TemporaryDirectory context manager removes tmpdir)
+
+ # ...and return the size
+ return wacz_size
+
+#
+# warc writing
+#
def write_perma_warc_header(out_file, guid, timestamp):
# build warcinfo header
@@ -528,7 +723,7 @@ def write_perma_warc_header(out_file, guid, timestamp):
]
warcinfo_fields = [
b'operator: Perma.cc',
- b'format: WARC File Format 1.0',
+ b'format: WARC file version 1.0',
bytes(f'Perma-GUID: {guid}', 'utf-8')
]
data = b'\r\n'.join(warcinfo_fields) + b'\r\n'
@@ -553,7 +748,7 @@ def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pag
writer = BufferWARCWriter(gzip=True)
params = OrderedDict([('operator', 'Perma.cc download'),
('Perma-GUID', guid),
- ('format', 'WARC File Format 1.0'),
+ ('format', 'WARC file version 1.0'),
('json-metadata', json.dumps(coll_metadata))])
record = writer.create_warcinfo_record(filename, params)
@@ -568,18 +763,17 @@ def make_detailed_warcinfo(filename, guid, coll_title, coll_desc, rec_title, pag
return writer.get_contents()
-def write_resource_record_from_asset(data, url, content_type, out_file, extra_headers=None):
+def write_resource_record_from_asset(data, url, content_type, warc_date, out_file, extra_headers=None):
"""
Constructs a single WARC resource record from an asset (screenshot, uploaded file, etc.)
and writes to out_file.
"""
- warc_date = warctools.warc.warc_datetime_str(timezone.now()).replace(b'+00:00Z', b'Z')
headers = [
(warctools.WarcRecord.TYPE, warctools.WarcRecord.RESOURCE),
(warctools.WarcRecord.ID, warctools.WarcRecord.random_warc_uuid()),
- (warctools.WarcRecord.DATE, warc_date),
+ (warctools.WarcRecord.DATE, bytes(warc_date, 'utf-8')),
(warctools.WarcRecord.URL, bytes(url, 'utf-8')),
- (warctools.WarcRecord.BLOCK_DIGEST, bytes(f'sha1:{hashlib.sha1(data).hexdigest()}', 'utf-8'))
+ (warctools.WarcRecord.BLOCK_DIGEST, bytes(f'sha256:{hashlib.sha256(data).hexdigest()}', 'utf-8'))
]
if extra_headers:
headers.extend(extra_headers)
diff --git a/perma_web/requirements.txt b/perma_web/requirements.txt
index bb2840f2f..77058783b 100644
--- a/perma_web/requirements.txt
+++ b/perma_web/requirements.txt
@@ -419,9 +419,9 @@ hypothesis==6.98.17 \
--hash=sha256:313f64b9f9f95e12c8b5342466bef7f352d2608afeeb434817c039602b45f0c4 \
--hash=sha256:bbd227000cc21a9686a00867f031479c3812d8ab076e4af1c813f6b3a50c98f5
# via -r requirements.in
-idna==3.7 \
- --hash=sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc \
- --hash=sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0
+idna==3.10 \
+ --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \
+ --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3
# via
# requests
# tldextract