Skip to content

Commit

Permalink
Merge pull request #3451 from rebeccacremona/ia-upload-query
Browse files Browse the repository at this point in the history
Tweak strategy for finding links to upload to IA
  • Loading branch information
rebeccacremona authored Dec 11, 2023
2 parents 9f4beaa + 2bf0ceb commit de30b45
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 52 deletions.
18 changes: 18 additions & 0 deletions perma_web/perma/migrations/0035_link_ia_eligible_for_date_idx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 4.2.8 on 2023-12-11 20:13

from django.db import migrations, models
import django.db.models.functions.datetime


class Migration(migrations.Migration):
    """Add the `ia_eligible_for_date_idx` expression index to the `link` model."""

    # Must run after the previous perma migration.
    dependencies = [
        ('perma', '0034_remove_historicallink_thumbnail_status_and_more'),
    ]

    operations = [
        migrations.AddIndex(
            model_name='link',
            # Composite index over four flag columns plus the creation
            # timestamp truncated to a date. Judging by its name it is meant
            # to serve the per-date IA-eligibility lookup -- confirm against
            # the query plan.
            index=models.Index(
                models.F('user_deleted'),
                django.db.models.functions.datetime.TruncDate('creation_timestamp'),
                models.F('is_private'),
                models.F('is_unlisted'),
                models.F('cached_can_play_back'),
                name='ia_eligible_for_date_idx',
            ),
        ),
    ]
72 changes: 20 additions & 52 deletions perma_web/perma/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from django.core.files.storage import default_storage
from django.db import models, transaction
from django.db.models import Q, Max, Count, Sum, JSONField
from django.db.models.functions import Now, Upper
from django.db.models.functions import Now, Upper, TruncDate
from django.db.models.query import QuerySet
from django.contrib.postgres.indexes import GistIndex, GinIndex, OpClass
from django.template.defaultfilters import truncatechars
Expand Down Expand Up @@ -1442,62 +1442,29 @@ def visible_to_ia(self):
def ineligible_for_ia(self):
    """
    Return this queryset minus every row that both satisfies
    `Link.DISCOVERABLE_FILTER` and has `cached_can_play_back=True`.
    """
    return self.exclude(
        Link.DISCOVERABLE_FILTER,
        cached_can_play_back=True,
    )

def ia_upload_pending(self, date_string, limit=100):
    """
    Return Links created on `date_string` that should be in the Internet
    Archive but have not yet been uploaded to a "daily" item.

    Args:
        date_string: the creation date to inspect. It is compared as text
            against "2022-10-03" and handed to the `creation_timestamp__date`
            lookup, so it is presumably a zero-padded ISO "YYYY-MM-DD"
            string -- confirm with callers.
        limit: upper bound on the number of rows, applied by slicing.

    Returns:
        A sliced Link queryset.

    NOTE(review): both branches start from `Link.objects`, not from `self`,
    so filters chained onto the queryset before this call are not applied.
    """
    # Get all Links we think should have been uploaded to IA,
    # and then filter out the ones that have already been uploaded
    # to a "daily" item.
    if date_string > "2022-10-03":
        # No links created after 2022-10-03 were uploaded to IA as individual Items:
        # use a simplified query
        logger.debug("Running simple IA eligibility query.")
        query = Link.objects.filter(
            creation_timestamp__date=date_string
        ).visible_to_ia().filter(
            internet_archive_files=None
        )
    else:
        logger.debug("Running full IA eligibility query.")
        query = Link.objects.filter(
            creation_timestamp__date=date_string
        ).visible_to_ia().exclude(
            internet_archive_items__span__isempty=False
        )
    query = query[:limit]
    return query


# Manager class derived from DeletableManager via `from_queryset`, so that the
# methods defined on LinkQuerySet are also available on the manager.
LinkManager = DeletableManager.from_queryset(LinkQuerySet)
Expand Down Expand Up @@ -1563,6 +1530,7 @@ class Link(DeletableModel):
class Meta:
indexes = [
models.Index(fields=['user_deleted', 'is_private', 'is_unlisted', 'cached_can_play_back', 'internet_archive_upload_status']),
models.Index('user_deleted', TruncDate('creation_timestamp'), 'is_private', 'is_unlisted', 'cached_can_play_back', name="ia_eligible_for_date_idx"),
models.Index(fields=['creation_timestamp']),
models.Index(fields=['submitted_url_surt']),
GinIndex(OpClass(Upper('guid'), name='gin_trgm_ops'), name='guid_case_insensitive_idx'),
Expand Down

0 comments on commit de30b45

Please sign in to comment.