Skip to content

Commit 8813d37

Browse files
authored
Optimize memory footprint of pdf problem transcription task (#2433)
* removing unused arg * adding dep * removing old dep * try install from source * pin python * pin python * test * explicitly use right version of python * explicitly use right version of python * removing unused steps * remove comment * fixing other bug with canvas task json serialization * adding LaTeX .tex files * adding migration * fix test * pinning to pypi and removing unused ci steps * pushing fix for ci * test
1 parent 39a2f69 commit 8813d37

File tree

8 files changed

+116
-34
lines changed

8 files changed

+116
-34
lines changed

.github/workflows/ci.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,16 @@ jobs:
4747
with:
4848
python-version-file: "pyproject.toml"
4949
cache: "poetry"
50-
50+
- name: Install poetry with pip
51+
run: python -m pip install poetry
5152
- name: Validate lockfile
5253
run: poetry check --lock
53-
54+
- name: Set Poetry Python
55+
run: poetry env use python3.12
5456
- name: Install dependencies
55-
run: poetry install --no-interaction
57+
run: |
58+
source $(poetry env info --path)/bin/activate
59+
poetry install --no-interaction
5660
5761
- name: Create test local state
5862
run: ./scripts/test/stub-data.sh

learning_resources/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices):
147147
".json",
148148
".md",
149149
".pdf",
150+
".tex",
150151
".ppt",
151152
".pptx",
152153
".rtf",

learning_resources/etl/canvas.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
from pathlib import Path
1010
from tempfile import TemporaryDirectory
1111

12+
import pypdfium2 as pdfium
1213
from defusedxml import ElementTree
1314
from django.conf import settings
1415
from litellm import completion
15-
from pdf2image import convert_from_path
1616
from PIL import Image
1717

1818
from learning_resources.constants import (
@@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
7171
run.checksum = checksum
7272
run.save()
7373

74-
return resource_readable_id, run
74+
return resource_readable_id
7575

7676

7777
def _course_url(course_archive_path) -> str:
@@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
323323
return dict(resources_dict)
324324

325325

326-
def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
326+
def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
327327
"""
328328
Convert a PDF file to a list of base64 encoded images (one per page).
329329
Resizes images to reduce file size while keeping good OCR quality.
@@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8
338338
Returns:
339339
list: List of base64 encoded strings (one per page)
340340
"""
341-
images = convert_from_path(pdf_path, dpi=dpi)
342-
base64_images = []
343341

344-
for image in images:
342+
pdf = pdfium.PdfDocument(pdf_path)
343+
for page_index in range(len(pdf)):
344+
page = pdf.get_page(page_index)
345+
image = page.render(scale=2).to_pil()
346+
page.close()
345347
# Resize the image if it's too large (preserving aspect ratio)
346348
if max(image.size) > max_size:
347349
image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
348-
349350
buffered = BytesIO()
350-
351351
# Save with optimized settings
352352
if fmt.upper() == "JPEG":
353353
image.save(buffered, format="JPEG", quality=quality, optimize=True)
354354
else: # PNG
355355
image.save(buffered, format="PNG", optimize=True)
356-
357356
img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
358-
base64_images.append(img_str)
359-
360-
return base64_images
357+
yield img_str
358+
pdf.close()
361359

362360

363361
def _pdf_to_markdown(pdf_path):
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Generated by Django 4.2.23 on 2025-08-14 15:20
2+
3+
import django.contrib.postgres.fields
4+
from django.db import migrations, models
5+
6+
7+
class Migration(migrations.Migration):
8+
dependencies = [
9+
("learning_resources", "0093_tutorproblem_view_group"),
10+
]
11+
12+
operations = [
13+
migrations.AlterField(
14+
model_name="contentsummarizerconfiguration",
15+
name="allowed_extensions",
16+
field=django.contrib.postgres.fields.ArrayField(
17+
base_field=models.CharField(
18+
choices=[
19+
(".csv", ".csv"),
20+
(".doc", ".doc"),
21+
(".docx", ".docx"),
22+
(".htm", ".htm"),
23+
(".html", ".html"),
24+
(".json", ".json"),
25+
(".m", ".m"),
26+
(".mat", ".mat"),
27+
(".md", ".md"),
28+
(".pdf", ".pdf"),
29+
(".ppt", ".ppt"),
30+
(".pptx", ".pptx"),
31+
(".ps", ".ps"),
32+
(".py", ".py"),
33+
(".r", ".r"),
34+
(".rtf", ".rtf"),
35+
(".sjson", ".sjson"),
36+
(".srt", ".srt"),
37+
(".txt", ".txt"),
38+
(".vtt", ".vtt"),
39+
(".xls", ".xls"),
40+
(".xlsx", ".xlsx"),
41+
(".xml", ".xml"),
42+
(".doc", ".doc"),
43+
(".docx", ".docx"),
44+
(".htm", ".htm"),
45+
(".html", ".html"),
46+
(".json", ".json"),
47+
(".md", ".md"),
48+
(".pdf", ".pdf"),
49+
(".tex", ".tex"),
50+
(".ppt", ".ppt"),
51+
(".pptx", ".pptx"),
52+
(".rtf", ".rtf"),
53+
(".sjson", ".sjson"),
54+
(".srt", ".srt"),
55+
(".txt", ".txt"),
56+
(".vtt", ".vtt"),
57+
(".xml", ".xml"),
58+
],
59+
max_length=128,
60+
),
61+
blank=True,
62+
default=list,
63+
null=True,
64+
size=None,
65+
),
66+
),
67+
]

learning_resources/tasks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):
511511
for archive in latest_archives.values():
512512
key = archive.key
513513
log.info("Ingesting canvas course %s", key)
514-
resource_readable_id, canvas_run = ingest_canvas_course(
514+
resource_readable_id = ingest_canvas_course(
515515
key,
516516
overwrite=overwrite,
517517
)

learning_resources/tasks_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
658658
# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
659659
mock_ingest_course = mocker.patch(
660660
"learning_resources.tasks.ingest_canvas_course",
661-
side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
661+
side_effect=["course1", "course2"],
662662
)
663663
sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)
664664

poetry.lock

Lines changed: 24 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0"
3434
django-filter = "^2.4.0"
3535
django-guardian = "^3.0.0"
3636
django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" } # pragma: allowlist secret
37+
38+
3739
django-imagekit = "^5.0.0"
3840
django-ipware = "^7.0.0"
3941
django-json-widget = "^2.0.0"
@@ -111,7 +113,9 @@ uwsgi = "^2.0.29"
111113
uwsgitop = "^0.12"
112114
wrapt = "^1.14.1"
113115
youtube-transcript-api = "^1.0.0"
114-
pdf2image = "^1.17.0"
116+
pypdfium2 = "^4.30.0"
117+
118+
115119

116120
[tool.poetry.group.dev.dependencies]
117121
bpython = "^0.25"

0 commit comments

Comments
 (0)