Optimize memory footprint of pdf problem transcription task (#2433)

shanbady · web-flow · commit 8813d37c4c17 · 2025-08-15T11:39:42.000-04:00
* removing unused arg

* adding dep

* removing old dep

* try install from source

* pin python

* pin python

* test

* explicitly use right version of python

* explicitly use right version of python

* removing unused steps

* remove comment

* fixing other bug with canvas task json serialization

* adding LaTeX .tex files

* adding migration

* fix test

* pinning to pypi and removing unused ci steps

* pushing fix for ci

* test
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -47,12 +47,16 @@ jobs:
         with:
           python-version-file: "pyproject.toml"
           cache: "poetry"
-
+      - name: Install poetry with pip
+        run: python -m pip install poetry
       - name: Validate lockfile
         run: poetry check --lock
-
+      - name: Set Poetry Python
+        run: poetry env use python3.12
       - name: Install dependencies
-        run: poetry install --no-interaction
+        run: |
+          source $(poetry env info --path)/bin/activate
+          poetry install --no-interaction
 
       - name: Create test local state
         run: ./scripts/test/stub-data.sh
diff --git a/learning_resources/constants.py b/learning_resources/constants.py
@@ -147,6 +147,7 @@ class LearningResourceRelationTypes(TextChoices):
     ".json",
     ".md",
     ".pdf",
+    ".tex",
     ".ppt",
     ".pptx",
     ".rtf",
diff --git a/learning_resources/etl/canvas.py b/learning_resources/etl/canvas.py
@@ -9,10 +9,10 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
+import pypdfium2 as pdfium
 from defusedxml import ElementTree
 from django.conf import settings
 from litellm import completion
-from pdf2image import convert_from_path
 from PIL import Image
 
 from learning_resources.constants import (
@@ -71,7 +71,7 @@ def sync_canvas_archive(bucket, key: str, overwrite):
             run.checksum = checksum
             run.save()
 
-    return resource_readable_id, run
+    return resource_readable_id
 
 
 def _course_url(course_archive_path) -> str:
@@ -323,7 +323,7 @@ def extract_resources_by_identifierref(manifest_xml: str) -> dict:
     return dict(resources_dict)
 
 
-def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=85):
+def pdf_to_base64_images(pdf_path, fmt="JPEG", max_size=2000, quality=85):
     """
     Convert a PDF file to a list of base64 encoded images (one per page).
     Resizes images to reduce file size while keeping good OCR quality.
@@ -338,26 +338,24 @@ def pdf_to_base64_images(pdf_path, dpi=200, fmt="JPEG", max_size=2000, quality=8
     Returns:
         list: List of base64 encoded strings (one per page)
     """
-    images = convert_from_path(pdf_path, dpi=dpi)
-    base64_images = []
 
-    for image in images:
+    pdf = pdfium.PdfDocument(pdf_path)
+    for page_index in range(len(pdf)):
+        page = pdf.get_page(page_index)
+        image = page.render(scale=2).to_pil()
+        page.close()
         # Resize the image if it's too large (preserving aspect ratio)
         if max(image.size) > max_size:
             image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
-
         buffered = BytesIO()
-
         # Save with optimized settings
         if fmt.upper() == "JPEG":
             image.save(buffered, format="JPEG", quality=quality, optimize=True)
         else:  # PNG
             image.save(buffered, format="PNG", optimize=True)
-
         img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-        base64_images.append(img_str)
-
-    return base64_images
+        yield img_str
+    pdf.close()
 
 
 def _pdf_to_markdown(pdf_path):
diff --git a/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py b/learning_resources/migrations/0094_alter_contentsummarizerconfiguration_allowed_extensions.py
@@ -0,0 +1,67 @@
+# Generated by Django 4.2.23 on 2025-08-14 15:20
+
+import django.contrib.postgres.fields
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("learning_resources", "0093_tutorproblem_view_group"),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name="contentsummarizerconfiguration",
+            name="allowed_extensions",
+            field=django.contrib.postgres.fields.ArrayField(
+                base_field=models.CharField(
+                    choices=[
+                        (".csv", ".csv"),
+                        (".doc", ".doc"),
+                        (".docx", ".docx"),
+                        (".htm", ".htm"),
+                        (".html", ".html"),
+                        (".json", ".json"),
+                        (".m", ".m"),
+                        (".mat", ".mat"),
+                        (".md", ".md"),
+                        (".pdf", ".pdf"),
+                        (".ppt", ".ppt"),
+                        (".pptx", ".pptx"),
+                        (".ps", ".ps"),
+                        (".py", ".py"),
+                        (".r", ".r"),
+                        (".rtf", ".rtf"),
+                        (".sjson", ".sjson"),
+                        (".srt", ".srt"),
+                        (".txt", ".txt"),
+                        (".vtt", ".vtt"),
+                        (".xls", ".xls"),
+                        (".xlsx", ".xlsx"),
+                        (".xml", ".xml"),
+                        (".doc", ".doc"),
+                        (".docx", ".docx"),
+                        (".htm", ".htm"),
+                        (".html", ".html"),
+                        (".json", ".json"),
+                        (".md", ".md"),
+                        (".pdf", ".pdf"),
+                        (".tex", ".tex"),
+                        (".ppt", ".ppt"),
+                        (".pptx", ".pptx"),
+                        (".rtf", ".rtf"),
+                        (".sjson", ".sjson"),
+                        (".srt", ".srt"),
+                        (".txt", ".txt"),
+                        (".vtt", ".vtt"),
+                        (".xml", ".xml"),
+                    ],
+                    max_length=128,
+                ),
+                blank=True,
+                default=list,
+                null=True,
+                size=None,
+            ),
+        ),
+    ]
diff --git a/learning_resources/tasks.py b/learning_resources/tasks.py
@@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):
     for archive in latest_archives.values():
         key = archive.key
         log.info("Ingesting canvas course %s", key)
-        resource_readable_id, canvas_run = ingest_canvas_course(
+        resource_readable_id = ingest_canvas_course(
             key,
             overwrite=overwrite,
         )
diff --git a/learning_resources/tasks_test.py b/learning_resources/tasks_test.py
@@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
     # Patch ingest_canvas_course to return the readable_ids for the two non-stale courses
     mock_ingest_course = mocker.patch(
         "learning_resources.tasks.ingest_canvas_course",
-        side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],
+        side_effect=["course1", "course2"],
     )
     sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)
 
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,6 +34,8 @@ django-cors-headers = "^4.0.0"
 django-filter = "^2.4.0"
 django-guardian = "^3.0.0"
 django-health-check = { git = "https://github.com/revsys/django-health-check", rev="b0500d14c338040984f02ee34ffbe6643b005084" }  # pragma: allowlist secret
+
+
 django-imagekit = "^5.0.0"
 django-ipware = "^7.0.0"
 django-json-widget = "^2.0.0"
@@ -111,7 +113,9 @@ uwsgi = "^2.0.29"
 uwsgitop = "^0.12"
 wrapt = "^1.14.1"
 youtube-transcript-api = "^1.0.0"
-pdf2image = "^1.17.0"
+pypdfium2 = "^4.30.0"
+
+
 
 [tool.poetry.group.dev.dependencies]
 bpython = "^0.25"

Original file line number	Diff line number	Diff line change
`@@ -511,7 +511,7 @@ def sync_canvas_courses(canvas_course_ids, overwrite):`
`511`	`511`	`for archive in latest_archives.values():`
`512`	`512`	`key = archive.key`
`513`	`513`	`log.info("Ingesting canvas course %s", key)`
`514`		`- resource_readable_id, canvas_run = ingest_canvas_course(`
	`514`	`+ resource_readable_id = ingest_canvas_course(`
`515`	`515`	`key,`
`516`	`516`	`overwrite=overwrite,`
`517`	`517`	`)`
Original file line number	Diff line number	Diff line change
`@@ -658,7 +658,7 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas`
`658`	`658`	`# Patch ingest_canvas_course to return the readable_ids for the two non-stale courses`
`659`	`659`	`mock_ingest_course = mocker.patch(`
`660`	`660`	`"learning_resources.tasks.ingest_canvas_course",`
`661`		`- side_effect=[("course1", lr1.runs.first()), ("course2", lr2.runs.first())],`
	`661`	`+ side_effect=["course1", "course2"],`
`662`	`662`	`)`
`663`	`663`	`sync_canvas_courses(canvas_course_ids=canvas_ids, overwrite=False)`
`664`	`664`