Skip to content

Commit 79d7ef6

Browse files
jmolz authored and cybermaggedon committed
fix: reject invalid PDF decoder input (#977)
1 parent e1c9351 commit 79d7ef6

2 files changed

Lines changed: 75 additions & 25 deletions

File tree

tests/unit/test_decoding/test_pdf_decoder.py

Lines changed: 45 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ async def test_processor_initialization(self, mock_producer, mock_consumer):
4949
async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mock_consumer):
5050
"""Test successful PDF processing"""
5151
# Mock PDF content
52-
pdf_content = b"fake pdf content"
52+
pdf_content = b"%PDF-1.7\nfake pdf content"
5353
pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
5454

5555
# Mock PyPDFLoader
@@ -88,13 +88,55 @@ async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mo
8888
# Verify triples were sent for each page (provenance)
8989
assert mock_triples_flow.send.call_count == 2
9090

91+
@patch('trustgraph.base.librarian_client.Consumer')
92+
@patch('trustgraph.base.librarian_client.Producer')
93+
@patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
94+
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
95+
async def test_on_message_rejects_librarian_content_that_is_not_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer):
96+
"""Test rejecting non-PDF content before invoking the PDF loader"""
97+
html_content = b"<html><body>Not found</body></html>"
98+
html_base64 = base64.b64encode(html_content)
99+
100+
mock_metadata = Metadata(id="test-doc")
101+
mock_document = Document(metadata=mock_metadata, document_id="doc-123")
102+
mock_msg = MagicMock()
103+
mock_msg.value.return_value = mock_document
104+
105+
mock_output_flow = AsyncMock()
106+
mock_triples_flow = AsyncMock()
107+
mock_flow = MagicMock(side_effect=lambda name: {
108+
"output": mock_output_flow,
109+
"triples": mock_triples_flow,
110+
}.get(name))
111+
mock_flow.librarian.fetch_document_metadata = AsyncMock(
112+
return_value=MagicMock(kind="application/pdf")
113+
)
114+
mock_flow.librarian.fetch_document_content = AsyncMock(
115+
return_value=html_base64
116+
)
117+
mock_flow.librarian.save_child_document = AsyncMock()
118+
119+
config = {
120+
'id': 'test-pdf-decoder',
121+
'taskgroup': AsyncMock()
122+
}
123+
124+
processor = Processor(**config)
125+
126+
await processor.on_message(mock_msg, None, mock_flow)
127+
128+
mock_pdf_loader_class.assert_not_called()
129+
mock_output_flow.send.assert_not_called()
130+
mock_triples_flow.send.assert_not_called()
131+
mock_flow.librarian.save_child_document.assert_not_called()
132+
91133
@patch('trustgraph.base.librarian_client.Consumer')
92134
@patch('trustgraph.base.librarian_client.Producer')
93135
@patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
94136
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
95137
async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer, mock_consumer):
96138
"""Test handling of empty PDF"""
97-
pdf_content = b"fake pdf content"
139+
pdf_content = b"%PDF-1.7\nfake pdf content"
98140
pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
99141

100142
mock_loader = MagicMock()
@@ -126,7 +168,7 @@ async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer,
126168
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
127169
async def test_on_message_unicode_content(self, mock_pdf_loader_class, mock_producer, mock_consumer):
128170
"""Test handling of unicode content in PDF"""
129-
pdf_content = b"fake pdf content"
171+
pdf_content = b"%PDF-1.7\nfake pdf content"
130172
pdf_base64 = base64.b64encode(pdf_content).decode('utf-8')
131173

132174
mock_loader = MagicMock()

trustgraph-flow/trustgraph/decoding/pdf/pdf_decoder.py

Lines changed: 30 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
default_ident = "document-decoder"
3333

3434

35+
def _looks_like_pdf(content):
36+
return content.lstrip().startswith(b"%PDF-")
37+
38+
3539
class Processor(FlowProcessor):
3640

3741
def __init__(self, **params):
@@ -94,33 +98,37 @@ async def on_message(self, msg, consumer, flow):
9498
)
9599
return
96100

97-
with tempfile.NamedTemporaryFile(delete_on_close=False, suffix='.pdf') as fp:
98-
temp_path = fp.name
101+
# Check if we should fetch from librarian or use inline data
102+
if v.document_id:
103+
# Fetch from librarian via Pulsar
104+
logger.info(f"Fetching document {v.document_id} from librarian...")
99105

100-
# Check if we should fetch from librarian or use inline data
101-
if v.document_id:
102-
# Fetch from librarian via Pulsar
103-
logger.info(f"Fetching document {v.document_id} from librarian...")
104-
fp.close()
106+
content = await flow.librarian.fetch_document_content(
107+
document_id=v.document_id,
105108

106-
content = await flow.librarian.fetch_document_content(
107-
document_id=v.document_id,
108-
109-
)
109+
)
110110

111-
# Content is base64 encoded
112-
if isinstance(content, str):
113-
content = content.encode('utf-8')
114-
decoded_content = base64.b64decode(content)
111+
# Content is base64 encoded
112+
if isinstance(content, str):
113+
content = content.encode('utf-8')
114+
decoded_content = base64.b64decode(content)
115115

116-
with open(temp_path, 'wb') as f:
117-
f.write(decoded_content)
116+
logger.info(f"Fetched {len(decoded_content)} bytes from librarian")
117+
else:
118+
# Use inline data (backward compatibility)
119+
decoded_content = base64.b64decode(v.data)
118120

119-
logger.info(f"Fetched {len(decoded_content)} bytes from librarian")
120-
else:
121-
# Use inline data (backward compatibility)
122-
fp.write(base64.b64decode(v.data))
123-
fp.close()
121+
if not _looks_like_pdf(decoded_content):
122+
logger.error(
123+
f"Document {v.metadata.id} is not valid PDF content. "
124+
f"Ignoring document."
125+
)
126+
return
127+
128+
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as fp:
129+
temp_path = fp.name
130+
fp.write(decoded_content)
131+
fp.close()
124132

125133
global PyPDFLoader
126134
if PyPDFLoader is None:

0 commit comments

Comments
 (0)