@@ -49,7 +49,7 @@ async def test_processor_initialization(self, mock_producer, mock_consumer):
4949 async def test_on_message_success (self , mock_pdf_loader_class , mock_producer , mock_consumer ):
5050 """Test successful PDF processing"""
5151 # Mock PDF content
52- pdf_content = b"fake pdf content"
52+ pdf_content = b"%PDF-1.7 \n fake pdf content"
5353 pdf_base64 = base64 .b64encode (pdf_content ).decode ('utf-8' )
5454
5555 # Mock PyPDFLoader
@@ -88,13 +88,55 @@ async def test_on_message_success(self, mock_pdf_loader_class, mock_producer, mo
8888 # Verify triples were sent for each page (provenance)
8989 assert mock_triples_flow .send .call_count == 2
9090
@patch('trustgraph.base.librarian_client.Consumer')
@patch('trustgraph.base.librarian_client.Producer')
@patch('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader')
@patch('trustgraph.base.async_processor.AsyncProcessor', MockAsyncProcessor)
async def test_on_message_rejects_librarian_content_that_is_not_pdf(
    self, mock_pdf_loader_class, mock_producer, mock_consumer
):
    """Librarian content that is not a PDF must be dropped before loading.

    The mocked librarian reports the document kind as "application/pdf"
    but serves an HTML body. The test expects the processor to return
    without constructing the PDF loader, without sending anything on the
    "output" or "triples" flows, and without saving a child document.
    """
    # Base64-encoded HTML stands in for the mislabelled document body.
    encoded_body = base64.b64encode(b"<html><body>Not found</body></html>")

    # Incoming message referencing a librarian-held document by id.
    incoming = MagicMock()
    incoming.value.return_value = Document(
        metadata=Metadata(id="test-doc"),
        document_id="doc-123",
    )

    # Calling the flow with a name yields the matching sub-flow (or None).
    output_flow = AsyncMock()
    triples_flow = AsyncMock()
    flows_by_name = {
        "output": output_flow,
        "triples": triples_flow,
    }
    flow = MagicMock(side_effect=lambda name: flows_by_name.get(name))

    # Librarian stub: metadata says PDF, content says otherwise.
    librarian = flow.librarian
    librarian.fetch_document_metadata = AsyncMock(
        return_value=MagicMock(kind="application/pdf")
    )
    librarian.fetch_document_content = AsyncMock(return_value=encoded_body)
    librarian.save_child_document = AsyncMock()

    processor = Processor(id='test-pdf-decoder', taskgroup=AsyncMock())

    await processor.on_message(incoming, None, flow)

    # Nothing downstream may have been touched.
    must_stay_idle = (
        mock_pdf_loader_class,
        output_flow.send,
        triples_flow.send,
        librarian.save_child_document,
    )
    for idle_mock in must_stay_idle:
        idle_mock.assert_not_called()

132+
91133 @patch ('trustgraph.base.librarian_client.Consumer' )
92134 @patch ('trustgraph.base.librarian_client.Producer' )
93135 @patch ('trustgraph.decoding.pdf.pdf_decoder.PyPDFLoader' )
94136 @patch ('trustgraph.base.async_processor.AsyncProcessor' , MockAsyncProcessor )
95137 async def test_on_message_empty_pdf (self , mock_pdf_loader_class , mock_producer , mock_consumer ):
96138 """Test handling of empty PDF"""
97- pdf_content = b"fake pdf content"
139+ pdf_content = b"%PDF-1.7 \n fake pdf content"
98140 pdf_base64 = base64 .b64encode (pdf_content ).decode ('utf-8' )
99141
100142 mock_loader = MagicMock ()
@@ -126,7 +168,7 @@ async def test_on_message_empty_pdf(self, mock_pdf_loader_class, mock_producer,
126168 @patch ('trustgraph.base.async_processor.AsyncProcessor' , MockAsyncProcessor )
127169 async def test_on_message_unicode_content (self , mock_pdf_loader_class , mock_producer , mock_consumer ):
128170 """Test handling of unicode content in PDF"""
129- pdf_content = b"fake pdf content"
171+ pdf_content = b"%PDF-1.7 \n fake pdf content"
130172 pdf_base64 = base64 .b64encode (pdf_content ).decode ('utf-8' )
131173
132174 mock_loader = MagicMock ()
0 commit comments