|
50 | 50 | MAX_CONCURRENCY_LEVEL = 50 |
51 | 51 | MIN_PAGES_PER_SPLIT = 2 |
52 | 52 | MAX_PAGES_PER_SPLIT = 20 |
| 53 | +HI_RES_STRATEGY = 'hi_res' |
| 54 | +MAX_PAGE_LENGTH = 4000 |
53 | 55 |
|
54 | 56 |
|
55 | 57 | async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]: |
@@ -334,6 +336,8 @@ def before_request( |
334 | 336 | if split_size >= page_count and page_count == len(pdf.pages): |
335 | 337 | return request |
336 | 338 |
|
| 339 | + pdf = self._trim_large_pages(pdf, form_data) |
| 340 | + |
337 | 341 | if self.cache_tmp_data_feature: |
338 | 342 | pdf_chunk_paths = self._get_pdf_chunk_paths( |
339 | 343 | pdf, |
@@ -423,6 +427,34 @@ async def call_api_partial( |
423 | 427 |
|
424 | 428 | return response |
425 | 429 |
|
def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
    """Crop pages that are too tall to be processed with the hi_res strategy.

    Only requests whose ``strategy`` form field equals ``HI_RES_STRATEGY``
    are affected. Every page whose media box height is at least
    ``MAX_PAGE_LENGTH`` is cropped to its top ``MAX_PAGE_LENGTH`` units;
    all other pages are copied unchanged.

    Args:
        pdf: Reader for the document about to be split.
        form_data: Request form fields; only ``strategy`` is read.

    Returns:
        ``pdf`` itself when nothing has to be trimmed, otherwise a new
        ``PdfReader`` over an in-memory copy containing the cropped pages.

    Note:
        The media boxes of oversized pages are adjusted on the page objects
        of ``pdf`` before they are copied into the new document.
    """
    # A missing strategy is treated the same as any non-hi_res strategy.
    if form_data.get('strategy') != HI_RES_STRATEGY:
        return pdf

    max_page_length = MAX_PAGE_LENGTH

    # Early exit if all pages are safely under the max page length, so the
    # document is not re-serialized needlessly.
    if not any(page.mediabox.height >= max_page_length for page in pdf.pages):
        return pdf

    writer = PdfWriter()

    # Trim large pages that exceed the maximum supported height for processing.
    for page in pdf.pages:
        if page.mediabox.height >= max_page_length:
            # Keep the top edge where it is and raise the bottom edge. The
            # media box does not have to start at y=0, so the top must not be
            # derived from the height.
            page.mediabox.bottom = page.mediabox.top - max_page_length
        writer.add_page(page)

    trimmed_buffer = io.BytesIO()
    writer.write(trimmed_buffer)
    trimmed_buffer.seek(0)  # rewind so the new reader parses from the start
    return PdfReader(trimmed_buffer)
| 457 | + |
426 | 458 | def _get_pdf_chunks_in_memory( |
427 | 459 | self, |
428 | 460 | pdf: PdfReader, |
|
0 commit comments