pdf_oxide/examples/ocr_example.py at main · yfedoseev/pdf_oxide · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python3
"""
OCR text extraction example using pdf_oxide.

This script demonstrates how to extract text from scanned PDFs using
PaddleOCR models via ONNX Runtime.

Prerequisites:
    1. Build pdf_oxide with OCR feature:
       maturin develop --features python,ocr

    2. Download PaddleOCR ONNX models:
       ./scripts/setup_ocr_models.sh
       Or manually download the recommended V4 det + V5 rec combination:
       - det.onnx   (ch_PP-OCRv4_det from https://huggingface.co/deepghs/paddleocr)
       - rec.onnx   (en_PP-OCRv5_mobile_rec from https://huggingface.co/monkt/paddleocr-onnx)
       - en_dict.txt (character dictionary, must have space as last line)

    3. ONNX Runtime (libonnxruntime.so v1.23+) must be on LD_LIBRARY_PATH.

Usage:
    python ocr_example.py <pdf_file> --det <det_model> --rec <rec_model> --dict <dict_file> [--page N] [--v5]

Example (recommended V4 det + V5 rec):
    python ocr_example.py scanned.pdf \\
        --det .models/det.onnx \\
        --rec .models/rec.onnx \\
        --dict .models/en_dict.txt

Example (full V5 stack):
    python ocr_example.py scanned.pdf \\
        --det .models/v5/det.onnx \\
        --rec .models/v5/rec.onnx \\
        --dict .models/v5/en_dict.txt \\
        --v5
"""

import argparse
import sys
from pathlib import Path


def _fail(*lines):
    """Print each given line to stderr, then exit the script with status 1."""
    for line in lines:
        print(line, file=sys.stderr)
    sys.exit(1)


def main():
    """Run the OCR example from the command line.

    Parses the command-line arguments, imports the pdf_oxide bindings, checks
    that the PDF, both ONNX models and the dictionary exist as files, builds an
    ``OcrEngine`` and prints the OCR text of the selected page (or of every
    page when ``--page`` is not given).

    Progress and extracted text are written to stdout; error messages are
    written to stderr so that redirected output contains only results.

    Raises:
        SystemExit: with status 1 when an import fails, an input file is
            missing, the engine or the PDF cannot be loaded, ``--page`` is out
            of range, or OCR failed on at least one page (reported only after
            every requested page has been attempted).
    """
    parser = argparse.ArgumentParser(description="Extract text from scanned PDFs using OCR")
    parser.add_argument("pdf", help="Path to PDF file")
    parser.add_argument("--det", required=True, help="Path to detection model (ONNX)")
    parser.add_argument("--rec", required=True, help="Path to recognition model (ONNX)")
    parser.add_argument("--dict", required=True, help="Path to character dictionary")
    parser.add_argument("--page", type=int, help="Process only this page (0-indexed)")
    parser.add_argument(
        "--v5",
        action="store_true",
        help="Use PP-OCRv5 config (high-res detection input, for V5 detection models)",
    )
    args = parser.parse_args()

    # Import pdf_oxide lazily so a missing build yields a helpful message
    # instead of a traceback at module import time.
    try:
        from pdf_oxide import PdfDocument
    except ImportError as e:
        _fail(
            f"Error: Failed to import pdf_oxide: {e}",
            "Make sure to build with: maturin develop --features python,ocr",
        )

    # Import OCR classes (only available when built with 'ocr' feature)
    try:
        from pdf_oxide import OcrConfig, OcrEngine
    except ImportError:
        _fail(
            "Error: pdf_oxide was not built with OCR support",
            "Rebuild with: maturin develop --features python,ocr",
        )

    # Validate paths. is_file() is used rather than exists() so that a
    # directory passed by mistake is rejected here as well.
    for path, name in [
        (args.pdf, "PDF file"),
        (args.det, "detection model"),
        (args.rec, "recognition model"),
        (args.dict, "dictionary"),
    ]:
        if not Path(path).is_file():
            _fail(f"Error: {name} not found: {path}")

    print("=" * 70)
    print("PDF OCR Example")
    print("=" * 70)
    print()

    # Create OCR configuration
    # use_v5=True preserves high-resolution input for V5 detection models.
    # For the default V4 det + V5 rec combination, use_v5 should be False.
    if args.v5:
        print("Using PP-OCRv5 config (high-resolution detection input)")
    config = OcrConfig(use_v5=args.v5)
    print(f"Config: {config}")

    # Load OCR engine
    print("\nLoading OCR models...")
    try:
        engine = OcrEngine(
            det_model_path=args.det,
            rec_model_path=args.rec,
            dict_path=args.dict,
            config=config,
        )
    except Exception as e:
        _fail(f"Error loading OCR engine: {e}")
    print("OCR engine loaded successfully!")

    # Open PDF
    print(f"\nOpening PDF: {args.pdf}")
    try:
        doc = PdfDocument(args.pdf)
        page_count = doc.page_count()
    except Exception as e:
        _fail(f"Error opening PDF: {e}")
    print(f"PDF has {page_count} pages")

    # Determine pages to process (--page is 0-indexed)
    if args.page is not None:
        if args.page < 0 or args.page >= page_count:
            _fail(f"Error: Page {args.page} out of range (0-{page_count - 1})")
        pages = [args.page]
    else:
        pages = range(page_count)

    # Process each page, remembering failures so the exit status can reflect them
    failed_pages = []
    for page_idx in pages:
        print()
        print("-" * 70)
        # The header is 1-based for display, unlike the 0-indexed --page option.
        print(f"Page {page_idx + 1} of {page_count}")
        print("-" * 70)

        try:
            text = doc.extract_text_ocr(page=page_idx, engine=engine)
            if text.strip():
                print("\nExtracted text:")
                print(text)
            else:
                print("(No text detected)")
        except Exception as e:
            # Best effort: report the failure and continue with remaining pages.
            print(f"OCR failed: {e}", file=sys.stderr)
            failed_pages.append(page_idx)

    print()
    print("=" * 70)
    print("Done!")
    print("=" * 70)

    if failed_pages:
        _fail(f"Error: OCR failed on {len(failed_pages)} of {len(pages)} page(s)")


# Run the example only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()