-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_table_extraction.py
More file actions
68 lines (49 loc) · 1.78 KB
/
Copy pathtest_table_extraction.py
File metadata and controls
68 lines (49 loc) · 1.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python
"""
Quick test of table extraction from PDF documents.
Shows extracted tables with metadata for structured data retrieval.
"""
from __future__ import annotations
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parents[0]))
from ingestion.parser import extract_tables_with_metadata
from ingestion.table_extractor import format_table_for_llm
def main(pdf_dir: str = "data/uploads") -> int:
    """Extract and print every table from the first PDF found in *pdf_dir*.

    The ``*.pdf`` files directly inside ``pdf_dir`` are sorted by path and the
    first one is handed to ``extract_tables_with_metadata``. For each table
    returned, a header (title, type, page, size) is printed, followed by the
    output of ``format_table_for_llm``.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
            Defaults to ``data/uploads``, relative to the current working
            directory.

    Returns:
        Process exit code: 1 when no PDF is found, otherwise 0 (including
        the case where the document contains no tables).
    """
    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # makes "the first PDF" the same file on every run.
    pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))
    if not pdf_files:
        print(f"[ERROR] No PDF files found in {pdf_dir}/")
        return 1

    pdf_file = pdf_files[0]
    print(f"[INFO] Extracting tables from: {pdf_file.name}\n")

    tables = extract_tables_with_metadata(str(pdf_file))
    if not tables:
        print("[WARN] No tables found in document")
        return 0

    print(f"[✓] Found {len(tables)} tables\n")

    # Imported once here instead of on every loop iteration.
    from ingestion.table_extractor import ExtractedTable

    rule = "=" * 80
    for idx, table in enumerate(tables, 1):
        print(rule)
        print(f"[TABLE {idx}] {table['title']}")
        print(f" Type: {table['type']}")
        print(f" Page: {table['page']}")
        print(f" Size: {len(table['rows'])} rows × {len(table['headers'])} columns")
        print(rule)
        print()

        # Show formatted table for LLM: rebuild an ExtractedTable from the
        # dict fields so format_table_for_llm can be called on it.
        table_obj = ExtractedTable(
            title=table["title"],
            headers=table["headers"],
            rows=table["rows"],
            page=table["page"],
            text=table["text"],
        )
        print(format_table_for_llm(table_obj))
        print("\n")

    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())