@@ -45,7 +45,7 @@ from typing import Optional
4545
4646import pandas as pd
4747from pydantic import BaseModel, Field
48- from document_extraction_pipeline import main
48+ from document_extraction_pipeline import extract_structured_data
4949
5050
5151# 1. Define your schema
@@ -77,7 +77,7 @@ If a field is missing, return null.
7777if __name__ == "__main__":
7878 invoice_paths = [" invoice1.pdf" , " invoice2.pdf" ]
7979
80- result_df = main (
80+ result_df = extract_structured_data (
8181 image_paths = invoice_paths,
8282 output_cls = Invoice,
8383 prompt = INVOICE_PROMPT ,
@@ -90,10 +90,10 @@ if __name__ == "__main__":
9090
9191## API Reference
9292
93- ### `main()` Function
93+ ### `extract_structured_data()` Function
9494
9595``` python
96- def main (
96+ def extract_structured_data (
9797 image_paths : List[str ],
9898 output_cls : Type[BaseModel],
9999 prompt : str ,
@@ -127,15 +127,15 @@ def main(
127127### Basic Extraction
128128
129129```python
130- from document_extraction_pipeline import main
130+ from document_extraction_pipeline import extract_structured_data
131131from pydantic import BaseModel, Field
132132
133133class BusinessCard(BaseModel):
134134 name: str = Field(description = " Person's name" )
135135 company: str = Field(description = " Company name" )
136136 email: str = Field(description = " Email address" )
137137
138- result = main (
138+ result = extract_structured_data (
139139 image_paths = [" card.jpg" ],
140140 output_cls = BusinessCard,
141141 prompt = " Extract business card info: {context_str} " ,
@@ -148,7 +148,7 @@ result = main(
148148from pathlib import Path
149149from extract_receipts_pipeline import Receipt
150150
151- result = main (
151+ result = extract_structured_data (
152152 image_paths = [" low_res.jpg" ],
153153 output_cls = Receipt,
154154 prompt = " Extract receipt: {context_str} " ,
@@ -168,7 +168,7 @@ def clean_data(df: pd.DataFrame) -> pd.DataFrame:
168168 df[" email" ] = df[" email" ].str.lower()
169169 return df
170170
171- result = main (
171+ result = extract_structured_data (
172172 image_paths = [" form.pdf" ],
173173 output_cls = FormData,
174174 prompt = " Extract: {context_str} " ,
@@ -180,7 +180,7 @@ result = main(
180180
181181To create a new document extractor (like the receipt pipeline):
182182
183- 1. Import the generic `main` function from `document_extraction_pipeline`
183+ 1. Import the generic `extract_structured_data` function from `document_extraction_pipeline`
1841842 . Define your Pydantic schema(s)
1851853 . (Optional) Create transformation function
1861864 . Define extraction prompt
0 commit comments