diff --git a/README.md b/README.md index 9bf437e..fbac120 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ from aipdf import ocr api_key = 'your_openai_api_key' file = open('somepdf.pdf', 'rb') -markdown_pages = ocr(file, api_key, prompt="extract markdown, extract tables and turn charts into tables") +markdown_pages = ocr(file, api_key) ``` @@ -47,7 +47,7 @@ We chose that you pass a file object, because that way it is flexible for you to pdf_file = io.BytesIO(requests.get('https://arxiv.org/pdf/2410.02467').content) # extract -pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json") +pages = ocr(pdf_file, api_key, prompt="extract tables, return each table in json") ``` ### From S3 @@ -62,7 +62,7 @@ s3 = boto3.client('s3', config=Config(signature_version='s3v4'), pdf_file = io.BytesIO(s3.get_object(Bucket=bucket_name, Key=object_key)['Body'].read()) # extract -pages = ocr(pdf_file, api_key, prompt="extract tables and turn charts into tables, return each table in json") +pages = ocr(pdf_file, api_key, prompt="extract charts data, turn it into tables that represent the variables in the chart") ``` diff --git a/src/aipdf/ocr.py b/src/aipdf/ocr.py index 40dfc54..d4df7ab 100644 --- a/src/aipdf/ocr.py +++ b/src/aipdf/ocr.py @@ -1,17 +1,23 @@ import io -from pdf2image import convert_from_bytes -from PIL import Image import base64 -import requests -import os import logging import concurrent.futures + +from pdf2image import convert_from_bytes from openai import OpenAI # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -DEFAULT_PROMPT = "Please analyze this image and provide a markdown representation of its content. Include headings, lists, and any other relevant markdown formatting." +DEFAULT_PROMPT = """ +Extract the full markdown text from the given image, following these guidelines: +- Respond only with markdown, no additional commentary. 
+- Capture all the text, respecting titles, headers, subheaders, equations, etc. +- If there are tables in this page, convert each one into markdown table format and include it in the response. +- If there are images, provide a brief description of what is shown in each image, and include it in the response. +- if there are charts, for each chart include a markdown table with the data represents the chart, a column for each of the variables of the cart and the relevant estimated values + +""" def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEFAULT_PROMPT): """ @@ -26,9 +32,11 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF Returns: str: The markdown representation of the image content, or None if an error occurs. """ - - base64_image = base64.b64encode(file_object.read()).decode('utf-8') + # Log that we're about to process a page + logging.info("About to process a page") + base64_image = base64.b64encode(file_object.read()).decode('utf-8') + try: response = client.chat.completions.create( model=model, @@ -53,6 +61,7 @@ def process_image_to_markdown(file_object, client, model="gpt-4o", prompt = DEF # Extract the markdown content from the response markdown_content = response.choices[0].message.content + logging.info("Page processed successfully") return markdown_content except Exception as e: @@ -99,7 +108,7 @@ def pdf_to_image_files(pdf_file): return image_files -def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT): +def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1', prompt=DEFAULT_PROMPT, pages_list = None): """ Convert a PDF file to a list of markdown-formatted pages using OpenAI's API. 
@@ -109,13 +118,17 @@ def ocr(pdf_file, api_key, model="gpt-4o", base_url= 'https://api.openai.com/v1' model (str, optional): by default is gpt-4o base_url (str): You can use this one to point the client whereever you need it like Ollama prompt (str, optional): The prompt to send to the API. Defaults to DEFAULT_PROMPT. - + pages_list (list, optional): A list of 1-based page numbers to process. If provided, only these pages will be converted. Defaults to None, which processes all pages. Returns: list: A list of strings, each containing the markdown representation of a PDF page. """ client = OpenAI(api_key=api_key, base_url = base_url) # Create OpenAI client # Convert PDF to image files image_files = pdf_to_image_files(pdf_file) + + if pages_list: + # Keep only the pages whose 1-based page number appears in pages_list + image_files = [img for i, img in enumerate(image_files) if i + 1 in pages_list] # List to store markdown content for each page markdown_pages = [None] * len(image_files)