Merge pull request #56 from datalogics-jacksonm/generate-pdf-text-summary-example

PDFCLOUD-2730 | Add Generate PDF Text Summary example
datalogics-tsmith authored Nov 10, 2023
2 parents 257673e + 854a256 commit c02dcad
Showing 1 changed file with 116 additions and 0 deletions.
@@ -0,0 +1,116 @@
import openai
from requests_toolbelt import MultipartEncoder
import requests
import tiktoken
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Configurations


openai.api_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" # place your OpenAI API Key here
pdfRest_api_key = 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' # place your pdfRest API Key here
extract_text_endpoint_url = 'https://api.pdfrest.com/extracted-text'
query_prompt = ('Assuming the following text is from a PDF Document, derive a relatively short summary of the '
                'contents. \n\n')
file_name = 'FileName.pdf' # file name of the PDF to send to /extracted-text
file_path = '/PATH/TO/FILE/'
MAX_CONTEXT_SIZE = 127000  # just under the 128K-token context window of gpt-4-1106-preview

# This is used for determining the length of a given string in tokens, so that we don't try sending
# too much data to ChatGPT
enc = tiktoken.get_encoding("cl100k_base")
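# For example, a quick way to see what a string costs in tokens (illustrative; the
# exact count depends on the chosen encoding):
#   len(enc.encode("Hello, world"))  # -> 3 with cl100k_base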


# Helper function that allows larger documents to be processed without hitting ChatGPT's rate limits.
# Note: openai.ChatCompletion is the pre-1.0 openai-python interface, so this example assumes openai<1.0.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def completion_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)


# The /extracted-text endpoint can take a single PDF file or id as input.
mp_encoder_extractText = MultipartEncoder(
    fields={
        'file': (
            file_name,
            open(file_path + file_name, 'rb'),
            'application/pdf'
        )
    }
)
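# (Hypothetical variant, based on the note above that /extracted-text also accepts an id:
# a file already uploaded to pdfRest could be referenced by replacing the 'file' field
# with an 'id' field, e.g. MultipartEncoder(fields={'id': previously_returned_id}),
# where previously_returned_id is a placeholder for an id from an earlier response.)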

# Let's set the headers that the /extracted-text endpoint expects. Since MultipartEncoder is used, the 'Content-Type'
# header gets set to 'multipart/form-data' via the content_type attribute below.
headers = {
    'Accept': 'application/json',
    'Content-Type': mp_encoder_extractText.content_type,
    'Api-Key': pdfRest_api_key
}
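# For reference, mp_encoder_extractText.content_type evaluates to something like
# 'multipart/form-data; boundary=...', with a unique boundary per encoder instance.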

# Send the POST request to the /extracted-text endpoint
response = requests.post(extract_text_endpoint_url, data=mp_encoder_extractText, headers=headers)

if response.ok:
    print("Building prompt...")
    response_json = response.json()

    # To get the full text of the document, grab the fullText attribute out of the resulting JSON
    fullText = response_json["fullText"]

    # To keep track of where we are in the document, split the resulting string into a list,
    # delimited by spaces.
    fullTextArray = fullText.split()

    # Start the query with query_prompt; text returned from /extracted-text is appended to it below
    query_string = query_prompt

    # This logic sets up a loop that will continue until all the contents of the document have been processed,
    # keeping track of any summaries returned by ChatGPT.
    shouldLoop = True
    summaryList = []
    i = 0
    while shouldLoop:
        shouldLoop = False
        while len(enc.encode(query_string)) < MAX_CONTEXT_SIZE and i < len(fullTextArray):
            query_string += fullTextArray[i] + " "
            i += 1
            shouldLoop = True

        # For visual feedback, print how much of the document has been processed with each request sent
        print(f"Got to element #{i} out of {len(fullTextArray)}. \n")

        # Send the query off to ChatGPT using the gpt-4-1106-preview (also known as GPT-4 Turbo) model
        chat_completion = completion_with_backoff(model="gpt-4-1106-preview",
                                                  messages=[{"role": "user", "content": query_string}])

        # Reset query_string back to the default value of query_prompt
        query_string = query_prompt

        # Add the newly returned summary to the summaryList
        summaryList.append(chat_completion.choices[0].message.content)

        # If either of these conditions holds, break from the loop
        if len(enc.encode(query_string)) > MAX_CONTEXT_SIZE or i >= len(fullTextArray):
            break

    # If it took multiple summaries to process the entire document, compile the summaries and summarize them again
    # into a more cohesive, singular summary.
    if len(summaryList) > 1:
        summary_string = ""
        for summary in summaryList:
            summary_string += " " + summary

        summary_query = ("Assuming the following text is a compilation of summaries about the contents of a single PDF "
                         "document, create a detailed comprehensive summary of the given text. \n\n") + summary_string
        final_chat_completion = completion_with_backoff(model="gpt-4-1106-preview",
                                                        messages=[{"role": "user", "content": summary_query}])

        print("\n" + final_chat_completion.choices[0].message.content + "\n")
    elif len(summaryList) == 1:
        print("\n" + summaryList[0] + "\n")

else:
    print(response.text)
