Skip to content

Commit 86f943f

Browse files
committed
AWS_PDF_Table_Textracts
1 parent 5acfefd commit 86f943f

File tree

2 files changed

+284
-0
lines changed

2 files changed

+284
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
import boto3
2+
import json
3+
import sys
4+
import time
5+
import urllib.parse
6+
7+
class ProcessType:
    """Enumeration of the two Amazon Textract processing modes."""

    DETECTION = 1  # plain text detection (start_document_text_detection)
    ANALYSIS = 2   # full analysis with tables (start_document_analysis)
10+
11+
12+
class DocumentProcessor:
    """Run an asynchronous Amazon Textract job on a PDF stored in S3 and save
    the results back to the same bucket as a ``.json`` object.

    Workflow (see :meth:`main`): create a throwaway SNS topic + SQS queue,
    start the Textract job with the topic as its completion channel, poll the
    queue until the job-completion message arrives, fetch the paginated
    results, store them in S3, then tear the topic/queue down.
    """

    # Class-level defaults; main()/CreateTopicandQueue() overwrite them per run.
    jobId = ''
    textract = boto3.client('textract')
    sqs = boto3.client('sqs')
    sns = boto3.client('sns')

    roleArn = ''
    bucket = ''
    document = ''

    sqsQueueUrl = ''
    snsTopicArn = ''
    processType = ''

    def main(self, bucketName, documentName):
        """Process s3://bucketName/documentName with Textract table analysis."""
        # NOTE(review): hard-coded account/role ARN — consider moving this to
        # an environment variable or Lambda configuration.
        self.roleArn = 'arn:aws:iam::357171621133:role/ETLlambdaAccessRole'

        self.bucket = bucketName
        self.document = documentName

        self.CreateTopicandQueue()
        try:
            self.ProcessDocument(ProcessType.ANALYSIS)
        finally:
            # Always clean up the throwaway topic/queue, even when the job
            # fails — otherwise every failed run leaks an SNS topic and an
            # SQS queue.
            self.DeleteTopicandQueue()

    def ProcessDocument(self, type):
        """Start a Textract job of the given kind and block until its
        completion notification arrives on the SQS queue.

        type: ProcessType.DETECTION or ProcessType.ANALYSIS. (The parameter
        name shadows the builtin but is kept for backward compatibility.)
        """
        jobFound = False

        self.processType = type
        validType = False

        # Start the asynchronous Textract job; completion is announced via
        # SNS to the queue created in CreateTopicandQueue().
        if self.processType == ProcessType.DETECTION:
            response = self.textract.start_document_text_detection(
                DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
                NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
            print('Processing type: Detection')
            validType = True

        if self.processType == ProcessType.ANALYSIS:
            response = self.textract.start_document_analysis(
                DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
                FeatureTypes=["TABLES"],
                NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
            print('Processing type: Analysis')
            validType = True

        if not validType:
            print("Invalid processing type. Choose Detection or Analysis.")
            return

        print('Start Job Id: ' + response['JobId'])
        dotLine = 0
        while not jobFound:
            sqsResponse = self.sqs.receive_message(
                QueueUrl=self.sqsQueueUrl,
                MessageAttributeNames=['ALL'],
                MaxNumberOfMessages=10)

            if sqsResponse:
                if 'Messages' not in sqsResponse:
                    # No message yet: print a progress dot (max 40 per line)
                    # and wait before polling again.
                    if dotLine < 40:
                        print('.', end='')
                        dotLine = dotLine + 1
                    else:
                        print()
                        dotLine = 0
                    sys.stdout.flush()
                    time.sleep(5)
                    continue

                for message in sqsResponse['Messages']:
                    # The SQS body wraps an SNS notification whose Message
                    # field is itself a JSON document from Textract.
                    notification = json.loads(message['Body'])
                    textMessage = json.loads(notification['Message'])
                    print(textMessage['JobId'])
                    print(textMessage['Status'])
                    if str(textMessage['JobId']) == response['JobId']:
                        print('Matching Job Found:' + textMessage['JobId'])
                        jobFound = True
                        results = self.GetResults(textMessage['JobId'])
                        self.StoreInS3(results)
                        self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
                                                ReceiptHandle=message['ReceiptHandle'])
                    else:
                        print("Job didn't match:" +
                              str(textMessage['JobId']) + ' : ' + str(response['JobId']))
                        # Delete the unknown message. Consider sending to dead letter queue
                        self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
                                                ReceiptHandle=message['ReceiptHandle'])

        print('Done!')

    # Store the result in a S3 bucket
    def StoreInS3(self, response):
        """Write the (already JSON-encoded) Textract results next to the
        source PDF, as ``<document-name>.json`` in the same bucket."""
        print('registering in s3 bucket...')
        outputInJsonText = str(response)
        pdfTextExtractionS3ObjectName = self.document.replace('.pdf', '')
        pdfTextExtractionS3Bucket = self.bucket

        s3 = boto3.client('s3')

        outputFileName = pdfTextExtractionS3ObjectName + '.json'
        s3.put_object(Body=outputInJsonText,
                      Bucket=pdfTextExtractionS3Bucket, Key=outputFileName)
        print('file ' + outputFileName + ' registered successfully!')

    def CreateTopicandQueue(self):
        """Create a uniquely named SNS topic and SQS queue, subscribe the
        queue to the topic, and authorize SNS to write to the queue."""
        # Millisecond timestamp makes the topic/queue names unique per run.
        millis = str(int(round(time.time() * 1000)))

        # Create SNS topic
        snsTopicName = "AmazonTextractTopic" + millis
        topicResponse = self.sns.create_topic(Name=snsTopicName)
        self.snsTopicArn = topicResponse['TopicArn']

        # create SQS queue
        sqsQueueName = "AmazonTextractQueue" + millis
        self.sqs.create_queue(QueueName=sqsQueueName)
        self.sqsQueueUrl = self.sqs.get_queue_url(
            QueueName=sqsQueueName)['QueueUrl']

        attribs = self.sqs.get_queue_attributes(
            QueueUrl=self.sqsQueueUrl,
            AttributeNames=['QueueArn'])['Attributes']

        sqsQueueArn = attribs['QueueArn']

        # Subscribe SQS queue to SNS topic
        self.sns.subscribe(
            TopicArn=self.snsTopicArn,
            Protocol='sqs',
            Endpoint=sqsQueueArn)

        # Authorize SNS to write SQS queue (doubled braces are literal braces
        # in str.format; only Resource and aws:SourceArn are substituted).
        policy = """{{
"Version":"2012-10-17",
"Statement":[
{{
"Sid":"MyPolicy",
"Effect":"Allow",
"Principal" : {{"AWS" : "*"}},
"Action":"SQS:SendMessage",
"Resource": "{}",
"Condition":{{
"ArnEquals":{{
"aws:SourceArn": "{}"
}}
}}
}}
]
}}""".format(sqsQueueArn, self.snsTopicArn)

        response = self.sqs.set_queue_attributes(
            QueueUrl=self.sqsQueueUrl,
            Attributes={
                'Policy': policy
            })

    def DeleteTopicandQueue(self):
        """Remove the per-run SQS queue and SNS topic."""
        self.sqs.delete_queue(QueueUrl=self.sqsQueueUrl)
        self.sns.delete_topic(TopicArn=self.snsTopicArn)

    def GetResults(self, jobId):
        """Fetch every result page of a finished Textract job.

        Follows NextToken pagination and returns the list of raw per-page
        API responses serialized as a single JSON string.
        """
        maxResults = 1000
        paginationToken = None
        finished = False
        pages = []

        while not finished:
            response = None

            if self.processType == ProcessType.ANALYSIS:
                if paginationToken is None:
                    response = self.textract.get_document_analysis(
                        JobId=jobId,
                        MaxResults=maxResults)
                else:
                    response = self.textract.get_document_analysis(
                        JobId=jobId,
                        MaxResults=maxResults,
                        NextToken=paginationToken)

            if self.processType == ProcessType.DETECTION:
                if paginationToken is None:
                    response = self.textract.get_document_text_detection(
                        JobId=jobId,
                        MaxResults=maxResults)
                else:
                    response = self.textract.get_document_text_detection(
                        JobId=jobId,
                        MaxResults=maxResults,
                        NextToken=paginationToken)

            # Put response on pages list
            pages.append(response)
            print('Document Detected.')

            if 'NextToken' in response:
                paginationToken = response['NextToken']
            else:
                finished = True

        # convert pages as JSON pattern
        pages = json.dumps(pages)
        return pages
211+
212+
213+
def lambda_handler(event, context):
    """AWS Lambda entry point: run Textract table analysis on the S3 object
    that triggered this invocation.

    event: S3 put-event payload; the first record identifies bucket and key.
    context: Lambda context object (unused).
    Returns a success string, or re-raises the underlying error after
    logging it.
    """
    analyzer = DocumentProcessor()

    # Get the object from the event and show its content type
    bucket = event['Records'][0]['s3']['bucket']['name']
    # S3 event keys arrive URL-encoded (spaces as '+'), so decode first.
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        analyzer.main(bucket, key)

        return 'Processing Done!'

    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        # Bare raise preserves the original traceback; `raise e` would
        # re-chain the exception onto itself.
        raise
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
## Login AZ Accounts
az login --use-device-code

## Create a web app in Azure

# 1) Create RG
$LOCATION='eastus'
$RESOURCE_GROUP_NAME='python-flask-webapp-rg'

# Create a resource group
az group create `
    --location $LOCATION `
    --name $RESOURCE_GROUP_NAME

# 2) Create App Service Plan
$APP_SERVICE_PLAN_NAME='python-flask-webapp-plan'

az appservice plan create `
    --name $APP_SERVICE_PLAN_NAME `
    --resource-group $RESOURCE_GROUP_NAME `
    --sku B1 `
    --is-linux

# 3) Create App service web app
$APP_SERVICE_NAME='python-flask-webapp-quicklabs'

az webapp create `
    --name $APP_SERVICE_NAME `
    --runtime 'PYTHON:3.9' `
    --plan $APP_SERVICE_PLAN_NAME `
    --resource-group $RESOURCE_GROUP_NAME `
    --query 'defaultHostName' `
    --output table

# 4) Enable build automation so Oryx builds the app during zip deployment.
az webapp config appsettings set `
    --resource-group $RESOURCE_GROUP_NAME `
    --name $APP_SERVICE_NAME `
    --settings SCM_DO_BUILD_DURING_DEPLOYMENT=true

# 5) Zip file upload.
az webapp deploy `
    --name $APP_SERVICE_NAME `
    --resource-group $RESOURCE_GROUP_NAME `
    --src-path F:\RekhuAll\Azure\Run_Sample_Flask_App_on_AzureWebApp\msdocs-python-flask-webapp-quickstart\msdocs-python-flask-webapp-quickstart.zip

# 6) Stream logs - Configuration
az webapp log config `
    --web-server-logging filesystem `
    --name $APP_SERVICE_NAME `
    --resource-group $RESOURCE_GROUP_NAME

# 7) Stream the log trail
az webapp log tail `
    --name $APP_SERVICE_NAME `
    --resource-group $RESOURCE_GROUP_NAME

0 commit comments

Comments
 (0)