-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexam_text_extractor.py
113 lines (75 loc) · 3.55 KB
/
exam_text_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import datastore_manager
from google.cloud import storage
from google.cloud import vision
import os
import textract
import requests
import json
# --- Runtime configuration and pending-exam query ---------------------------
# LCP and PROJECT_ID are required environment variables; a missing one raises
# KeyError as soon as the script starts. LCP is upper-cased before any use.
lcp = os.environ['LCP'].upper()
project = os.environ['PROJECT_ID']

# The backend hostname is derived from LCP rather than being read from an
# APP_HOSTNAME environment variable.
app_hostname = "{lcp}-exambae-backend-dot-demolisherapp.appspot.com".format(lcp=lcp)
insert_document_endpoint = 'http://{app_hostname}/document/insert'.format(app_hostname=app_hostname)

# Google Cloud clients shared by the rest of the script.
storage_client = storage.Client()
vision_client = vision.ImageAnnotatorClient()

# The Datastore kind holding exams that still need text extraction is
# suffixed with the (upper-cased) LCP value.
exams_to_be_processed_kind = 'exams_to_be_processed_{LCP}'.format(LCP=lcp)
exam_dm = datastore_manager.DatastoreManager(project, exams_to_be_processed_kind, lcp=lcp)

# NOTE(review): gql_query is built but not referenced again in this part of
# the file; the query actually executed is the client query below.
gql_query = "SELECT * from {kind}".format(kind=exams_to_be_processed_kind)
query = exam_dm.client.query(kind=exams_to_be_processed_kind)
results = exam_dm.run_query(query)

# Call form of print: identical output on Python 2 and valid on Python 3.
print(results)
def _ascii_only(value):
    """Return the text *value* with every non-ASCII character dropped.

    The result is decoded back to text so that it is JSON-serializable on
    both Python 2 and Python 3 (a bare ``encode`` yields ``bytes`` on 3).
    """
    return value.encode('ascii', 'ignore').decode('ascii')


# Language hint asking the Vision API to favour handwriting recognition.
# It does not depend on the exam, so it is built once outside the loop.
handWriting_image_context = vision.types.ImageContext(
    language_hints=['en-t-i0-handwrit'])

# One payload dict is collected here for every exam that yields any text.
json_dicts = []
for result in results:
    # result['file'] is assumed to look like "<bucket>/o/<object name>".
    # partition() splits on the first '/o/' only, so an object name that
    # itself contains '/o/' is kept whole, and a reference without the
    # separator is reported and skipped instead of raising IndexError.
    bucket_name, separator, blob_name = result['file'].partition('/o/')
    if not separator or not blob_name:
        print("skipping exam with unexpected file reference: {0!r}".format(result['file']))
        continue

    bucket = storage_client.get_bucket(bucket_name)
    blob = storage.blob.Blob(blob_name, bucket)

    # Download under the object's base name: an object name containing '/'
    # would otherwise point into a local directory that does not exist.
    # The base name keeps the file extension for textract.
    local_path = os.path.basename(blob_name)
    local_text = None
    try:
        with open(local_path, 'wb') as file_obj:
            blob.download_to_file(file_obj)
        try:
            text = textract.process(local_path)
        except Exception as exc:
            # Local extraction is best-effort: the cloud OCR results below
            # are still collected, but the cause is reported.
            print("textract failed for {0}: {1}".format(blob_name, exc))
        else:
            print("local**********************")
            print(text)
            # textract hands back a byte string; decode it once so the
            # replace() calls and json.dumps() work on Python 2 and 3.
            if isinstance(text, bytes):
                text = text.decode('utf-8', 'ignore')
            local_text = text.replace('\f', '').replace('\n', '')
            print("local*******************")
    finally:
        # The local copy is only needed for textract; do not leave it behind.
        if os.path.exists(local_path):
            os.remove(local_path)

    # All three Vision requests read the object straight from Cloud Storage.
    image = vision.types.Image()
    image.source.image_uri = 'gs://{bucket_name}/{blob_name}'.format(
        bucket_name=bucket_name, blob_name=blob_name)
    gcp_document_text_response = vision_client.document_text_detection(image=image)
    gcp_text_response = vision_client.text_detection(image=image)
    gcp_handwriting_response = vision_client.document_text_detection(
        image=image, image_context=handWriting_image_context)

    print("GCP****************")
    print(gcp_document_text_response.full_text_annotation.text)
    print("GCP****************")

    gcp_document_text = _ascii_only(gcp_document_text_response.full_text_annotation.text)
    gcp_handwriting_text = _ascii_only(gcp_handwriting_response.full_text_annotation.text)
    # Plain text detection has its own response; reading the handwriting
    # response here would just duplicate gcp_handwriting_text.
    gcp_text = _ascii_only(gcp_text_response.full_text_annotation.text)

    # Test the extracted texts themselves: a response object is never None
    # or equal to "", so testing the responses would always pass.
    if not (local_text or gcp_document_text or gcp_handwriting_text or gcp_text):
        print("no text extracted for {0}; skipping".format(blob_name))
        continue

    json_dicts.append({
        'school': result['school'],
        'description': result['description'],
        'school_class': result['school_class'],
        'local_text': local_text,
        'gcp_document_text': gcp_document_text,
        'gcp_handwriting_text': gcp_handwriting_text,
        'gcp_text': gcp_text,
        'id': str(result.key.id_or_name),
    })

print("the dictionary ")
print("the dictionary ")
print("the dictionary ")
print(json_dicts)
# Wrap the collected per-exam dicts under a single 'entities' key and POST
# them as a JSON string to insert_document_endpoint. Nothing is sent when no
# exam produced a payload.
json_dict = {'entities': json_dicts}
if json_dicts:
    response = requests.post(insert_document_endpoint, data=json.dumps(json_dict))
    # Call form of print: identical output on Python 2 and valid on Python 3.
    print(response)
    if not response.ok:
        # Surface the backend's error body so a failed insert is diagnosable.
        print(response.text)