-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpfootprint-generate-jsons.py
executable file
·140 lines (129 loc) · 4.96 KB
/
pfootprint-generate-jsons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
''' SCRIPT TO CREATE JSON FILES USING IBM WATSON
ARGUMENTS:
-d: project directory (required)
-l: language (default is 'en'), see IBM docs for supported languages
-u: url (optional)
This script finds all .txt files in a directory (and its subdirectories)
and creates a "json-footprints" folder with .json files generated by IBM Watson
(based on the .txt files). IBM Watson truncates queries with text files
larger than 50kb; a workaround is to upload your files to a server and to
pass their url with the -u argument instead (limit is 600kb).
Please update the USERNAME and PASSWORD variables with your IBM Watson details.'''
import sys
import os.path
import getopt
import json
sys.path.append(os.path.join(os.getcwd(), '..'))
import watson_developer_cloud
import watson_developer_cloud.natural_language_understanding.features.v1 as features
# Credentials handed to the Natural Language Understanding client. Both are
# empty by default and have to be filled in before the script is run.
USERNAME = "" # IBM Watson username
PASSWORD = "" # IBM Watson password
# Name of the sub-folder, created inside the project directory, that receives
# the generated .json files.
DIR_JSON = "json-footprints"
def get_input(argv):
    """Parse the command-line arguments of the script.

    Args:
        argv: argument list without the program name (e.g. ``sys.argv[1:]``).

    Returns:
        A ``(directory, language, url)`` tuple. ``language`` defaults to
        ``'en'`` and ``url`` to ``None`` when they are not supplied.

    Raises:
        SystemExit: with status 2 (after printing the usage line) when the
            options cannot be parsed or the mandatory ``-d`` directory is
            missing; with status 0 after ``-h``.
    """
    usage = 'pfoot.py -d <dir> -l <language> -u <url>'
    directory = None
    url = None
    language = 'en'
    try:
        opts, _ = getopt.getopt(argv, "hd:l:u:",
                                ["dir=", "language=", "url="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            url = arg
        elif opt in ("-l", "--language"):
            language = arg
    if directory is None:
        # Falling through here used to return None implicitly, which made the
        # caller fail with a TypeError when indexing the result. Treat a
        # missing directory like any other usage error instead.
        print(usage)
        sys.exit(2)
    return directory, language, url
# CALL WATSON WITH TEXT FILE
def call_watson(text, language):
    """Analyze raw text with Watson Natural Language Understanding.

    Two separate requests are sent, one asking for entities and one asking
    for keywords, each with emotion and sentiment enabled.

    Args:
        text: the text to analyze.
        language: language code forwarded to the service.

    Returns:
        An ``(entities, keywords)`` tuple holding the two responses.
    """
    client = watson_developer_cloud.NaturalLanguageUnderstandingV1(
        version='2017-02-27',
        username=USERNAME,
        password=PASSWORD)
    responses = []
    # Entities first, keywords second: callers index the result positionally.
    for feature_cls in (features.Entities, features.Keywords):
        requested = [feature_cls(emotion=True, sentiment=True)]
        responses.append(client.analyze(text=text,
                                        language=language,
                                        features=requested))
    return tuple(responses)
# GET WATSON WITH URL
def call_watson_url(url, language):
    """Analyze the content behind a URL with Watson Natural Language Understanding.

    Two separate requests are sent, one asking for entities and one asking
    for keywords, each with emotion and sentiment enabled.

    Args:
        url: address of the document the service should fetch and analyze.
        language: language code forwarded to the service.

    Returns:
        An ``(entities, keywords)`` tuple holding the two responses.
    """
    service = watson_developer_cloud.NaturalLanguageUnderstandingV1(
        version='2017-02-27',
        username=USERNAME,
        password=PASSWORD)

    def analyze_with(feature):
        # One request per feature, always against the same url and language.
        return service.analyze(url=url, language=language, features=[feature])

    entity_response = analyze_with(
        features.Entities(emotion=True, sentiment=True))
    keyword_response = analyze_with(
        features.Keywords(emotion=True, sentiment=True))
    return entity_response, keyword_response
def _write_footprints(results, base_name):
    """Dump the (entities, keywords) pair to <base_name>-0.json and -1.json."""
    for index, payload in enumerate(results[:2]):
        target = os.path.join(DIR, DIR_JSON,
                              '%s-%d.json' % (base_name, index))
        # "with" closes the output file even when json.dump raises.
        with open(target, "w") as handle:
            json.dump(payload, handle)


args = get_input(sys.argv[1:])  # read command arguments
if args is None:
    # No usable arguments came back (no -d given): stop here instead of
    # failing with a TypeError while unpacking.
    sys.exit(2)
DIR, language, URL = args

try:
    os.makedirs(os.path.join(DIR, DIR_JSON))
except OSError:
    # An already existing output folder is fine; anything else (permissions,
    # a regular file in the way, ...) must not be silently ignored.
    if not os.path.isdir(os.path.join(DIR, DIR_JSON)):
        raise

if URL is None:
    # OPEN ALL .TXT FILES
    # NOTE: output names are derived from the file name only, so .txt files
    # sharing a name in different sub-directories overwrite each other.
    for root, _dirs, files in os.walk(DIR):
        for name in sorted(files):
            if name.startswith('.') or not name.endswith('.txt'):
                continue
            path = os.path.join(root, name)
            if os.path.getsize(path) >= 49000:
                # The second half of this message used to sit on its own line
                # as a bare string expression and was never printed.
                print('%s is too big, please parse url instead' % path)
                continue
            # CREATE JSON FILES USING WATSON API
            with open(path) as source:
                text = source.read()
            results = call_watson(text, language)
            _write_footprints(results, os.path.splitext(name)[0])
            print('%s parsed successfuly' % path)
else:
    # CREATE JSON FILES USING THE URL
    name = os.path.splitext(os.path.basename(URL))[0]
    results = call_watson_url(URL, language)
    _write_footprints(results, name)
    print('url content parsed')