forked from jvshields/coffeepapers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquery.py
executable file
·289 lines (243 loc) · 9.41 KB
/
query.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#!/usr/bin/env python
# coding: utf-8
"""Usage: ./query.py [OPTION]... TITLES
Update website by querying ArXiV for papers with TITLES.
Example: ./query.py -F '%Y-%m-%d' 'Delta Scuti Variables'
Options:
-a, --append Add another paper to the home page
-t, --test Test run, no file changes are made
-b, --backup Create backup of current data files
-d, --date=DATE Set the date as the string DATE
default is next Tuesday or Thursday
-F, --format=FORMAT Use FORMAT as the datetime format
-m, --method=METHOD Search ArXiV by METHOD
Options include the following:
ti: Title (default)
au: Author
abs: Abstract
co: Comment
jr: Journal Reference
cat: Subject Category
rn: Report Number
id: Id
all: All of the above
-h, --help This message"""
import sys
import os
import json
import getopt
import tarfile
from datetime import datetime, timedelta
import urllib.request as libreq
import feedparser
PAPERS = "papers.js"
OLD_PAPERS_JS = "old_papers.js"
OLD_PAPERS_JSON = "old_papers.json"
SCHEDULE = [1, 3] # Tuesday and Thursday iso time - see https://www.educative.io/answers/what-is-the-datetime-dateweekday-function-in-python
def make_backup():
backup_name = "papersbackup_{}.tar.gz".format(datetime.today().isoformat())
with tarfile.open(backup_name, "w") as compressed_backup:
for filename in (PAPERS, OLD_PAPERS_JS, OLD_PAPERS_JSON):
if os.path.isfile(filename):
compressed_backup.add(filename)
print("Created backup of current papers files in {}".format(backup_name))
def cleanhtml(raw_html):
cleaned_html = raw_html.translate(
raw_html.maketrans(
{"\\": None, "\n": None, "\t": " ", "'": "’", '"': "“"}
)
)
return cleaned_html
def determine_similarity(string1, string2):
"""determine the cosine similarity between strings"""
char_dicts = ({}, {})
strings = (string1.upper().split(), string2.upper().split())
for char_dict, string in zip(char_dicts, strings):
for word in string:
if word not in char_dict:
char_dict[word] = 0
char_dict[word] += 1
all_words = sorted(
list(set(list(char_dicts[0].keys()) + list(char_dicts[1].keys())))
)
vectors = [
[char_dict[word] if word in char_dict else 0 for word in all_words]
for char_dict in (char_dicts)
]
mags = [sum(v**2 for v in vector) ** 0.5 for vector in vectors]
similarity = sum(v1 * v2 / (mags[0] * mags[1]) for v1, v2 in zip(*vectors))
return similarity
def user_select_alternatives(querytitle, feed):
if len(feed) == 0:
return None
titles = [cleanhtml(data.title) for data in feed]
similarities = [
determine_similarity(cleanhtml(querytitle), title) for title in titles
]
options = zip(titles, similarities)
order = sorted(range(len(feed)), key=lambda i: similarities[i])[::-1]
print(f"Found {len(feed)} Similar Articles:")
print("{:<6s} | {:<5s} | {:<6s}".format("Index", "Score", "Title"))
for j, i in enumerate(order):
print(f"{j:<6d} | {similarities[i]:<1.3f} | {titles[i]:<6s}")
answer = "."
while answer.upper() != "Q":
answer = input(
"Please select an article index to you instead [0-9/q(uit)]:"
)
if answer in map(str, range(10)):
print(f"You've selected: {titles[order[int(answer)]]}")
return feed[order[int(answer)]]
return None
def query(querytitle, share_date, method="ti"):
"""
method: Search method, see https://arxiv.org/help/api/basics
default is title
"""
if method == "ti":
num_entries = 10
else:
num_entries = 0
queryfixed = querytitle.replace(" ", "+").replace("_", "+AND+ti:")
base_url = "http://export.arxiv.org/api/query?search_query={}:".format(
method
)
q = base_url + queryfixed + f"&start=0&max_results={num_entries}"
with libreq.urlopen(q) as url:
r = url.read()
feed = feedparser.parse(r)
assert (
feed.entries
), f"Query failed to find '{querytitle}' using search method '{method}'"
data = feed.entries[0]
uncleantitle = data.title
title = cleanhtml(uncleantitle)
if method == "ti": # Verify the title is what was requested
similarity = determine_similarity(cleanhtml(querytitle), title)
if similarity < 0.9:
print(
"The title returned by the query seems to differ from the title requested"
)
print(f" Requested Title: '{cleanhtml(querytitle)}'")
print(f" Found Title: '{title}'")
print(f"Cosine similarity: {similarity:.3f}")
print("------------------------------------")
while True:
response = input(
"Would you like to select this article? [Y/n]:"
)
if response.upper() == "Y":
break
elif response.upper() == "N":
data = user_select_alternatives(
querytitle, feed.entries[1:]
)
if data is None:
print("No changes were applied... Exiting")
exit(1)
else:
uncleantitle = data.title
title = cleanhtml(uncleantitle)
break
authors = ", ".join(author.name for author in data.authors).replace(
"'", "’"
)
link = data.link
for link2 in data.links:
if link2.rel == "alternate":
pass
elif link2.title == "pdf":
pdf = link2.href
dictionary = {
"title": title,
"authors": authors,
"url": link,
"pdf": pdf,
"date": share_date, # Date queried, not necessarily posted
}
return dictionary
def main(titles, share_date, method="ti", test_mode=False, append=False):
if test_mode:
print("Running in Test Mode: No Files will be written")
new_papers = [query(title, share_date, method) for title in titles]
current_papers = []
if append:
with open(PAPERS, "r") as f:
current_papers = json.loads(
f.read().strip().lstrip(r"data='").strip(r"'")
)
if test_mode:
print("Sample Output:")
print(PAPERS)
print("---------")
print(json.dumps(new_papers + current_papers).join([r"data='", r"'"]))
print("---------")
return 0
with open(PAPERS, "w") as outfile:
outfile.write(
json.dumps(new_papers + current_papers).join([r"data='", r"'"])
)
# The following loads and updates the archive json file
# It's annoying and messy tbh. Python's JSON reader and the html one
# are using different formats. I keep a file "old_papers.json" that is a
# cumulative list of all past and current papers. We load and dump to that,
# then use the updated data to prep "old_papers.js" for the html script.
# It works but isn't very clean.
# Make sure to check "old_papers.json" exists.
all_papers = (
new_papers # Build the list of all previous and current papers
)
if os.path.isfile(OLD_PAPERS_JSON): # if old_papers.json exists, append
with open(OLD_PAPERS_JSON, "r") as f:
all_papers += json.load(f)
# write out our new database of old papers
with open(OLD_PAPERS_JSON, "w") as f_json, open(
OLD_PAPERS_JS, "w"
) as f_js:
json.dump(all_papers, f_json)
f_js.write(json.dumps(all_papers).join([r"data='", r"'"]))
if __name__ == "__main__":
# The last arg is the date it'll be shared.
# Any format, just a string. "Tu Oct 12" etc is fine.
# We have to pass the last arg as the date though, or it'll mess up.
optlist, args = getopt.getopt(
sys.argv[1:],
"atbd:m:hF:",
["append", "test", "backup", "help", "date=", "format=", "method="],
)
method = "ti"
share_date = None
test_mode = False
append = False
format_string = "%a %b %d, %Y"
for opt, value in optlist:
if opt in ("-b", "--backup"):
make_backup()
if not args:
exit(0)
elif opt in ("-d", "--date"):
share_date = value
elif opt in ("-F", "--format"):
format_string = value
elif opt in ("-m", "--method"):
method = value
elif opt in ("-h", "--help"):
print(__doc__)
exit(0)
elif opt in ("-t", "--test"):
test_mode = True
elif opt in ("-a", "--append"):
append = True
if share_date is None:
today = datetime.today() + timedelta(hours=13, minutes=30)
weekend = 7 - today.weekday()
next_coffee = min(
(weekend + SCHEDULE[0]) % 7, (weekend + SCHEDULE[1]) % 7
)
share_date = (
today.replace(hour=10, minute=30, second=0, microsecond=0)
+ timedelta(days=next_coffee)
).strftime(format_string)
titles = args
assert titles, "Provide 1 or more paper titles"
main(titles, share_date, method, test_mode, append)