-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathprepare_step2.py
executable file
·110 lines (99 loc) · 2.85 KB
/
prepare_step2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/env python2
from bs4 import BeautifulSoup
from datetime import datetime
import re, random
# Pool of moon names. random_moon() draws one of these at random and turns it
# into a "#<name>" hashtag. Kept in alphabetical order, one initial per row.
moons = [
    "Adrastea", "Aitne", "Amalthea", "Ananke", "Aoede", "Arche", "Autonoe",
    "Callirrhoe", "Callisto", "Carme", "Carpo", "Chaldene", "Cyllene",
    "Elara", "Erinome", "Euanthe", "Eukelade", "Euporie", "Europa", "Eurydome",
    "Ganymede",
    "Harpalyke", "Hegemone", "Helike", "Hermippe", "Herse", "Himalia",
    "Io", "Iocaste", "Isonoe",
    "Kale", "Kallichore", "Kalyke", "Kore",
    "Leda", "Lysithea",
    "Megaclite", "Metis", "Mneme",
    "Orthosie",
    "Pasiphae", "Pasithee", "Praxidike",
    "Sinope", "Sponde",
    "Taygete", "Thebe", "Thelxinoe", "Themisto", "Thyone",
]
def random_moon(matchobj):
    """Return a hashtag made from one randomly sampled entry of ``moons``.

    The function has the shape of a ``re.sub`` replacement callback:
    ``matchobj`` is accepted but never inspected, so whatever was matched is
    discarded and replaced wholesale by ``"#" + <moon name>``.
    """
    (picked,) = random.sample(moons, 1)
    return "#" + picked
def random_mayo(matchobj):
    """Occasionally swap a matched token for ``#spider`` or ``#mayo``.

    Draws an integer in ``[0, 100]``; when it is below 10 the match is
    replaced by one of the two fixed hashtags (chosen at random), otherwise
    the original matched text (``matchobj.group(0)``) is returned unchanged.
    """
    roll = random.randint(0, 100)
    if roll >= 10:
        # Common case: leave the matched text exactly as it was.
        return matchobj.group(0)
    substitutes = ["#spider", "#mayo"]
    return random.sample(substitutes, 1)[0]
def main():
    """Extract paragraph text from ``prepared.html`` and scrub it.

    Pipeline, in order:

    1. Parse ``prepared.html`` with BeautifulSoup (lxml parser), take the text
       of every ``<p>`` tag, drop non-ASCII characters, and join the results
       with newlines. This intermediate text is saved to ``prepared2.html``.
    2. Replace database-related names (optionally prefixed with ``@``,
       case-insensitive) with ``Oleg``.
    3. Replace every year from 2000 up to (but not including) the current
       year with the current year.
    4. Replace each remaining ``@word`` with a random moon hashtag
       (``random_moon``).
    5. Pass every ``#word`` -- including the ones produced in step 4 --
       through ``random_mayo``.
    6. Remove http/https URLs.

    The final text is written to ``prepared3.txt``. Two progress counts are
    printed to stdout along the way.
    """
    # ``with`` guarantees the handle is released even if parsing raises.
    with open("prepared.html") as prepared_html:
        soup = BeautifulSoup(prepared_html, "lxml")

    all_ps = soup.find_all("p")
    # Single-argument parenthesized print behaves the same as the print
    # statement on Python 2.
    print("Number of <p> tags found: {}".format(len(all_ps)))
    all_lines = [x.get_text().encode("ascii", errors="ignore") for x in all_ps]
    print("Number of lines found: {}".format(len(all_lines)))
    all_text = "\n".join(all_lines)

    # Save intermediate step
    with open("prepared2.html", "w") as output:
        output.write(all_text)

    to_replace = ["@?postgres", "@?mongo", "@?couch", "@?riak", "@?basho", "@?level",
                  "@?cassandra", "@?mysql", "@?maria", "@?redis", "@?rethink", "@?inno",
                  "@?volt", "@?datomic"]
    # Applied one pattern at a time, in list order: an earlier replacement can
    # create text that a later pattern matches, so the order is part of the
    # behavior and the patterns are deliberately not merged into one regex.
    for item in to_replace:
        regex = re.compile(item, re.IGNORECASE)
        all_text = regex.sub("Oleg", all_text)  # This is Oleg and not OlegDB for a reason

    # Loop-invariant: compute and format the current year once.
    current_year = datetime.now().year
    current_year_str = "{}".format(current_year)
    for year in range(2000, current_year):
        # Plain substring replacement; the search text is only digits, so no
        # regex (or case-insensitivity) is needed. Note this also rewrites
        # the digits when they occur inside a longer number.
        all_text = all_text.replace("{}".format(year), current_year_str)

    # Remaining @mentions become random moon hashtags.
    regex = re.compile(r"@[\w]+")
    all_text = regex.sub(random_moon, all_text)

    # Hashtags (including the moon ones just inserted) are occasionally
    # swapped out by random_mayo.
    regex = re.compile(r"#[\w]+")
    all_text = regex.sub(random_mayo, all_text)

    # Strip URLs. The pattern may also consume up to two characters directly
    # in front of the URL (one of ``? : ( ^`` or whitespace, then any one
    # character).
    regex = re.compile(r"[\?:\(\s^]?.?https?:\/\/[\w\.\/\)\?\d&=-]*")
    all_text = regex.sub("", all_text)

    with open("prepared3.txt", "w") as output:
        output.write(all_text)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()