-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
50 lines (40 loc) · 1.43 KB
/
scrape.py
File metadata and controls
50 lines (40 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def scrape(no_of_pagedowns=20, site='http://www.buzzfeed.com'):
    """Collect headline text from a page rendered in headless Chrome.

    Starts a virtual display, opens ``site`` in Chrome, sends PAGE_DOWN to the
    page body ``no_of_pagedowns`` times, then gathers the text of every
    ``<h1>`` and ``<h2>`` element.

    Args:
        no_of_pagedowns: Number of PAGE_DOWN key presses to send. Zero or a
            negative value sends none.
        site: URL of the page to load.

    Returns:
        A list of strings, one per heading whose raw text is longer than 15
        characters, stripped of surrounding whitespace and passed through
        ``remove_emoji``. All ``<h1>`` texts come before the ``<h2>`` texts.
    """
    import time
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from pyvirtualdisplay import Display

    display = Display(visible=0, size=(1920, 1080))
    display.start()
    try:
        browser = webdriver.Chrome()
        try:
            browser.get(site)
            time.sleep(1)  # give the page a moment before sending keys

            body = browser.find_element(By.TAG_NAME, "body")
            # range() terminates for any integer count; the previous
            # `while count:` loop never ended for a negative value.
            for _ in range(no_of_pagedowns):
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(0.2)

            headings = (browser.find_elements(By.TAG_NAME, 'h1')
                        + browser.find_elements(By.TAG_NAME, 'h2'))
            # Read .text once per element: every access is a WebDriver call.
            texts = (heading.text for heading in headings)
            lines = [remove_emoji(text.strip()) for text in texts if len(text) > 15]
        finally:
            # Always shut the browser down, even when scraping fails.
            browser.quit()
    finally:
        # Always release the virtual display, even when Chrome fails to start.
        display.stop()
    return lines
def remove_emoji(line):
    """Return ``line`` without characters in U+10000..U+10FFFF, stripped.

    Every code point in the range U+10000 to U+10FFFF is deleted, then
    leading and trailing whitespace is removed from the result. Characters
    below U+10000 are left untouched.

    Credit to Martijn Pieters at https://stackoverflow.com/a/12636588
    """
    import re

    astral_pattern = '[\U00010000-\U0010ffff]'
    without_astral = re.sub(astral_pattern, '', line)
    return without_astral.strip()
def sample_to_file(sample, f = 'sample.txt'):
    """Write each item of ``sample`` to the file ``f``, one per line.

    The file is opened in text write mode, so existing content is replaced.
    Each item is formatted with an f-string and followed by a newline.
    The file is opened without an explicit encoding, so the platform default
    text encoding applies.
    """
    with open(f, 'w') as out:
        out.writelines(f'{entry}\n' for entry in sample)
def sample_from_file(f = 'sample.txt'):
    """Read the file ``f`` and return its lines as a list of strings.

    Each returned line keeps its trailing newline character, exactly as
    ``readlines()`` produces it. The file is opened without an explicit
    encoding, so the platform default text encoding applies.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(f) as workfile:
        return workfile.readlines()
"""def find_object(line):
from spacy.en import English
nlp = English()
doc = nlp(line)
sub_toks = [tok for tok in doc if (tok.dep_ == "nsubj") ]
return sub_toks or list(doc.noun_chunks)[0]"""