-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
33 lines (29 loc) · 1.05 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from bs4 import BeautifulSoup as bs
from langdetect import detect
import requests
import time
def get_quotes(num, category):
    """Fetch one page of Goodreads quotes for a tag and pass them on for saving.

    Args:
        num: Page number, sent as the ``page`` query parameter.
        category: Tag name, used as the last path segment of
            ``https://www.goodreads.com/quotes/tag/<category>``.

    Every ``<div class="quoteText">`` element found in the response is handed
    to ``write_to_file``. A response with an HTTP error status is reported on
    stdout and skipped instead of being parsed.

    Raises:
        requests.RequestException: if the connection fails or times out.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    from urllib.parse import quote

    # Percent-encode the tag so names containing spaces or reserved
    # characters still form a valid path segment.
    url = 'https://www.goodreads.com/quotes/tag/' + quote(category, safe='')

    # Without a timeout a single stalled connection would hang the whole run.
    req = requests.get(url, params={'page': num}, timeout=30)
    if not req.ok:
        # An error page contains no quotes; say so rather than silently
        # parsing it.
        print('SKIPPING PAGE:', num, 'CATEGORY:', category,
              'STATUS:', req.status_code)
        return

    soup = bs(req.text, 'html.parser')
    write_to_file(soup.find_all('div', class_="quoteText"))
def write_to_file(quotes):
    """Append short English quotes to ``quotes.txt``, one per line.

    Args:
        quotes: Iterable of parsed elements exposing ``get_text()``.

    For each element the stripped text is cut from index 1 up to six
    characters before the first '―' (horizontal bar). The result is written
    only if it is non-empty, is not the placeholder ``' ... '``, has fewer
    than 30 whitespace-separated words, and ``langdetect`` classifies it as
    English. Elements without a usable '―' marker are skipped.
    """
    # Local import: langdetect is already a dependency of this file; only the
    # exception class is new here.
    from langdetect.lang_detect_exception import LangDetectException

    # The context manager guarantees the handle is closed even if a quote
    # fails midway; UTF-8 is explicit because the text contains non-ASCII
    # punctuation that a platform default encoding may not be able to encode.
    with open("quotes.txt", "a", encoding="utf-8") as f:
        for q in quotes:
            string = q.get_text().strip()
            # NOTE(review): the offset 6 presumably covers the closing
            # quotation mark plus the whitespace Goodreads emits before the
            # attribution dash -- confirm against the live markup.
            end = string.find('―') - 6
            if end <= 0:
                # Marker missing (find() returned -1) or too close to the
                # start: a negative slice end would yield garbage text.
                continue

            sliced_text = string[1:end]
            if sliced_text in ('', ' ... '):
                continue
            if len(sliced_text.split()) >= 30:
                continue

            try:
                language = detect(sliced_text)
            except LangDetectException:
                # langdetect raises on text with no detectable features
                # (e.g. only punctuation or digits); skip instead of aborting.
                continue

            if language == 'en':
                f.write(sliced_text + "\n")
def start_scrape(categories, sleep_time=5, max_pages=70):
    """Scrape pages 1..max_pages of every tag in ``categories``.

    Args:
        categories: Iterable of tag names; each is passed to ``get_quotes``.
        sleep_time: Seconds to pause after each page request.
        max_pages: Number of pages to request per tag. Defaults to 70, the
            value that was previously hard-coded.

    Progress is printed to stdout before each page is fetched.
    """
    for category in categories:
        for n in range(1, max_pages + 1):
            print('WE ARE ON PAGE:',n, 'CATEGORY:', category)
            get_quotes(n, category)
            # Pause between requests so the site is not hit back-to-back.
            time.sleep(sleep_time)
# Script entry: scrape both tags, pausing 10 seconds after every page request.
start_scrape(["philosophy", "inspiration"], sleep_time=10)