## web_scrape_description.py
## import required libraries
import csv
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
## base path
path = "" #add the project base path here
## read the original dataset
original_data_filename = "" #add the original data file (in .csv format) present at the path provided above
original_dataset_path = os.path.join(path, original_data_filename)
data_df = pd.read_csv(original_dataset_path, on_bad_lines='skip') # skip malformed rows (replaces the deprecated error_bad_lines=False)
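## note: the scraping loop below expects a 'Product' column in the input csv; the
## layout sketched here is a hypothetical example, not the actual dataset schema:
##   Product,Brand
##   almond-butter,AcmeFoods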
## clean data: strip surrounding whitespace, then blank out placeholder values
strip_df = data_df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
clean_df = strip_df.replace({'Undeclared': '', 'unknown': ''})
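## minimal sanity check of the cleaning step on a toy frame (hypothetical values);
## uncomment to run:
# toy = pd.DataFrame({'Product': [' Milk ', 'Undeclared']})
# print(toy.applymap(lambda x: x.strip() if isinstance(x, str) else x)
#          .replace({'Undeclared': '', 'unknown': ''}))
# # -> 'Milk' and '' respectively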
## create the output csv (with a header row) only if it does not already exist,
## so that interrupted runs can resume without duplicating the header
csvFile = "webScrape_product_description.csv"
csvPath = os.path.join(path, csvFile)
if not os.path.exists(csvPath):
    with open(csvPath, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(['Product', 'Web_Description'])
## function to scrape the web (google.com) for a short description of each product
def scrape_description(product):
    tot_tries = 11      # maximum number of attempts when rate-limited
    delay = 10800       # wait 3 hours between rate-limited attempts
    hrs = delay / 3600
    prod = product.replace("-", " ")
    while tot_tries >= 1:
        ## let requests url-encode the query instead of concatenating it into the url
        res = requests.get('https://google.com/search', params={'q': prod})
        if res.status_code == 200:
            break
        elif res.status_code == 429:
            if tot_tries == 1:
                res.raise_for_status()
            tot_tries -= 1
            print("HTTPError: 429 - Too Many Requests; will retry in %d hours..." % hrs)
            time.sleep(delay)
        else:
            res.raise_for_status()  # fail fast on any other error status
    soup = BeautifulSoup(res.text, 'html.parser')
    flag = False
    i = 0
    j = 0
    terms = []
    divs = soup.select('div')
    ## find the index of the div that marks the start of the search results
    for d in divs:
        if d.get_text() == 'All results':
            flag = True
            break
        i += 1
    ## collect words from up to 20 divs following the marker
    while flag and i < len(divs) - 1:
        if j > 20:
            break
        i += 1
        j += 1
        for term in divs[i].get_text().split():
            if 'Missing' in term:   # stop at Google's "Missing:" note
                break
            terms.append(term)
    return terms
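## quick sanity check of the scraper (a minimal sketch; 'almond-butter' is a
## hypothetical product name, not from the dataset); uncomment to try one query:
# sample_terms = scrape_description('almond-butter')
# print(' '.join(sample_terms))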
## starter code: scrape a description for every product not already in the output csv
scrape_descr_df = pd.read_csv(csvPath)
for i in range(len(clean_df)):
    product = clean_df.iloc[i]['Product']
    ## if a product name has already been scraped, skip it
    check = scrape_descr_df.apply(lambda x: product in x.values, axis=1).any()
    if not check:
        time.sleep(1)  # delay of 1 second before each web request
        terms = scrape_description(product)
        prod_description = ''
        for word in terms:
            if ".com" in word or "www." in word:
                continue  # drop bare urls from the description text
            prod_description += word + ' '
        scrape_descr_df.loc[i, 'Product'] = product
        scrape_descr_df.loc[i, 'Web_Description'] = prod_description
        ## save after every product so progress survives interruptions
        scrape_descr_df.to_csv(csvPath, encoding='utf-8', index=False)
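## optional post-run check (a minimal sketch): report how many products ended up
## with a non-empty description; uncomment to run after the loop finishes:
# done_df = pd.read_csv(csvPath)
# filled = (done_df['Web_Description'].fillna('').str.strip() != '').sum()
# print('%d of %d products have a description' % (filled, len(done_df)))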