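"""collector.py

Parse locally saved Amazon.in product-listing pages from the data/ directory and
collect product title, original price, discounted price, and link into data.csv.
The selectors below (h2 titles, a-price-whole / a-offscreen price spans) reflect
the markup of the saved pages and may need adjusting if that markup changes.
"""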
from bs4 import BeautifulSoup
import os
import pandas as pd

# Columns for the output CSV; each key collects one field per product.
df = {
    'title': [],
    'orig_p': [],
    'disc_p': [],
    'link': []
}

for filename in os.listdir("data"):
    try:
        with open(f"data/{filename}", 'r', encoding='utf-8') as f:
            html_doc = f.read()
        soup = BeautifulSoup(html_doc, 'html.parser')

        t = soup.find('h2')
        title = t.get_text()  # product title

        l = soup.find('a')
        link = 'https://amazon.in/' + l['href']  # product link

        dp = soup.find('span', attrs={"class": 'a-price-whole'})
        disc_p = dp.get_text() if dp else 0  # discounted (listed) price

        op = soup.find('div', attrs={"class": "a-section aok-inline-block"})
        op2 = op.find('span', attrs={"class": 'a-offscreen'}) if op else None
        orig_p = op2.get_text()[1:] if op2 else 0  # original price, currency symbol stripped

        df['title'].append(title)
        df['link'].append(link)
        df['disc_p'].append(disc_p)
        df['orig_p'].append(orig_p)
    except Exception as e:
        # Skip any saved page that is missing one of the expected elements.
        print(e)

data = pd.DataFrame(data=df)
data.to_csv("data.csv")
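
# Quick sanity check (illustrative): report how many product rows were collected.
print(f"Collected {len(data)} products into data.csv")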