-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
175 lines (152 loc) · 7.16 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# Module metadata: author, license and version of this script.
__author__ = "Amirhossein Douzendeh Zenoozi"
__license__ = "MIT"
__version__ = "1.0"
from openpyxl import load_workbook
from tqdm import tqdm
import random
import time
import requests
import sqlite3
class AparatCrawler:
    """Crawl video and category metadata from aparat.com into SQLite.

    The crawler owns one SQLite connection (``self.db``) holding two tables,
    ``videos`` and ``categories``. Call :meth:`close_db` when finished.

    Optional keyword arguments accepted by the constructor:

    * ``db_path`` -- SQLite database path (default ``'aparat.db'``).
    * ``delay_range`` -- inclusive ``(low, high)`` integer bounds, in seconds,
      for the random pause between page requests (default ``(1, 5)``).
    * ``timeout`` -- per-request timeout in seconds (default ``30``).
    """

    HOME_PAGE_URL = 'https://www.aparat.com/api/fa/v1/video/video/list/tagid/1'
    CATEGORIES_URL = 'https://www.aparat.com/etc/api/categories'
    CATEGORY_VIDEOS_URL = (
        'https://www.aparat.com/etc/api/categoryvideos/perpage/30/cat/{cat_number}'
    )

    # Class-level defaults; __init__ overrides them per instance on request.
    db_path = 'aparat.db'
    delay_range = (1, 5)
    request_timeout = 30

    def __init__(self, **kwargs):
        """Open the database and make sure both tables exist."""
        self.db_path = kwargs.get('db_path', self.db_path)
        self.delay_range = kwargs.get('delay_range', self.delay_range)
        self.request_timeout = kwargs.get('timeout', self.request_timeout)

        # DataBase Connection Config
        self.db = sqlite3.connect(self.db_path)

        table_builders = (
            ('VideoTable', self.create_video_table),
            ('CategoryTable', self.create_categories_table),
        )
        for label, build_table in table_builders:
            try:
                build_table()
            except sqlite3.Error as error:
                print(f'========== {label} Error ==========')
                print(error)
                print(f'========== End of {label} Error ==========')

    def create_video_table(self):
        """Create the ``videos`` table unless it already exists."""
        # IF NOT EXISTS keeps a second run from failing on the existing table.
        self.db.execute('''CREATE TABLE IF NOT EXISTS videos
            (id INTEGER PRIMARY KEY AUTOINCREMENT,
            video_id CHAR(50),
            title TEXT NOT NULL,
            description TEXT,
            username CHAR(50),
            duration INTEGER,
            date DATETIME,
            cat_id INTEGER,
            like_count INTEGER,
            visit_count INTEGER);''')

    def create_categories_table(self):
        """Create the ``categories`` table unless it already exists."""
        self.db.execute('''CREATE TABLE IF NOT EXISTS categories
            (id INTEGER PRIMARY KEY AUTOINCREMENT,
            cat_id CHAR(50),
            name CHAR(50),
            link CHAR(50),
            video_count INTEGER);''')

    def is_video_processed(self, video_id):
        """Return True when a row with ``video_id`` is already stored."""
        row = self.db.execute(
            """SELECT video_id FROM videos WHERE video_id = (?) LIMIT 1""",
            (video_id,),
        ).fetchone()
        return row is not None

    def insert_video_item_to_db(self, video_id, username, description, duration,
                                like_count, visit_count, title, date, cat_id):
        """Insert one video row; database errors are printed, not raised."""
        try:
            # The connection context manager commits on success and rolls
            # back if the statement fails.
            with self.db:
                self.db.execute(
                    """INSERT INTO videos (video_id, username, description, duration, like_count, visit_count, title, date, cat_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                    (video_id, username, description, duration, like_count,
                     visit_count, title, date, cat_id),
                )
        except sqlite3.Error as error:
            print(error)

    def insert_cat_item_to_db(self, cat_id, name, link, video_count):
        """Insert one category row; database errors are printed, not raised."""
        try:
            with self.db:
                self.db.execute(
                    """INSERT INTO categories (cat_id, name, link, video_count) VALUES (?, ?, ?, ?)""",
                    (cat_id, name, link, video_count),
                )
        except sqlite3.Error as error:
            print(error)

    def update_cat_item(self, cat_id, name, link, video_count):
        """Update the stored category ``cat_id``; errors are printed."""
        try:
            with self.db:
                self.db.execute(
                    """UPDATE categories SET name = (?), link = (?), video_count = (?) WHERE cat_id = (?)""",
                    (name, link, video_count, cat_id),
                )
        except sqlite3.Error as error:
            print(error)

    def is_cat_item_exist(self, cat_id):
        """Return True when a category row with ``cat_id`` is already stored."""
        row = self.db.execute(
            """SELECT cat_id FROM categories WHERE cat_id = (?) LIMIT 1""",
            (cat_id,),
        ).fetchone()
        return row is not None

    def get_data(self, url):
        """GET ``url`` and return the decoded JSON body, or None on failure.

        Network errors and undecodable bodies are reported on stdout and
        turned into ``None`` so callers can decide how to continue.
        """
        response = None
        try:
            response = requests.get(url, timeout=self.request_timeout)
            return response.json()
        except (requests.RequestException, ValueError) as error:
            # ``response`` is still None when the request itself failed, so
            # fall back to the exception text instead of a status code.
            detail = response.status_code if response is not None else error
            print('================ Error ================')
            print(f'Error in {detail}')
            return None

    def close_db(self):
        """Close the SQLite connection."""
        self.db.close()

    def _pause(self):
        """Sleep a random whole number of seconds within ``delay_range``."""
        low, high = self.delay_range
        time.sleep(random.randint(low, high))

    def get_home_page(self, page_count):
        """Store unseen videos from up to ``page_count`` home listing pages.

        Crawling stops early when a page cannot be fetched or the payload
        carries no link to a following page.
        """
        url = self.HOME_PAGE_URL
        for _ in tqdm(range(page_count)):
            data = self.get_data(url)
            if not isinstance(data, dict):
                # Without a payload there is no next-page link to follow.
                break

            for record in data.get('included') or []:
                if record['type'] != 'Video':
                    continue
                attributes = record['attributes']
                if self.is_video_processed(attributes['id']):
                    continue
                # Insert Data to DB
                self.insert_video_item_to_db(
                    attributes['id'],
                    attributes['username'],
                    attributes['description'],
                    attributes['duration'],
                    attributes['like_cnt'],
                    attributes['visit_cnt_int'],
                    attributes['title'],
                    attributes['sdate_rss'],
                    attributes['catId'],
                )

            next_url = (data.get('links') or {}).get('next')
            if not next_url:
                break
            url = next_url
            # Set Random Sleep
            self._pause()

    def get_all_categories(self):
        """Fetch the category list and insert or update each category row."""
        data = self.get_data(self.CATEGORIES_URL)
        if not isinstance(data, dict):
            return
        for record in data.get('categories') or []:
            cat_id = record['id']
            name = record['name']
            link = record['link']
            count = record['videoCnt']
            if self.is_cat_item_exist(cat_id):
                self.update_cat_item(cat_id, name, link, count)
            else:
                self.insert_cat_item_to_db(cat_id, name, link, count)

    def get_single_categories(self, cat_number, page=10):
        """Store unseen videos from up to ``page`` pages of one category.

        The category listing carries no description or like count, so the
        placeholders ``'None'`` and ``-1`` are stored for those columns.
        Crawling stops early when a page cannot be fetched or no forward
        paging link is present.
        """
        url = self.CATEGORY_VIDEOS_URL.format(cat_number=cat_number)
        for _ in range(page):
            data = self.get_data(url)
            if not isinstance(data, dict):
                break

            for record in data.get('categoryvideos') or []:
                if self.is_video_processed(record['id']):
                    continue
                # Insert Data to DB
                self.insert_video_item_to_db(
                    record['id'],
                    record['username'],
                    'None',
                    record['duration'],
                    -1,
                    record['visit_cnt'],
                    record['title'],
                    record['create_date'],
                    cat_number,
                )

            forward_url = (data.get('ui') or {}).get('pagingForward')
            if not forward_url:
                break
            # Collapse the doubled slash before the API path in paging links.
            url = forward_url.replace('//etc/', '/etc/')
            # Set Random Sleep
            self._pause()
def main():
    """Crawl 100 home listing pages, always closing the database afterwards."""
    aparat_crawler = AparatCrawler()
    try:
        aparat_crawler.get_home_page(100)
        # Alternative crawls, kept as usage examples:
        # aparat_crawler.get_all_categories()
        # aparat_crawler.get_single_categories(22, 10)
    finally:
        # Release the SQLite connection even when the crawl raises.
        aparat_crawler.close_db()
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()