Skip to content

Commit

Permalink
🎉 Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
amirzenoozi committed Oct 5, 2022
0 parents commit 5e57463
Show file tree
Hide file tree
Showing 6 changed files with 336 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
APARAT_USERNAME=''
APARAT_PASSWORD=''
57 changes: 57 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# See http://help.github.com/ignore-files/ for more about ignoring files.

# compiled output
/dist
/tmp
/out-tsc
# Only exists if Bazel was run
/bazel-out

# dependencies
/node_modules

# profiling files
chrome-profiler-events*.json
speed-measure-plugin*.json

# IDEs and editors
/.idea
.project
.classpath
.c9/
*.launch
.settings/
*.sublime-workspace

# IDE - VSCode
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
.history/*

# misc
/.sass-cache
/connect.lock
/coverage
/libpeerconnection.log
npm-debug.log
yarn-error.log
testem.log
/typings

# System Files
.DS_Store
Thumbs.db

# Just For This Project
/__pycache__
/*/__pycache__
*.jpg
*.txt
*.jpeg
/.env
/.venv
/*.db
/downloads/*/*
5 changes: 5 additions & 0 deletions .vscode/extensions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"recommendations": [
"alexcvzz.vscode-sqlite"
]
}
29 changes: 29 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
BSD 3-Clause License

Copyright (c) 2022, Rango Tools
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20 changes: 20 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Aparat Crawler CLI
🚨 This Repository Is Developed JUST For Scientific Purposes 🚨
This is a simple CLI to collect video information from [Aparat](https://www.aparat.com/).


## Installation

### Prerequisites
First, install `Python v3.x` or newer; then set up the `chrome driver` for the Selenium library.

```bash
git clone https://github.com/rango-tools/pornhub-crawler-cli
```

### Usage
To use this repository, just run:

```text
python aparat.py archive [--browser] [--page=<page-number>] <url>
python aparat.py video [--browser] <videoID>
```
223 changes: 223 additions & 0 deletions aparat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Module-level metadata for the Aparat crawler command-line script.
__author__ = "Amirhossein Douzendeh Zenoozi"
# NOTE(review): the LICENSE file shipped with this commit is BSD 3-Clause,
# not MIT -- confirm which license actually applies.
__license__ = "MIT"
__version__ = "1.0"
# Not referenced anywhere in the visible code.
__proxy__ = False
# Usage text. main() hands this string to docopt, which parses it to build the
# command-line interface, so its wording and layout are runtime behavior.
__doc__ = """
Aparat CLI Crawler
Usage:
aparat.py archive [--browser] [--page=<page-number>] <url>
aparat.py video [--browser] <videoID>
aparat.py -h | --help
aparat.py -v | --version
------------------------------------------------------------------
Options:
--page=<page-number> Total Pages.
--browser Showing Browser if You Need.
-h --help Show this screen.
-v --version Show version.
"""

from selenium import webdriver
from selenium.common import exceptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from docopt import docopt
from tqdm import tqdm

import os
import sqlite3
import json
import time

class AparatCrawler:
    """Crawl aparat.com with Selenium and store video metadata in SQLite.

    Video IDs discovered on an archive page are queued in a plain-text file
    (one ID per line). Each queued video page is then visited, and its title,
    view count and category list are written to the ``aparat_videos`` table.
    """

    DATABASE_FILE = 'aparat.db'
    UNPROCESSED_FILE = 'un_proccessed.txt'
    TEMP_UNPROCESSED_FILE = 'temp_un_proccessed.txt'

    def __init__(self, **kwargs):
        """Open the database, prepare the queue file and start Chrome.

        Keyword Args:
            showBrowser: When falsy, Chrome is started headless.
                Defaults to True.
        """
        self.userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
        self.showBrowser = kwargs.get('showBrowser', True)
        self.youtubeDlOptions = {}

        # DataBase Connection Config
        self.dataBaseConnection = sqlite3.connect(self.DATABASE_FILE)
        self._ensure_queue_file()
        self._ensure_schema()

        # Selenium Driver Options
        self.driverOption = webdriver.ChromeOptions()
        self.driverOption.add_argument(f'user-agent={self.userAgent}')
        self.driverOption.add_argument('log-level=3')
        if not self.showBrowser:
            self.driverOption.add_argument('headless')

        try:
            self.driver = webdriver.Chrome(options=self.driverOption)
            self.driver.maximize_window()
        except Exception:
            # Do not leak the SQLite connection when the browser cannot start.
            self.dataBaseConnection.close()
            raise

    def _ensure_queue_file(self):
        """Create the queue file if it does not exist yet (best effort)."""
        try:
            # Append mode creates a missing file and never truncates one.
            with open(self.UNPROCESSED_FILE, 'a', encoding='utf-8'):
                pass
        except OSError as error:
            print(error)

    def _ensure_schema(self):
        """Create the ``aparat_videos`` table unless it already exists."""
        try:
            self.dataBaseConnection.execute('''CREATE TABLE IF NOT EXISTS aparat_videos
                (id INTEGER PRIMARY KEY AUTOINCREMENT,
                video_id CHAR(50),
                categories_list TEXT NOT NULL,
                view_count INTEGER,
                video_title TEXT NOT NULL)''')
            self.dataBaseConnection.commit()
        except sqlite3.Error as error:
            print(error)

    def _read_unprocessed_ids(self):
        """Return the queued video IDs in file order, skipping blank lines."""
        try:
            with open(self.UNPROCESSED_FILE, 'r', encoding='utf-8') as fp:
                return [line.strip() for line in fp if line.strip()]
        except FileNotFoundError:
            return []

    def process_archive_page(self, archiveUrl, toPage):
        """Queue every not-yet-known video ID found on an archive page.

        Args:
            archiveUrl: URL of the archive (listing) page to open.
            toPage: Number of scan-then-scroll rounds; a falsy value means 1.
        """
        self.driver.get(archiveUrl)

        seenVideos = set()
        # Load the queue once instead of re-reading the file per candidate.
        queuedVideos = set(self._read_unprocessed_ids())

        for _ in range(toPage or 1):
            # Get Single Archive Page Video ID's
            thumbnails = self.driver.find_elements(By.CSS_SELECTOR, 'div.thumbnail-video')
            for thumbnail in thumbnails:
                videoID = thumbnail.get_attribute('data-uid')
                # Elements without a data-uid cannot be queued.
                if not videoID or videoID in seenVideos:
                    continue
                seenVideos.add(videoID)
                if videoID in queuedVideos or self.is_video_processed(videoID):
                    continue
                self.insert_video_to_unProccessed(videoID)
                queuedVideos.add(videoID)
            self.infinite_scroll(5, 1)

    def process_un_processed_file(self):
        """Process every queued video and drop the finished ones from the queue."""
        for videoID in tqdm(self._read_unprocessed_ids()):
            # A video already stored in the database is also done; without
            # this check it would stay in the queue forever.
            if self.is_video_processed(videoID) or self.process_single_video(videoID):
                self.remove_video_from_unProccessed(videoID)

    def process_single_video(self, videoID):
        """Scrape one video page and store its details.

        Args:
            videoID: Aparat video identifier (the part after ``/v/``).

        Returns:
            True when the details were scraped and saved; False when the video
            was already stored, the page could not be read, or the database
            insert failed.
        """
        if self.is_video_processed(videoID):
            print(f'=========== This Video is already proccessed! ===========')
            return False

        self.driver.get(f'https://www.aparat.com/v/{videoID}')

        try:
            # The follow button becoming visible is used as the "page ready" signal.
            WebDriverWait(self.driver, 20).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, 'button.button.follow.add-button'))
            )
            videoTitle = self.driver.find_element(By.CSS_SELECTOR, 'h1.single-details__title').text
            viewText = self.driver.find_element(By.CSS_SELECTOR, 'div.single-details__view > span.view-text').text
            viewCount = int(viewText.replace(',', ''))
            videoCategories = [
                cat.text
                for cat in self.driver.find_elements(By.CSS_SELECTOR, 'div.item-tag.video-tag a')
            ]
        except exceptions.TimeoutException:
            print("=========== TimeOut Getting Element! ===========")
            return False
        except (exceptions.NoSuchElementException, ValueError) as error:
            # A missing element or an unparsable view count must not abort
            # the whole batch; leave the video queued for a later retry.
            print(error)
            return False

        return self.insert_video_details_to_database(videoID, json.dumps(videoCategories), viewCount, videoTitle)

    def is_video_processed(self, videoID):
        """Return the stored row for ``videoID``, or None when it is not stored."""
        return self.dataBaseConnection.execute(
            """SELECT * FROM aparat_videos WHERE video_id = (?) LIMIT 1""", (videoID,)
        ).fetchone()

    def is_video_saved_in_unProccessed(self, videoID):
        """Return True when ``videoID`` appears anywhere in the queue file."""
        return videoID in self._read_unprocessed_ids()

    def insert_video_details_to_database(self, videoID, categoriesList, viewCount, videoTitle):
        """Insert one video row.

        Returns:
            True when the row was committed, False when SQLite reported an error.
        """
        try:
            self.dataBaseConnection.execute(
                """INSERT INTO aparat_videos (video_id, categories_list, view_count, video_title) VALUES (?, ?, ?, ?)""",
                (videoID, categoriesList, viewCount, videoTitle),
            )
            self.dataBaseConnection.commit()
        except sqlite3.Error as error:
            print(error)
            self.dataBaseConnection.rollback()
            return False
        return True

    def insert_video_to_unProccessed(self, videoID):
        """Append ``videoID`` to the queue file on its own line."""
        # Queue files written by earlier versions may lack a trailing newline;
        # peek at the last byte so the new ID never fuses with the last one.
        needsSeparator = False
        try:
            with open(self.UNPROCESSED_FILE, 'rb') as fp:
                fp.seek(0, os.SEEK_END)
                if fp.tell() > 0:
                    fp.seek(-1, os.SEEK_END)
                    needsSeparator = fp.read(1) != b'\n'
        except FileNotFoundError:
            pass

        with open(self.UNPROCESSED_FILE, 'a', encoding='utf-8') as fp:
            if needsSeparator:
                fp.write('\n')
            fp.write(f'{videoID}\n')

    def remove_video_from_unProccessed(self, videoID):
        """Rewrite the queue file without ``videoID``."""
        remaining = [queued for queued in self._read_unprocessed_ids() if queued != videoID]

        with open(self.TEMP_UNPROCESSED_FILE, 'w', encoding='utf-8') as output:
            output.writelines(f'{queued}\n' for queued in remaining)

        # Swap only after both files are closed so the replace also works on
        # platforms that refuse to replace open files.
        os.replace(self.TEMP_UNPROCESSED_FILE, self.UNPROCESSED_FILE)

    def infinite_scroll(self, timeout, counte):
        """Scroll to the bottom of the page repeatedly to trigger lazy loading.

        Args:
            timeout: Seconds to wait after each scroll for content to load.
            counte: Scroll limit. 0 scrolls until the page height stops
                growing; otherwise at most ``counte + 1`` scrolls are done.
        """
        # Get scroll height
        lastHeight = self.driver.execute_script("return document.body.scrollHeight")
        loopIndex = 0

        while loopIndex <= counte:
            # Scroll down to bottom
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load page
            time.sleep(timeout)
            # Stop as soon as scrolling no longer grows the page.
            newHeight = self.driver.execute_script("return document.body.scrollHeight")
            if newHeight == lastHeight:
                break
            lastHeight = newHeight

            # With counte == 0 the counter never advances, so only the height
            # check above can end the loop.
            if counte != 0:
                loopIndex += 1

    def close_driver(self):
        """Close the database connection and shut the browser down."""
        try:
            self.dataBaseConnection.close()
        finally:
            # quit() closes every window and ends the session, so a separate
            # close() call is not needed.
            self.driver.quit()

def main():
    """Parse the command line and run the requested crawl.

    ``video`` processes a single video ID. ``archive`` queues the IDs found
    on an archive page and then processes the whole queue.

    Raises:
        SystemExit: When ``--page`` is given but is not a positive integer.
    """
    arguments = docopt(__doc__, version='v1.0')
    showBrowser = arguments['--browser']
    pageUrl = arguments['<url>']
    videoID = arguments['<videoID>']

    # --page is optional (and never present for the `video` command), so
    # docopt reports it as None; converting None with int() would crash.
    rawPage = arguments['--page']
    pageNumber = None
    if rawPage is not None:
        try:
            pageNumber = int(rawPage)
        except ValueError:
            raise SystemExit(f'--page must be an integer, got {rawPage!r}')
        if pageNumber < 1:
            raise SystemExit(f'--page must be at least 1, got {pageNumber}')

    aparat = AparatCrawler(showBrowser=showBrowser)
    try:
        if arguments['video']:
            aparat.process_single_video(videoID=videoID)
        elif arguments['archive']:
            aparat.process_archive_page(pageUrl, toPage=pageNumber)
            aparat.process_un_processed_file()
    finally:
        # Always release the browser and the database, even after a failure.
        aparat.close_driver()


if __name__ == '__main__':
    main()

0 comments on commit 5e57463

Please sign in to comment.