🎉 Initial Commit

rango-tools · Oct 5, 2022 · 5e57463 · 5e57463
commit 5e57463
Show file tree

Hide file tree

Showing 6 changed files with 336 additions and 0 deletions.
diff --git a/.env.example b/.env.example
@@ -0,0 +1,2 @@
+APARAT_USERNAME=''
+APARAT_PASSWORD=''
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,57 @@
+# See http://help.github.com/ignore-files/ for more about ignoring files.
+
+# compiled output
+/dist
+/tmp
+/out-tsc
+# Only exists if Bazel was run
+/bazel-out
+
+# dependencies
+/node_modules
+
+# profiling files
+chrome-profiler-events*.json
+speed-measure-plugin*.json
+
+# IDEs and editors
+/.idea
+.project
+.classpath
+.c9/
+*.launch
+.settings/
+*.sublime-workspace
+
+# IDE - VSCode
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+.history/*
+
+# misc
+/.sass-cache
+/connect.lock
+/coverage
+/libpeerconnection.log
+npm-debug.log
+yarn-error.log
+testem.log
+/typings
+
+# System Files
+.DS_Store
+Thumbs.db
+
+# Just For This Project
+/__pycache__
+/*/__pycache__
+*.jpg
+*.txt
+*.jpeg
+/.env
+/.venv
+/*.db
+/downloads/*/*
diff --git a/.vscode/extensions.json b/.vscode/extensions.json
@@ -0,0 +1,5 @@
+{
+    "recommendations": [
+        "alexcvzz.vscode-sqlite"
+    ]
+}
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2022, Rango Tools
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
@@ -0,0 +1,20 @@
+# Pornhub Crawler CLI
+🚨 This Repository Is Developed JUST For Scientific Purposes 🚨
+This is a simple CLI to get videos from [pornhub](https://www.pornhub.com/)
+
+
+## Installation
+
+### Prerequisites
+First, install `Python v3.x` or later; after that, set up the `chrome driver` for the Selenium library.
+
+```bash
+git clone https://github.com/rango-tools/pornhub-crawler-cli
+```
+
+### Usage
+To use this repository, you just need to run this command:
+
+```text
+python pornhub.py video [--download=<folder-name>] [--browser] [--login] [--socks5=<proxy-addr>] [--spliter=<second>] <videoID>
+```
diff --git a/aparat.py b/aparat.py
@@ -0,0 +1,223 @@
+# Module metadata.
+__author__ = "Amirhossein Douzendeh Zenoozi"
+# NOTE(review): the LICENSE file added in the same commit is BSD 3-Clause,
+# not MIT - confirm which licence applies.
+__license__ = "MIT"
+__version__ = "1.0"
+# NOTE(review): nothing in this file reads __proxy__ - presumably a leftover
+# toggle; confirm before removing.
+__proxy__ = False
+# Usage text: main() hands this string to docopt, which parses it to build
+# the command-line interface, so its wording is part of the program.
+__doc__ = """
+Aparat CLI Crawler
+Usage:
+    aparat.py archive [--browser] [--page=<page-number>] <url>
+    aparat.py video [--browser] <videoID>
+    aparat.py -h | --help
+    aparat.py -v | --version
+
+------------------------------------------------------------------
+
+Options:
+    --page=<page-number>        Total Pages.
+    --browser                   Showing Browser if You Need.
+    -h --help                   Show this screen.
+    -v --version                Show version.
+"""
+
+from selenium import webdriver
+from selenium.common import exceptions
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from docopt import docopt
+from tqdm import tqdm
+
+import os
+import sqlite3
+import json
+import time
+
+class AparatCrawler:
+    def __init__(self, **kwargs):
+        # DataBase Connection Config
+        self.userAgent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
+        self.showBrowser = kwargs.get('showBrowser', True)
+        self.youtubeDlOptions = {}
+        self.dataBaseConnection = sqlite3.connect(f'aparat.db')
+
+        try:
+            if (not os.path.exists('un_proccessed.txt')):
+                f = open('un_proccessed.txt', 'w')
+                f.close()
+        except Exception as error:
+            print(error)
+            pass
+
+        try:
+            self.dataBaseConnection.execute('''CREATE TABLE aparat_videos
+                (id INTEGER PRIMARY KEY AUTOINCREMENT,
+                video_id CHAR(50),
+                categories_list TEXT NOT NULL,
+                view_count INTEGER,
+                video_title TEXT NOT NULL)''')
+        except sqlite3.Error as error:
+            print(error)
+            pass
+
+        # Selenium Driver Options
+        self.driverOption = webdriver.ChromeOptions()
+        self.driverOption.add_argument(f'user-agent={self.userAgent}')
+        self.driverOption.add_argument('log-level=3')
+
+        if( not self.showBrowser ):
+            self.driverOption.add_argument('headless')
+
+        self.driver = webdriver.Chrome(options=self.driverOption)
+        self.driver.maximize_window()
+
+    def process_archive_page( self, archiveUrl, toPage ):
+        archiveUrl = f'{archiveUrl}'
+        self.driver.get(archiveUrl)
+        totalVideos = []
+        videosWrapperElem = []
+
+
+        for pageNumber in range(toPage or 1):
+            # Get Single Archive Page Video ID's
+            videosWrapperElem = [v.get_attribute('data-uid') for v in self.driver.find_elements(By.CSS_SELECTOR, 'div.thumbnail-video') if str(v.get_attribute('data-uid')) not in totalVideos]
+            for videoID in videosWrapperElem:
+                if (not self.is_video_processed(videoID) and not self.is_video_saved_in_unProccessed(videoID)):
+                    self.insert_video_to_unProccessed(videoID)
+                    self.infinite_scroll(5, 1)
+            totalVideos = totalVideos + videosWrapperElem
+
+    def process_un_processed_file(self):
+        with open("un_proccessed.txt", "r") as fp:
+            unProccessedVideos = fp.readlines()
+
+        for line in tqdm(unProccessedVideos):
+            videID = line.strip("\n")
+            if self.process_single_video(videID):
+                self.remove_video_from_unProccessed(videID)
+
+    def process_single_video( self, videoID ):
+        if (not self.is_video_processed(videoID)):
+            generatedVideoUrl = f'https://www.aparat.com/v/{videoID}'
+            self.driver.get(generatedVideoUrl)
+
+            # Empty Lists
+            videoTitle = ''
+            viewCount = ''
+            videoCategories = []
+
+            try:
+                FollowBtn = WebDriverWait(self.driver, 20).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'button.button.follow.add-button')))
+
+                videoTitle = self.driver.find_element(By.CSS_SELECTOR, 'h1.single-details__title').text
+                viewCount = int(self.driver.find_element(By.CSS_SELECTOR, 'div.single-details__view > span.view-text').text.replace(',', ''))
+
+                for cat in self.driver.find_elements(By.CSS_SELECTOR, 'div.item-tag.video-tag a'):
+                    videoCategories.append(cat.text) 
+
+                self.insert_video_details_to_database(videoID, json.dumps(videoCategories), viewCount, videoTitle)
+
+                return True
+
+            except exceptions.TimeoutException:
+                print("=========== TimeOut Getting Element! ===========")
+        else:
+            print(f'=========== This Video is already proccessed! ===========')
+
+        return False
+
+    def is_video_processed( self, videoID ):
+        database_record = self.dataBaseConnection.execute("""SELECT * FROM aparat_videos WHERE video_id = (?) LIMIT 1""", (videoID,)).fetchone()
+        return database_record
+
+    def is_video_saved_in_unProccessed(self, videoID):
+        with open("un_proccessed.txt", "r") as fp:
+            unProccessedVideos = fp.readlines()
+            for line in unProccessedVideos:
+                if line.strip("\n") == videoID:
+                    return True
+                else:
+                    return False
+
+    def insert_video_details_to_database( self, videoID, categoriesList, viewCount, videoTitle ):
+        try:
+            self.dataBaseConnection.execute("""INSERT INTO aparat_videos (video_id, categories_list, view_count, video_title) VALUES (?, ?, ?, ?)""", (videoID, categoriesList, viewCount, videoTitle))
+            self.dataBaseConnection.commit()
+        except sqlite3.Error as error:
+            print(error)
+            pass
+
+    def insert_video_to_unProccessed(self, videoID):
+        with open("un_proccessed.txt", "a+") as fp:
+            # Move read cursor to the start of file.
+            fp.seek(0)
+            # If file is not empty then append '\n'
+            data = fp.read(100)
+            if len(data) > 0 :
+                fp.write("\n")
+
+            # Append text at the end of file
+            fp.write(videoID)
+
+    def remove_video_from_unProccessed(self, videoID):
+        with open("un_proccessed.txt", "r") as input:
+            input.seek(0)
+            with open("temp_un_proccessed.txt", "w") as output:
+                # iterate all lines from file
+                for line in input:
+                    # if text matches then don't write it
+                    if line.strip("\n") != videoID:
+                        output.write(line)
+                    else:
+                        print(line.strip("\n"), videoID)
+
+
+        # replace file with original name
+        os.replace('temp_un_proccessed.txt', 'un_proccessed.txt')
+
+    def infinite_scroll(self, timeout, counte):
+        scrollPauseTime = timeout
+
+        # Get scroll height
+        lastHeight = self.driver.execute_script("return document.body.scrollHeight")
+        loopIndex = 0
+
+        while ( loopIndex <= counte ):
+            # Scroll down to bottom
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            # Wait to load page
+            time.sleep( scrollPauseTime )
+            # Calculate new scroll height and compare with last scroll height
+            newHeight = self.driver.execute_script("return document.body.scrollHeight")
+            if newHeight == lastHeight:
+                # If heights are the same it will exit the function
+                break
+            lastHeight = newHeight
+
+            # Make Infinite Loop
+            if ( counte != 0 ):
+                loopIndex += 1
+
+    def close_driver( self ):
+        self.dataBaseConnection.close()
+        self.driver.close()
+        self.driver.quit()
+
+def main():
+    arguments = docopt(__doc__, version='v1.0')
+    pageNumber = int(arguments['--page'])
+    showBrowser = arguments['--browser']
+    pageUrl = arguments['<url>']
+    videoID = arguments['<videoID>']
+
+    aparat = AparatCrawler(showBrowser=showBrowser)
+
+    if ( arguments['video'] ):
+        aparat.process_single_video( videoID=videoID )
+    elif ( arguments['archive'] ):
+        aparat.process_archive_page( pageUrl, toPage=pageNumber )
+        aparat.process_un_processed_file()
+
+    aparat.close_driver()
+
+if __name__ == '__main__':
+    main()