From 39aaf2bf7e22dd9a48ff69246e74d913d25af8cc Mon Sep 17 00:00:00 2001 From: Vasa Date: Tue, 15 May 2018 21:29:55 -0700 Subject: [PATCH] feature to turn on safe search filter as in #99 resolved the bug where time_range filter was ignored as in #93 made some improvements on the IncompleteReadException as in #83 option to exclude numbered ordering in image names as in #100 --- README.rst | 12 ++++++ .../google_images_download.py | 38 ++++++++++++++----- setup.py | 2 +- 3 files changed, 42 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 3e03962e..f0a19b1d 100644 --- a/README.rst +++ b/README.rst @@ -280,6 +280,18 @@ Arguments | | | | | | | The path looks like this: "path/to/chromedriver". In windows it will be "path/to/chromedriver.exe" | +-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+ +| safe_search | sa | Searches for images with the Safe Search filter On | +| | | | +| | | This filter is Off by default if you do not specify the safe_search argument | +| | | | +| | | This argument does not take any value. Just add '--safe_search' or '-sa' in your query. | ++-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+ +| no_numbering | nn | When you specify this argument, the script does not add ordered numbering as a prefix to the images it downloads | +| | | | +| | | If this argument is not specified, the images are numbered in the order in which they are downloaded | +| | | | +| | | This argument does not take any value. Just add '--no_numbering' or '-nn' in your query.
| ++-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+ | help | h | show the help message regarding the usage of the above arguments | +-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+ diff --git a/google_images_download/google_images_download.py b/google_images_download/google_images_download.py index 500fd943..96522566 100755 --- a/google_images_download/google_images_download.py +++ b/google_images_download/google_images_download.py @@ -14,6 +14,7 @@ from urllib.request import URLError, HTTPError from urllib.parse import quote import http.client + from http.client import IncompleteRead http.client._MAXHEADERS = 1000 else: # If the Current Version of Python is 2.x import urllib2 @@ -21,6 +22,7 @@ from urllib2 import URLError, HTTPError from urllib import quote import httplib + from httplib import IncompleteRead httplib._MAXHEADERS = 1000 import time # Importing the time library to check the time of code execution import os @@ -33,11 +35,11 @@ import socket args_list = ["keywords", "keywords_from_file", "prefix_keywords", "suffix_keywords", - "limit", "related_images", "format", "color", "color_type", "usage_rights", "size", + "limit", "format", "color", "color_type", "usage_rights", "size", "exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image", "output_directory", "image_directory", "no_directory", "proxy", "similar_images", "specific_site", "print_urls", "print_size", "print_paths", "metadata", "extract_metadata", "socket_timeout", - "thumbnail", "language", "prefix", "chromedriver"] + "thumbnail", "language", "prefix", "chromedriver", "related_images", "safe_search", "no_numbering"] def user_input(): @@ -104,6 +106,8 @@ def user_input(): parser.add_argument('-px', '--proxy', help='specify a proxy 
address and port', type=str, required=False) parser.add_argument('-cd', '--chromedriver', help='specify the path to chromedriver executable in your local machine', type=str, required=False) parser.add_argument('-ri', '--related_images', default=False, help="Downloads images that are similar to the keyword provided", action="store_true") + parser.add_argument('-sa', '--safe_search', default=False, help="Turns on the safe search filter while searching for images", action="store_true") + parser.add_argument('-nn', '--no_numbering', default=False, help="Allows you to exclude the default numbering of images", action="store_true") args = parser.parse_args() arguments = vars(args) @@ -358,7 +362,7 @@ def build_url_parameters(self,arguments): if arguments['time_range']: json_acceptable_string = arguments['time_range'].replace("'", "\"") d = json.loads(json_acceptable_string) - time_range = '&cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_min'] + time_range = ',cdr:1,cd_min:' + d['time_min'] + ',cd_max:' + d['time_min'] else: time_range = '' @@ -394,7 +398,9 @@ def build_url_parameters(self,arguments): #building main search URL - def build_search_url(self,search_term,params,url,similar_images,specific_site): + def build_search_url(self,search_term,params,url,similar_images,specific_site,safe_search): + #check safe_search + safe_search_string = "&safe=active" # check the args and choose the URL if url: url = url @@ -408,7 +414,12 @@ def build_search_url(self,search_term,params,url,similar_images,specific_site): else: url = 'https://www.google.com/search?q=' + quote( search_term) + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch' + params + '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg' - #print(url) + + #safe search check + if safe_search: + url = url + safe_search_string + + # print(url) return url @@ -539,7 +550,7 @@ def download_image_thumbnail(self,image_url,main_directory,dir_name,return_image # Download Images - def 
download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size): + def download_image(self,image_url,image_format,main_directory,dir_name,count,print_urls,socket_timeout,prefix,print_size,no_numbering): if print_urls: print("Image URL: " + image_url) try: @@ -574,7 +585,10 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri else: prefix = '' - path = main_directory + "/" + dir_name + "/" + prefix + str(count) + ". " + image_name + if no_numbering: + path = main_directory + "/" + dir_name + "/" + prefix + image_name + else: + path = main_directory + "/" + dir_name + "/" + prefix + str(count) + ". " + image_name try: output_file = open(path, 'wb') @@ -632,6 +646,12 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri return_image_name = '' absolute_path = '' + except IncompleteRead as e: + download_status = 'fail' + download_message = "IncompleteReadError on an image...trying next one..." 
+ " Error: " + str(e) + return_image_name = '' + absolute_path = '' + return download_status,download_message,return_image_name,absolute_path @@ -686,7 +706,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments): items.append(object) # Append all the links in the list named 'Links' #download the images - download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size']) + download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'],arguments['no_numbering']) print(download_message) if download_status == "success": @@ -812,7 +832,7 @@ def download(self,arguments): params = self.build_url_parameters(arguments) #building URL with params - url = self.build_search_url(search_term,params,arguments['url'],arguments['similar_images'],arguments['specific_site']) #building main search url + url = self.build_search_url(search_term,params,arguments['url'],arguments['similar_images'],arguments['specific_site'],arguments['safe_search']) #building main search url if limit < 101: raw_html = self.download_page(url) # download page diff --git a/setup.py b/setup.py index f3a4eb06..c725c937 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from codecs import open from os import path -__version__ = '2.2.2' +__version__ = '2.3.0' here = path.abspath(path.dirname(__file__))