-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping2.py
76 lines (61 loc) · 2.28 KB
/
scraping2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import sys
from pprint import pprint
from selenium import webdriver
from bs4 import BeautifulSoup
from time import time, sleep
from urllib.request import urlopen, Request
from mimetypes import guess_extension
import traceback
class Fetcher:
def __init__(self, ua=''):
self.ua = ua
def fetch(self, url):
req = Request(url, headers={'User_Agent': self.ua})
try:
with urlopen(req, timeout=3) as p:
b_content = p.read()
mime = p.getheader('Content-Type')
except:
sys.stderr.write('Error in fetching {}\n'.format(url))
sys.stderr.write(traceback.format_exc())
return None, None
return b_content, mime
fetcher = Fetcher('Mozilla/5.0')#ユーザーエージェント
page_num = 20
dirname = 'datasets/cards'
#画像保存用ファイル作成
if not os.path.exists(dirname):
os.makedirs(dirname)
#PhantomJSをSelenium経由で利用します。
# url = "https://search.yahoo.co.jp/image/search?p=%E8%A1%9B%E8%97%A4%E7%BE%8E%E5%BD%A9&oq=&ei=UTF-8&save=0"
url = "https://search.yahoo.co.jp/image/search?p=%E3%83%88%E3%83%A9%E3%83%B3%E3%83%97+%E3%82%AB%E3%83%BC%E3%83%89&ei=UTF-8&rkf=1&oq=&save=0"
driver = webdriver.PhantomJS()
#PhantomJSで該当ページを取得&レンダリングします
driver.get(url)
for i in range(page_num):
#レンダリング結果をPhantomJSから取得します。
html = driver.page_source
#画像のurlを取得する
bs = BeautifulSoup(html, "html.parser")
img_urls = [img.get("href") for img in bs.find_all("a", target="imagewin")]
img_urls.remove("javascript:void(0);")
img_urls = list(set(img_urls))
#画像を保存する
for j, img_url in enumerate(img_urls):
sleep(0.1)
img, mime = fetcher.fetch(img_url)
if not mime or not img:
continue
ext = guess_extension(mime.split(';')[0])
if ext in ('.jpe', '.jpeg'):
ext = '.jpg'
if not ext:
continue
result_file = os.path.join(dirname, str(i) + '_' + str(j) + ext)
with open(result_file, mode='wb') as f:
f.write(img)
print('fetched', img_url)
#次のページに移動する
driver.find_element_by_link_text('次へ>').click()
driver.quit