Skip to content

Latest commit

 

History

History
193 lines (154 loc) · 6.33 KB

(实战)多线程爬取豆瓣电影信息.md

File metadata and controls

193 lines (154 loc) · 6.33 KB

爬虫学习使用指南--多线程爬虫

Auth: 王海飞

Date:2018-06-16

Email:[email protected]

github:https://github.com/coco369/knowledge

多线程爬虫豆瓣电影资源

思路:

案例1:获取电影的分类信息,针对一个分类,启动一个线程。即有多少分类的url,启动多少个线程

案例2:获取电影的分类信息,将分类的url储存在一个列表中,启动两个线程从列表中获取要访问的url,然后爬取url的内容,再进行数据分析即可。(使用线程锁)

案例1:爬取豆瓣上电影的名称和评分,并将其插入到mongodb中

	import urllib.request
	from urllib import parse
	import json
	import time
	import threading
	import pymongo

	
	"""
	获取豆瓣电影中的电影资源
	豆瓣电影url地址:https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0
	分析:
	    1. 该页面中的电影资源信息都是通过ajax异步加载刷新出来的
	    2. 在F12下的network中过滤XHR(XMLHttpRequest)请求,可以查看到真正的异步请求地址如下
	        https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20
	    3. 真正的请求地址中,type为类型,tag为标签(热门、经典、最新、爱情、科幻等等),sort为排序,page_limit为每一页的条数,page_start为开始的条数下标
	    4. 获取tag类型的url地址为: https://movie.douban.com/j/search_tags?type=movie&source=
	"""

	
	def urllib_open(url):
	    """
	    Request ``url`` with a browser-like User-Agent and return the body as text.
	
	    Shared request helper for the other functions in this example.
	
	    :param url: address to request
	    :return: the response body decoded as UTF-8
	    """
	    header = {
	        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
	    }
	    req = urllib.request.Request(url=url, headers=header)
	    # Use the response as a context manager so it is closed even when
	    # reading or decoding raises.
	    with urllib.request.urlopen(req) as res:
	        return res.read().decode('utf-8')

	
	def get_movie_tag(url):
	    """
	    Fetch the movie category tags.
	
	    :param url: address of the tag endpoint
	    :return: the value stored under the ``tags`` key of the JSON response
	    """
	    # The endpoint answers with a JSON string of the form '{"tags": [...]}'
	    # (a list of category names), so parse it and return the "tags" entry.
	    return json.loads(urllib_open(url))['tags']

	
	def get_movies(movies_url, db):
	    """
	    Download one page of movies and save each title and rating.
	
	    :param movies_url: search URL; its ``tag=`` parameter selects the category
	    :param db: MongoDB database object; documents are inserted into its
	        ``movies2`` collection
	    """
	    payload = json.loads(urllib_open(movies_url))
	    for movie in payload['subjects']:
	        title, rate = movie['title'], movie['rate']
	        db.movies2.insert_one({'m_name': title, 'm_rate': rate})
	        print('标题:%s,评分:%s' % (title, rate))

	
	def main():
	    """
	    Crawl the first page of every movie category, one thread per category.
	
	    Each thread runs ``get_movies`` against the ``douban`` MongoDB database.
	    The call returns only after every thread has finished.
	    """
	    # Database handle shared by all worker threads
	    mongo_client = pymongo.MongoClient('mongodb://45.76.206.145:27017')
	    db = mongo_client.douban
	
	    tag_url = 'https://movie.douban.com/j/search_tags?type=movie&source='
	    movies_url = 'https://movie.douban.com/j/search_subjects?type=movie&%s&sort=recommend&page_limit=20&page_start=0'
	
	    # One search URL per category; urlencode escapes the tag for the query string
	    result_list = [
	        movies_url % (parse.urlencode({'tag': tag}),)
	        for tag in get_movie_tag(tag_url)
	    ]
	    threading_list = [
	        threading.Thread(target=get_movies, args=(url, db))
	        for url in result_list
	    ]
	
	    for thread in threading_list:
	        thread.start()
	    # Join in a second loop: joining right after each start() would run the
	    # downloads one at a time, while not joining at all lets main() return
	    # before any work is done.
	    for thread in threading_list:
	        thread.join()
	
	    # Every worker has finished, so the connection is no longer in use
	    mongo_client.close()


	if __name__ == '__main__':
	    # time.clock() was removed in Python 3.8; time.perf_counter() is the
	    # replacement for timing a section of code. The difference between the
	    # two printed values is the elapsed time in seconds.
	    print(time.perf_counter())
	    main()
	    print(time.perf_counter())

案例2:

使用线程锁

	import threading
	import requests
	import json
	from urllib import parse

	def get_html(url):
	    """
	    Request ``url`` with a browser-like User-Agent and return the response text.
	
	    :param url: address to request
	    :return: the ``text`` attribute of the ``requests`` response
	    """
	    user_agent = (
	        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
	        '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
	    )
	    response = requests.get(url, headers={'User-Agent': user_agent})
	    return response.text


	def get_movie_tag(url):
	    """
	    Fetch the movie category tags.
	
	    :param url: address of the tag endpoint
	    :return: the value stored under the ``tags`` key of the JSON response
	    """
	    # The response body is a JSON string of the form '{"tags": [...]}';
	    # decode it into a dict and hand back only the list of tags.
	    parsed = json.loads(get_html(url))
	    return parsed['tags']


	class SpiderOperation(threading.Thread):
	    """
	    Worker thread that takes one URL from the shared task list and crawls it.
	
	    The task list is the module-level ``task_result_list``. Each worker
	    removes at most one URL from it and prints the movies found at that URL.
	    """
	
	    # One lock shared by every worker. A lock created per instance in
	    # __init__ would be private to each thread and would exclude nobody.
	    task_lock = threading.Lock()
	
	    def __init__(self):
	        super(SpiderOperation, self).__init__()
	
	    def update_task_list(self):
	        """
	        Remove and return one URL from the shared task list.
	
	        The emptiness check and the pop happen while holding the shared
	        lock, so two workers cannot take the same URL.
	
	        :return: a URL, or '' when the list is empty
	        """
	        # "with" releases the lock even if the body raises
	        with self.task_lock:
	            print(len(task_result_list))
	            return task_result_list.pop() if task_result_list else ''
	
	    def run(self):
	        """Take one task URL, if any, and print each movie's title and rating."""
	        task_link = self.update_task_list()
	        print(task_link)
	        if not task_link:
	            return
	        movies = json.loads(get_html(task_link))['subjects']
	        for movie in movies:
	            print('标题:%s,评分:%s' % (movie['title'], movie['rate']))

	
	if __name__ == '__main__':
	
	    tag_url = 'https://movie.douban.com/j/search_tags?type=movie&source='
	    movie_url = 'https://movie.douban.com/j/search_subjects?type=movie&%s&sort=recommend&page_limit=20&page_start=0'
	
	    # Shared work list consumed by SpiderOperation.update_task_list: one
	    # search URL per category tag (urlencode escapes the tag for the query).
	    task_result_list = [
	        movie_url % (parse.urlencode({'tag': tag}),)
	        for tag in get_movie_tag(tag_url)
	    ]
	
	    # Run two workers at a time and wait for both before starting the next
	    # pair. Without the joins this loop would spin and keep creating
	    # threads until the running workers happened to empty the list.
	    while task_result_list:
	        spider1 = SpiderOperation()
	        spider2 = SpiderOperation()
	        spider1.start()
	        spider2.start()
	        spider1.join()
	        spider2.join()