Skip to content

Latest commit

 

History

History
164 lines (147 loc) · 4.91 KB

1-6.md

File metadata and controls

164 lines (147 loc) · 4.91 KB
layout title date tags
hexo
1.12 Notes
2020-01-12 04:00:28 -0800
Python Spider

1.6-1.12 Learning Notes

python spider

https://segmentfault.com/blog/papapa

analyse website

Chrome+F12

construct url

通过对不同页面的url分析,得到有用的参数,从而构造出url

request header

request cookie

从request里面复制,模拟真实请求

def cookie_to_dict(cookie):
    """Convert a raw ``Cookie`` header string into a dict.

    The string is expected in the form copied from the browser's request
    headers, e.g. ``"a=1; b=2"``.

    Args:
        cookie: The cookie header value, pairs separated by ``;``.

    Returns:
        A dict mapping each cookie name (with spaces removed) to its value.
        Fragments without ``=`` or without a name (such as the empty piece
        left by a trailing ``;``) are skipped.
    """
    cookie_dict = {}
    for item in cookie.split(';'):
        # Split on the FIRST '=' only: cookie values (e.g. base64 tokens)
        # may themselves contain '=' and must not be truncated.
        key, sep, value = item.partition('=')
        key = key.replace(' ', '')
        if not sep or not key:
            continue
        cookie_dict[key] = value
    return cookie_dict
# Send the GET request with the copied headers and cookies.
# NOTE(review): `cookies` is presumably the dict built by cookie_to_dict — confirm.
r = requests.get(url,headers=headers, cookies=cookies)

select info

bs4

语法简单,容易上手

xpath

XML中查找信息
selenium里面也会用到

regex

应用范围最广,但是上手难
https://regex101.com/r/dmRygT/1 https://deerchao.cn/tutorials/regex/regex.htm

selenium

https://segmentfault.com/a/1190000015750160
模拟用户操作,绕过网站的反爬虫机制

find_element vs find_elements

find_element returns a single WebElement
find_elements returns a list of WebElements (iterable)
webelement.text returns the element's text

# Collect every comment container on the page, then read the commenter's
# name from the first matching link inside each container.
allComments = driver.find_elements(By.XPATH, "//div[@class='itm']")
for each in allComments:
    # Use the By-based locator here as well: the find_elements_by_* helpers
    # were removed in Selenium 4, and this matches the call above.
    name = each.find_elements(
        By.XPATH, "./div[@class='cntwrap']/div[1]/div[1]/a")[0].text

等待网页加载

explicit

显式Waits允许你设置一个加载时间的上限和一个条件,每隔0.5s就判断一下所设条件,条件成立就继续执行下面的代码,如果过了时间上限条件还是没有成立,就抛出TimeoutException异常。这种相对智能的等待方法能最大化地节省时间,应该优先选择使用

implicit

# Configure an implicit wait of 10 on the driver (seconds, per the Selenium API).
driver.implicitly_wait(10)

mysql store data

https://dev.mysql.com/doc/refman/8.0/en/identifiers.html
用反引号`包住标识符(库名、表名、字段名等);当标识符是保留字或含有特殊字符时必须这样做
用双引号"或单引号'包住字符串
在python里写sql时,用三个引号"""包住sql语句

# Identifiers (database, table and column names) are wrapped in backticks.
# All three statements are qualified with the `netease` database so they
# refer to the same table regardless of the connection's default database.
cursor.execute("DROP TABLE IF EXISTS `netease`.`{}`".format(tableName))
cursor.execute("""CREATE TABLE `netease`.`{}` (
    `id` INT NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(100),
    `comment` VARCHAR(10000),
    `lovenumber` INT,
    `commenttime` VARCHAR(100),
    PRIMARY KEY (`id`)
)""".format(tableName))

# Identifiers cannot be bound as query parameters, so the table name is still
# formatted in; '%' is doubled because the template is %-interpolated below.
insert_template = """INSERT INTO `netease`.`{tableName}`
        (`name`,`comment`,`lovenumber`,`commenttime`)
        VALUES
        (%s,%s,%s,%s)
        """.format(tableName=tableName.replace('%', '%%'))
# Values are bound through the driver instead of str.format, so quotes and
# backslashes in scraped text are escaped rather than breaking (or injecting
# into) the statement. mogrify() returns the final SQL string that execute()
# would send, so `sql` is still a complete statement.
# NOTE(review): assumes `cursor` is a PyMySQL cursor — confirm.
sql = cursor.mogrify(
    insert_template, (name, comment, lovenumber, commenttime))

OOP Refactor

__enter__ __exit__ 写这两个函数就可以用with
https://stackoverflow.com/questions/1984325/explaining-pythons-enter-and-exit

import pymysql


class Mysql():
    """Small wrapper around one PyMySQL connection and one cursor.

    Usable as a context manager: on a clean exit the transaction is
    committed, on an exception it is rolled back, and in both cases the
    connection is closed.
    """

    def __init__(self, host, user, password, database):
        """Open the connection and create the cursor.

        Args:
            host: MySQL server host.
            user: Login user name.
            password: Login password.
            database: Default database for the connection.
        """
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.conn = pymysql.connect(
            host=self.host, user=self.user, password=self.password, database=self.database)
        self.cursor = self.conn.cursor()

    def __enter__(self):
        """Return this wrapper so it can be bound with ``with ... as db``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Finish the transaction and close the connection.

        Commits only when the ``with`` body finished without an exception;
        otherwise rolls back so a half-finished batch is not persisted.
        The exception, if any, is propagated to the caller.
        """
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                self.conn.rollback()
        finally:
            # Always release the connection, even if commit/rollback failed.
            self.conn.close()

    def fetchall(self):
        """Return all remaining rows of the last executed statement."""
        return self.cursor.fetchall()

    def execute(self, sql, args=None):
        """Execute one statement and return the cursor's result.

        Args:
            sql: The SQL text, optionally with ``%s`` placeholders.
            args: Optional values bound to the placeholders by the driver.
        """
        return self.cursor.execute(sql, args)

    def query(self, sql, args=None):
        """Execute a statement and return every resulting row.

        Args:
            sql: The SQL text, optionally with ``%s`` placeholders.
            args: Optional values bound to the placeholders by the driver.
        """
        self.cursor.execute(sql, args)
        return self.fetchall()

    def commit(self):
        """Commit the current transaction."""
        return self.conn.commit()

    def close(self):
        """Close the underlying connection."""
        return self.conn.close()

生成词云

先用jieba分割词语,再用wordcloud

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import src.Mysql


class MyWordCloud:
    """Build a word cloud from the stored comments of one song.

    Constructing an instance runs the whole pipeline: load the comments
    from the database, segment them with jieba, and generate the cloud.
    """

    def __init__(self, song):
        """Run the pipeline for ``song`` (used as the table and file name)."""
        self.song = song
        self.origin_text = self.fromDBGetText()
        self.wordlist_after_jieba = self.jiebaDiv()
        self.wordcloud = self.getWordCloud()

    @staticmethod
    def _rowsToText(rows):
        """Join the first column of every row into one space-separated string.

        Using the column values directly (instead of ``str(rows)``) keeps the
        tuple repr's quotes, parentheses and escape sequences out of the text.
        Empty rows and NULL values are skipped.
        """
        return " ".join(
            str(row[0]) for row in (rows or ()) if row and row[0] is not None)

    def fromDBGetText(self):
        """Fetch every comment row for the song and store it on the instance.

        Returns:
            The rows returned by the query (also kept in ``origin_text``).
        """
        # The table name is an identifier and cannot be a bound parameter, so
        # quote it with backticks and double any backtick it contains.
        table = str(self.song).replace('`', '``')
        with src.Mysql.Mysql('127.0.0.1', 'root', 'password', 'netease') as db:
            sql = "SELECT `comment` FROM `netease`.`{}`;".format(table)
            self.origin_text = db.query(sql)
        return self.origin_text

    def jiebaDiv(self):
        """Segment the comments with jieba.

        Returns:
            The segmented words joined by single spaces; the same string is
            stored in ``wordlist_after_jieba`` so a later call to
            ``getWordCloud`` never sees a consumed generator.
        """
        text = self._rowsToText(self.origin_text)
        self.wordlist_after_jieba = " ".join(jieba.cut(text, cut_all=True))
        return self.wordlist_after_jieba

    def getWordCloud(self):
        """Generate a 2000x2000 word cloud from the segmented text."""
        # collocations=False: drop repeated words (original author's note).
        # NOTE(review): the default WordCloud font may lack CJK glyphs;
        # consider passing font_path — confirm.
        return WordCloud(collocations=False, width=2000,
                         height=2000).generate(self.wordlist_after_jieba)

    def saveToFile(self):
        """Write the cloud to ``./img/<song>.png``."""
        self.wordcloud.to_file('./img/{}.png'.format(self.song))

    def plotImg(self):
        """Display the cloud in a matplotlib window with the axes hidden."""
        plt.imshow(self.wordcloud)
        plt.axis("off")
        plt.show()

多线程、多进程

Learning~