layout | title | date | tags | |
---|---|---|---|---|
hexo |
1.12 Notes |
2020-01-12 04:00:28 -0800 |
|
https://segmentfault.com/blog/papapa
Chrome+F12
通过对不同页面的url分析,得到有用的参数,从而构造出url
从request里面复制,模拟真实请求
def cookie_to_dict(cookie):
    """Convert a raw ``Cookie`` header string into a dict.

    Args:
        cookie: Cookie text in ``name=value; name2=value2`` form, e.g. the
            value copied from a browser request.

    Returns:
        A dict mapping each cookie name (with all spaces removed) to its
        value. Segments that contain no ``=`` (such as the empty piece left
        by a trailing ``;``) are skipped.
    """
    cookie_dict = {}
    for item in cookie.split(';'):
        # Split on the first '=' only: cookie values (base64 padding,
        # nested key=value payloads) may legitimately contain '='.
        key, sep, value = item.partition('=')
        if not sep:
            continue
        cookie_dict[key.replace(' ', '')] = value
    return cookie_dict
# Send the GET request with the prepared headers and cookies. The timeout
# keeps the crawler from blocking forever when the server stops responding.
r = requests.get(url, headers=headers, cookies=cookies, timeout=10)
语法简单,容易上手
XML中查找信息
selenium里面也会用到
应用范围最广,但是上手难
https://regex101.com/r/dmRygT/1
https://deerchao.cn/tutorials/regex/regex.htm
https://segmentfault.com/a/1190000015750160
模拟用户操作,实现反爬虫
find_element
returns a WebElement
find_elements
returns an iterable list of WebElements
webelement.text
returns the element's text
# Grab every <div class="itm"> element on the page.
allComments = driver.find_elements(By.XPATH, "//div[@class='itm']")
for each in allComments:
    # The leading "./" makes the XPath relative to this element. Use the
    # same find_elements(By.XPATH, ...) call as above: the legacy
    # find_elements_by_xpath helper no longer exists in Selenium 4.
    name = each.find_elements(
        By.XPATH, "./div[@class='cntwrap']/div[1]/div[1]/a")[0].text
显式等待(Explicit Waits)允许你设置一个加载时间的上限和一个条件,每隔0.5s就判断一下所设条件,条件成立就继续执行下面的代码,如果过了时间上限条件还是没有成立,则抛出TimeoutException异常。这种相对智能的等待方法能最大化地节省时间,应该优先选择使用
# Implicit wait: let the driver wait up to 10 seconds when locating elements.
driver.implicitly_wait(10)
https://dev.mysql.com/doc/refman/8.0/en/identifiers.html
用反引号`包住标识符(表名、字段名等),标识符与保留字同名时必须这样做
用双引号"或单引号'包住字符串
在python里写sql时,用三个引号"""包住sql语句
# Identifiers are wrapped in backticks; a backtick inside the name itself
# must be doubled, otherwise it would terminate the quoted identifier.
table_ident = "`netease`.`{}`".format(str(tableName).replace("`", "``"))
cursor.execute("DROP TABLE IF EXISTS {}".format(table_ident))
# CREATE is qualified with the schema as well, so it always targets the same
# table as the DROP above and the INSERT below, whatever the connection's
# default database is.
cursor.execute("""CREATE TABLE {} (
    `id` INT NOT NULL AUTO_INCREMENT,
    `name` VARCHAR(100),
    `comment` VARCHAR(10000),
    `lovenumber` INT,
    `commenttime` VARCHAR(100),
    PRIMARY KEY (`id`)
)""".format(table_ident))
# All identifiers above are wrapped in backticks.
# Values are bound through %s placeholders instead of being pasted between
# quotes, so a quote character inside a scraped comment can neither break
# nor alter the statement. mogrify() returns the final, escaped statement
# as a string, so `sql` is still a complete statement ready for execute().
insert_template = """INSERT INTO {}
    (`name`,`comment`,`lovenumber`,`commenttime`)
    VALUES
    (%s, %s, %s, %s)
""".format(table_ident.replace("%", "%%"))  # keep a literal % in the name
sql = cursor.mogrify(insert_template, (name, comment, lovenumber, commenttime))
__enter__
__exit__
写这两个函数就可以用with
https://stackoverflow.com/questions/1984325/explaining-pythons-enter-and-exit
import pymysql
class Mysql():
    """Thin wrapper around one pymysql connection and one cursor.

    Usable as a context manager: on leaving the ``with`` block the
    transaction is committed if the body succeeded, rolled back if it
    raised, and the connection is closed in either case.
    """

    def __init__(self, host, user, password, database):
        """Open the connection and create the cursor.

        Args:
            host: MySQL server host.
            user: Login user name.
            password: Login password.
            database: Default database for the connection.
        """
        self.host = host
        self.user = user
        self.password = password
        self.database = database
        self.conn = pymysql.connect(
            host=self.host, user=self.user, password=self.password,
            database=self.database)
        self.cursor = self.conn.cursor()

    def __enter__(self):
        """Return this wrapper so it can be bound with ``with ... as db``."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Finish the transaction and close the connection.

        Commits only when the ``with`` body finished without an exception;
        otherwise rolls back so half-finished work is not persisted. The
        exception, if any, is not suppressed.
        """
        try:
            if exc_type is None:
                self.conn.commit()
            else:
                self.conn.rollback()
        finally:
            # Close even if commit/rollback itself fails.
            self.conn.close()

    def fetchall(self):
        """Return all remaining rows of the last executed statement."""
        return self.cursor.fetchall()

    def execute(self, sql, args=None):
        """Execute ``sql`` on the cursor and return the cursor's result.

        Args:
            sql: Statement text; may contain ``%s`` placeholders.
            args: Optional values bound to the placeholders by the driver.
                Prefer this over formatting values into ``sql`` by hand.
        """
        return self.cursor.execute(sql, args)

    def query(self, sql, args=None):
        """Execute ``sql`` (with optional bound ``args``) and fetch all rows."""
        self.cursor.execute(sql, args)
        return self.fetchall()

    def commit(self):
        """Commit the current transaction."""
        return self.conn.commit()

    def close(self):
        """Close the underlying connection."""
        return self.conn.close()
先用jieba分割词语,再用wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba
import src.Mysql
class MyWordCloud:
    """Build a word cloud from the comments stored for one song.

    Construction runs the whole pipeline: load the ``comment`` column of the
    song's table, segment the text with jieba, and render a ``WordCloud``.
    The result can then be written to disk or shown with matplotlib.
    """

    def __init__(self, song, host='127.0.0.1', user='root',
                 password='password', database='netease'):
        """Load, segment and render the comments of ``song``.

        Args:
            song: Name of the table holding the song's comments; also used
                as the output image name.
            host: MySQL host (defaults to the previously hard-coded value).
            user: MySQL user (defaults to the previously hard-coded value).
            password: MySQL password (defaults to the previously hard-coded
                value).
            database: Schema containing the song table.
        """
        self.song = song
        self._db_args = (host, user, password, database)
        self.origin_text = self.fromDBGetText()
        self.wordlist_after_jieba = self.jiebaDiv()
        self.wordcloud = self.getWordCloud()

    @staticmethod
    def _quoteIdent(name):
        """Wrap ``name`` in backticks, doubling any backtick it contains."""
        return "`{}`".format(str(name).replace("`", "``"))

    def _commentsAsText(self):
        """Join the first column of the fetched rows into plain text.

        Using ``str()`` on the row tuples would leak their repr (brackets,
        quotes, escape sequences) into the text being segmented.
        """
        rows = self.origin_text
        if isinstance(rows, str):
            return rows
        if not rows:
            return ""
        return "\n".join(
            str(row[0]) for row in rows if row and row[0] is not None)

    def fromDBGetText(self):
        """Fetch every comment row for the song and store it in ``origin_text``.

        Returns:
            The rows as returned by ``Mysql.query``.
        """
        host, user, password, database = self._db_args
        # Quote both identifiers so table names with spaces or reserved
        # words do not break the statement.
        sql = "SELECT `comment` FROM {}.{};".format(
            self._quoteIdent(database), self._quoteIdent(self.song))
        with src.Mysql.Mysql(host, user, password, database) as db:
            self.origin_text = db.query(sql)
        return self.origin_text

    def jiebaDiv(self):
        """Segment the comments with jieba (full mode).

        Returns:
            The segments joined by single spaces; the same string is stored
            in ``wordlist_after_jieba`` (not the one-shot generator that
            ``jieba.cut`` returns).
        """
        segments = jieba.cut(self._commentsAsText(), cut_all=True)
        self.wordlist_after_jieba = " ".join(segments)
        return self.wordlist_after_jieba

    def getWordCloud(self):
        """Render a 2000x2000 word cloud from the segmented text."""
        # collocations=False leaves two-word phrases out, so a word is not
        # repeated inside bigrams.
        return WordCloud(collocations=False, width=2000,
                         height=2000).generate(self.wordlist_after_jieba)

    def saveToFile(self):
        """Write the word cloud to ``./img/<song>.png``."""
        import os

        # Create the output directory on first use instead of failing.
        os.makedirs('./img', exist_ok=True)
        self.wordcloud.to_file('./img/{}.png'.format(self.song))

    def plotImg(self):
        """Show the word cloud in a matplotlib window without axes."""
        plt.imshow(self.wordcloud)
        plt.axis("off")
        plt.show()
Learning~