Skip to content

Commit 29e414c

Browse files
committed
Push Zhihu
1 parent 5f2f132 commit 29e414c

File tree

6 files changed

+976
-0
lines changed

6 files changed

+976
-0
lines changed

.DS_Store

0 Bytes
Binary file not shown.

FBpages/.DS_Store

0 Bytes
Binary file not shown.

Zhihu/.DS_Store

6 KB
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,336 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import pandas as pd\n",
10+
"import requests\n",
11+
"from bs4 import BeautifulSoup\n",
12+
"import json"
13+
]
14+
},
15+
{
16+
"cell_type": "code",
17+
"execution_count": 2,
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
def getPage(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which stalls the whole notebook.

    Returns
    -------
    bytes or None
        The response body on success. ``None`` when the request failed or
        the server answered with a 4xx/5xx status; the error is printed so
        the caller can decide whether to stop or skip.
    """
    # Browser-like headers sent with every request.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    }

    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        # requests only raises HTTPError when asked to; without this call the
        # HTTPError handler below is unreachable and error pages are returned
        # as if they were valid payloads.
        r.raise_for_status()
        return r.content

    except requests.HTTPError as e:
        print(e)
        print("HTTPError")

    except requests.RequestException as e:
        print(e)

    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate and the kernel can be interrupted.
    except Exception as e:
        print(e)
        print("Unknown Error!")

    return None
42+
"\n",
def zhihu_scrapyer(id= None, total= 1000):
    """Collect the answers of one Zhihu question into a DataFrame.

    Pages through the ``/api/v4/questions/<id>/answers`` endpoint five
    answers at a time, starting at offset 0 and stopping when the offset
    reaches ``total``, when a page comes back empty, or when a page could
    not be downloaded.

    Parameters
    ----------
    id : int or str
        Question identifier inserted into the API URL.
    total : int
        Upper bound (exclusive) on the paging offset, i.e. roughly the
        maximum number of answers to retrieve.

    Returns
    -------
    pandas.DataFrame
        One row per answer with the columns ``name``, ``gender``,
        ``follower_count``, ``voteup_count``, ``comment_count``, ``url``
        and ``content``. The columns are present even when no answer was
        retrieved.

    Raises
    ------
    KeyError
        If a downloaded payload has no ``data`` key or an answer lacks one
        of the expected fields.
    """
    columns = ["name", "gender", "follower_count", "voteup_count", "comment_count", "url", "content"]
    page_size = 5  # sent as `limit` and used as the offset step

    base = 'https://www.zhihu.com/api/v4/questions/' + str(id) + '/answers?'
    include = 'data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled'

    # Rows are accumulated in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every page.
    rows = []
    offset = 0

    while offset < total:

        question_url = (base + 'include=' + include
                        + '&limit=' + str(page_size)
                        + '&offset=' + str(offset)
                        + '&platform=desktop&sort_by=default')

        page = getPage(question_url)
        if page is None:
            # getPage already printed the cause; keep what was collected.
            print("No response for offset " + str(offset) + "; stopping.")
            break

        entries = json.loads(page)['data']
        if not entries:
            # An empty page means the offset is past the last answer, so
            # further requests would only return more empty pages.
            break

        for entry in entries:
            author = entry['author']
            rows.append([
                author['name'],
                author['gender'],
                author['follower_count'],
                entry['voteup_count'],
                entry['comment_count'],
                entry["url"],
                entry['content'],  # answer body as returned by the API
            ])

        offset += page_size

    # Passing `columns` here keeps the schema stable for an empty result,
    # where assigning seven names to a column-less frame would raise.
    return pd.DataFrame(rows, columns=columns)
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 3,
93+
"metadata": {
94+
"scrolled": true
95+
},
96+
"outputs": [],
97+
"source": [
if __name__ == '__main__':
    # Scrape question 441839927, paging through offsets below 310.
    file = zhihu_scrapyer(
        id=441839927,
        total=310,
    )
100+
]
101+
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": 4,
105+
"metadata": {},
106+
"outputs": [
107+
{
108+
"data": {
109+
"text/html": [
110+
"<div>\n",
111+
"<style scoped>\n",
112+
" .dataframe tbody tr th:only-of-type {\n",
113+
" vertical-align: middle;\n",
114+
" }\n",
115+
"\n",
116+
" .dataframe tbody tr th {\n",
117+
" vertical-align: top;\n",
118+
" }\n",
119+
"\n",
120+
" .dataframe thead th {\n",
121+
" text-align: right;\n",
122+
" }\n",
123+
"</style>\n",
124+
"<table border=\"1\" class=\"dataframe\">\n",
125+
" <thead>\n",
126+
" <tr style=\"text-align: right;\">\n",
127+
" <th></th>\n",
128+
" <th>name</th>\n",
129+
" <th>gender</th>\n",
130+
" <th>follower_count</th>\n",
131+
" <th>voteup_count</th>\n",
132+
" <th>comment_count</th>\n",
133+
" <th>url</th>\n",
134+
" <th>content</th>\n",
135+
" </tr>\n",
136+
" </thead>\n",
137+
" <tbody>\n",
138+
" <tr>\n",
139+
" <th>0</th>\n",
140+
" <td>李珀河</td>\n",
141+
" <td>-1</td>\n",
142+
" <td>23</td>\n",
143+
" <td>1574</td>\n",
144+
" <td>203</td>\n",
145+
" <td>https://www.zhihu.com/api/v4/answers/1704574740</td>\n",
146+
" <td>&lt;p&gt;香港特区政府3日表示将安排专机接载滞留秘鲁的香港居民由利马飞往伦敦,再转乘已预留机位的...</td>\n",
147+
" </tr>\n",
148+
" <tr>\n",
149+
" <th>1</th>\n",
150+
" <td>洛神赋天下第一</td>\n",
151+
" <td>1</td>\n",
152+
" <td>11766</td>\n",
153+
" <td>2261</td>\n",
154+
" <td>241</td>\n",
155+
" <td>https://www.zhihu.com/api/v4/answers/1705038141</td>\n",
156+
" <td>&lt;p&gt;英国就别跳了,马上自身难保国将不国,大英帝国很大概率从此不复存在,就这样还有闲工夫关心...</td>\n",
157+
" </tr>\n",
158+
" <tr>\n",
159+
" <th>2</th>\n",
160+
" <td>甄昊元</td>\n",
161+
" <td>1</td>\n",
162+
" <td>65506</td>\n",
163+
" <td>5543</td>\n",
164+
" <td>311</td>\n",
165+
" <td>https://www.zhihu.com/api/v4/answers/1705222926</td>\n",
166+
" <td>&lt;p&gt;第一条规则:不要威胁我共,你进一步,我共也会进一步,模糊地带都是这么搞没的;&lt;/p&gt;&lt;...</td>\n",
167+
" </tr>\n",
168+
" <tr>\n",
169+
" <th>3</th>\n",
170+
" <td>远方青木</td>\n",
171+
" <td>-1</td>\n",
172+
" <td>452933</td>\n",
173+
" <td>1514</td>\n",
174+
" <td>56</td>\n",
175+
" <td>https://www.zhihu.com/api/v4/answers/1708705645</td>\n",
176+
" <td>&lt;p&gt;自2021年1月31日起,中国不再承认英国颁发的BNO护照为有效证件。&lt;/p&gt;&lt;fig...</td>\n",
177+
" </tr>\n",
178+
" <tr>\n",
179+
" <th>4</th>\n",
180+
" <td>观察者网</td>\n",
181+
" <td>-1</td>\n",
182+
" <td>630076</td>\n",
183+
" <td>1783</td>\n",
184+
" <td>130</td>\n",
185+
" <td>https://www.zhihu.com/api/v4/answers/1705612787</td>\n",
186+
" <td>&lt;h2&gt;&lt;b&gt;&lt;a href=\"https://link.zhihu.com/?target...</td>\n",
187+
" </tr>\n",
188+
" <tr>\n",
189+
" <th>...</th>\n",
190+
" <td>...</td>\n",
191+
" <td>...</td>\n",
192+
" <td>...</td>\n",
193+
" <td>...</td>\n",
194+
" <td>...</td>\n",
195+
" <td>...</td>\n",
196+
" <td>...</td>\n",
197+
" </tr>\n",
198+
" <tr>\n",
199+
" <th>298</th>\n",
200+
" <td>Lda</td>\n",
201+
" <td>-1</td>\n",
202+
" <td>22</td>\n",
203+
" <td>1</td>\n",
204+
" <td>2</td>\n",
205+
" <td>https://www.zhihu.com/api/v4/answers/1704749826</td>\n",
206+
" <td>&lt;p&gt;以前香港只有居民没有国民。&lt;/p&gt;&lt;p&gt;今后香港居民要被分化为国民和外国民两种存在了。...</td>\n",
207+
" </tr>\n",
208+
" <tr>\n",
209+
" <th>299</th>\n",
210+
" <td>知乎用户</td>\n",
211+
" <td>1</td>\n",
212+
" <td>2</td>\n",
213+
" <td>0</td>\n",
214+
" <td>2</td>\n",
215+
" <td>https://www.zhihu.com/api/v4/answers/1705839209</td>\n",
216+
" <td>&lt;p&gt;我爸爸上班被老总吼,回家打我一巴掌。&lt;/p&gt;</td>\n",
217+
" </tr>\n",
218+
" <tr>\n",
219+
" <th>300</th>\n",
220+
" <td>脱腻屎塔克</td>\n",
221+
" <td>1</td>\n",
222+
" <td>6</td>\n",
223+
" <td>0</td>\n",
224+
" <td>25</td>\n",
225+
" <td>https://www.zhihu.com/api/v4/answers/1705807033</td>\n",
226+
" <td>&lt;p&gt;有个问题,香港来内地不是一直使用港澳通行证吗,为什么要用b n o .再来,谁是b n...</td>\n",
227+
" </tr>\n",
228+
" <tr>\n",
229+
" <th>301</th>\n",
230+
" <td>pxgpprr</td>\n",
231+
" <td>-1</td>\n",
232+
" <td>0</td>\n",
233+
" <td>0</td>\n",
234+
" <td>2</td>\n",
235+
" <td>https://www.zhihu.com/api/v4/answers/1705356879</td>\n",
236+
" <td>&lt;p&gt;象征性而已啦,这护照的意义是帮助持有者在英国居住,又不是离开香港,人家换个香港护照离境...</td>\n",
237+
" </tr>\n",
238+
" <tr>\n",
239+
" <th>302</th>\n",
240+
" <td>吴名仕</td>\n",
241+
" <td>1</td>\n",
242+
" <td>2059</td>\n",
243+
" <td>1</td>\n",
244+
" <td>0</td>\n",
245+
" <td>https://www.zhihu.com/api/v4/answers/1704486736</td>\n",
246+
" <td>&lt;p&gt;这相当于宣布领取BNO的居民为不受欢迎人士,限期离境。&lt;/p&gt;</td>\n",
247+
" </tr>\n",
248+
" </tbody>\n",
249+
"</table>\n",
250+
"<p>303 rows × 7 columns</p>\n",
251+
"</div>"
252+
],
253+
"text/plain": [
254+
" name gender follower_count voteup_count comment_count \\\n",
255+
"0 李珀河 -1 23 1574 203 \n",
256+
"1 洛神赋天下第一 1 11766 2261 241 \n",
257+
"2 甄昊元 1 65506 5543 311 \n",
258+
"3 远方青木 -1 452933 1514 56 \n",
259+
"4 观察者网 -1 630076 1783 130 \n",
260+
".. ... ... ... ... ... \n",
261+
"298 Lda -1 22 1 2 \n",
262+
"299 知乎用户 1 2 0 2 \n",
263+
"300 脱腻屎塔克 1 6 0 25 \n",
264+
"301 pxgpprr -1 0 0 2 \n",
265+
"302 吴名仕 1 2059 1 0 \n",
266+
"\n",
267+
" url \\\n",
268+
"0 https://www.zhihu.com/api/v4/answers/1704574740 \n",
269+
"1 https://www.zhihu.com/api/v4/answers/1705038141 \n",
270+
"2 https://www.zhihu.com/api/v4/answers/1705222926 \n",
271+
"3 https://www.zhihu.com/api/v4/answers/1708705645 \n",
272+
"4 https://www.zhihu.com/api/v4/answers/1705612787 \n",
273+
".. ... \n",
274+
"298 https://www.zhihu.com/api/v4/answers/1704749826 \n",
275+
"299 https://www.zhihu.com/api/v4/answers/1705839209 \n",
276+
"300 https://www.zhihu.com/api/v4/answers/1705807033 \n",
277+
"301 https://www.zhihu.com/api/v4/answers/1705356879 \n",
278+
"302 https://www.zhihu.com/api/v4/answers/1704486736 \n",
279+
"\n",
280+
" content \n",
281+
"0 <p>香港特区政府3日表示将安排专机接载滞留秘鲁的香港居民由利马飞往伦敦,再转乘已预留机位的... \n",
282+
"1 <p>英国就别跳了,马上自身难保国将不国,大英帝国很大概率从此不复存在,就这样还有闲工夫关心... \n",
283+
"2 <p>第一条规则:不要威胁我共,你进一步,我共也会进一步,模糊地带都是这么搞没的;</p><... \n",
284+
"3 <p>自2021年1月31日起,中国不再承认英国颁发的BNO护照为有效证件。</p><fig... \n",
285+
"4 <h2><b><a href=\"https://link.zhihu.com/?target... \n",
286+
".. ... \n",
287+
"298 <p>以前香港只有居民没有国民。</p><p>今后香港居民要被分化为国民和外国民两种存在了。... \n",
288+
"299 <p>我爸爸上班被老总吼,回家打我一巴掌。</p> \n",
289+
"300 <p>有个问题,香港来内地不是一直使用港澳通行证吗,为什么要用b n o .再来,谁是b n... \n",
290+
"301 <p>象征性而已啦,这护照的意义是帮助持有者在英国居住,又不是离开香港,人家换个香港护照离境... \n",
291+
"302 <p>这相当于宣布领取BNO的居民为不受欢迎人士,限期离境。</p> \n",
292+
"\n",
293+
"[303 rows x 7 columns]"
294+
]
295+
},
296+
"execution_count": 4,
297+
"metadata": {},
298+
"output_type": "execute_result"
299+
}
300+
],
301+
"source": [
302+
"file"
303+
]
304+
},
305+
{
306+
"cell_type": "code",
307+
"execution_count": 5,
308+
"metadata": {},
309+
"outputs": [],
310+
"source": [
311+
"file.to_csv(\"zhuhu_441839927.csv\", encoding= \"utf-8-sig\")"
312+
]
313+
}
314+
],
315+
"metadata": {
316+
"kernelspec": {
317+
"display_name": "Python 3",
318+
"language": "python",
319+
"name": "python3"
320+
},
321+
"language_info": {
322+
"codemirror_mode": {
323+
"name": "ipython",
324+
"version": 3
325+
},
326+
"file_extension": ".py",
327+
"mimetype": "text/x-python",
328+
"name": "python",
329+
"nbconvert_exporter": "python",
330+
"pygments_lexer": "ipython3",
331+
"version": "3.8.2"
332+
}
333+
},
334+
"nbformat": 4,
335+
"nbformat_minor": 4
336+
}

0 commit comments

Comments
 (0)