|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 1, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 9 | + "import pandas as pd\n", |
| 10 | + "import requests\n", |
| 11 | + "from bs4 import BeautifulSoup\n", |
| 12 | + "import json" |
| 13 | + ] |
| 14 | + }, |
| 15 | + { |
| 16 | + "cell_type": "code", |
| 17 | + "execution_count": 2, |
| 18 | + "metadata": {}, |
| 19 | + "outputs": [], |
| 20 | + "source": [ |
def getPage(url, timeout=10):
    """Fetch ``url`` with browser-like headers and return the raw response body.

    Parameters
    ----------
    url : str
        Address to request with HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    bytes or None
        ``response.content`` on success. ``None`` when the request fails or
        the server answers with a 4xx/5xx status; the error is printed
        rather than raised, so callers must check for ``None``.
    """
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # requests only raises HTTPError on request: without this call an
        # error page would be returned to the caller as if it were valid data.
        response.raise_for_status()
        return response.content

    except requests.HTTPError as e:
        print(e)
        print("HTTPError")

    except requests.RequestException as e:
        print(e)

    # `Exception` instead of a bare except, so Ctrl-C / SystemExit still
    # interrupt a long scrape.
    except Exception:
        print("Unknown Error!")

    return None
| 42 | + "\n", |
def _answer_row(entry):
    """Pick the exported fields out of one answer object of the API payload."""
    author = entry['author']
    return {
        "name": author['name'],
        "gender": author['gender'],
        "follower_count": author['follower_count'],
        "voteup_count": entry['voteup_count'],
        "comment_count": entry['comment_count'],
        "url": entry["url"],
        "content": entry['content'],  # the answer content
    }


def zhihu_scrapyer(id=None, total=1000):
    """Download the answers of one Zhihu question into a DataFrame.

    Pages of 5 answers are requested through ``getPage`` with offsets
    0, 5, 10, ... while the offset is below ``total``. Paging stops early at
    the first empty page, or when a page cannot be fetched or parsed; whatever
    was collected up to that point is returned.

    Parameters
    ----------
    id : int or str
        Question id inserted into the API URL.
    total : int, optional
        Upper bound for the paging offset (default 1000). Because whole pages
        are kept, the result can hold up to the next multiple of 5 rows.

    Returns
    -------
    pandas.DataFrame
        One row per answer with the columns ``name``, ``gender``,
        ``follower_count``, ``voteup_count``, ``comment_count``, ``url`` and
        ``content``. The columns are present even when no answer was found.
    """
    columns = ["name", "gender", "follower_count", "voteup_count", "comment_count", "url", "content"]
    page_size = 5

    base = 'https://www.zhihu.com/api/v4/questions/' + str(id) + '/answers?'
    include = 'data%5B%2A%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cattachment%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Crelevant_info%2Cquestion%2Cexcerpt%2Cis_labeled%2Cpaid_info%2Cpaid_info_content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cis_recognized%3Bdata%5B%2A%5D.mark_infos%5B%2A%5D.url%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%2A%5D.topics%3Bdata%5B%2A%5D.settings.table_of_content.enabled'

    # Rows are gathered in a plain list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x and growing a frame
    # page by page copies all previous rows every time.
    rows = []
    page_number = 0

    while page_number < total:

        question_url = (base + 'include=' + include + '&limit=' + str(page_size)
                        + '&offset=' + str(page_number) + '&platform=desktop&sort_by=default')

        page = getPage(question_url)
        if page is None:
            # getPage reports its own error and hands back None.
            print("Stopped at offset " + str(page_number) + ": page could not be fetched.")
            break

        try:
            entries = json.loads(page)['data']
        except (ValueError, KeyError, TypeError):
            # Not JSON, or a payload without a 'data' list (e.g. an error object).
            print("Stopped at offset " + str(page_number) + ": unexpected response payload.")
            break

        if not entries:
            # An empty page means there are no more answers; further requests
            # would only return empty pages as well.
            break

        rows.extend(_answer_row(entry) for entry in entries)
        page_number += page_size

    # Passing `columns` keeps the header stable for an empty result, where
    # assigning `.columns` afterwards would fail with a length mismatch.
    return pd.DataFrame(rows, columns=columns)
| 88 | + ] |
| 89 | + }, |
| 90 | + { |
| 91 | + "cell_type": "code", |
| 92 | + "execution_count": 3, |
| 93 | + "metadata": { |
| 94 | + "scrolled": true |
| 95 | + }, |
| 96 | + "outputs": [], |
| 97 | + "source": [ |
if __name__ == '__main__':
    # Collect the answers to question 441839927, paging while the offset is below 310.
    file = zhihu_scrapyer(id=441839927, total=310)
| 100 | + ] |
| 101 | + }, |
| 102 | + { |
| 103 | + "cell_type": "code", |
| 104 | + "execution_count": 4, |
| 105 | + "metadata": {}, |
| 106 | + "outputs": [ |
| 107 | + { |
| 108 | + "data": { |
| 109 | + "text/html": [ |
| 110 | + "<div>\n", |
| 111 | + "<style scoped>\n", |
| 112 | + " .dataframe tbody tr th:only-of-type {\n", |
| 113 | + " vertical-align: middle;\n", |
| 114 | + " }\n", |
| 115 | + "\n", |
| 116 | + " .dataframe tbody tr th {\n", |
| 117 | + " vertical-align: top;\n", |
| 118 | + " }\n", |
| 119 | + "\n", |
| 120 | + " .dataframe thead th {\n", |
| 121 | + " text-align: right;\n", |
| 122 | + " }\n", |
| 123 | + "</style>\n", |
| 124 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 125 | + " <thead>\n", |
| 126 | + " <tr style=\"text-align: right;\">\n", |
| 127 | + " <th></th>\n", |
| 128 | + " <th>name</th>\n", |
| 129 | + " <th>gender</th>\n", |
| 130 | + " <th>follower_count</th>\n", |
| 131 | + " <th>voteup_count</th>\n", |
| 132 | + " <th>comment_count</th>\n", |
| 133 | + " <th>url</th>\n", |
| 134 | + " <th>content</th>\n", |
| 135 | + " </tr>\n", |
| 136 | + " </thead>\n", |
| 137 | + " <tbody>\n", |
| 138 | + " <tr>\n", |
| 139 | + " <th>0</th>\n", |
| 140 | + " <td>李珀河</td>\n", |
| 141 | + " <td>-1</td>\n", |
| 142 | + " <td>23</td>\n", |
| 143 | + " <td>1574</td>\n", |
| 144 | + " <td>203</td>\n", |
| 145 | + " <td>https://www.zhihu.com/api/v4/answers/1704574740</td>\n", |
| 146 | + " <td><p>香港特区政府3日表示将安排专机接载滞留秘鲁的香港居民由利马飞往伦敦,再转乘已预留机位的...</td>\n", |
| 147 | + " </tr>\n", |
| 148 | + " <tr>\n", |
| 149 | + " <th>1</th>\n", |
| 150 | + " <td>洛神赋天下第一</td>\n", |
| 151 | + " <td>1</td>\n", |
| 152 | + " <td>11766</td>\n", |
| 153 | + " <td>2261</td>\n", |
| 154 | + " <td>241</td>\n", |
| 155 | + " <td>https://www.zhihu.com/api/v4/answers/1705038141</td>\n", |
| 156 | + " <td><p>英国就别跳了,马上自身难保国将不国,大英帝国很大概率从此不复存在,就这样还有闲工夫关心...</td>\n", |
| 157 | + " </tr>\n", |
| 158 | + " <tr>\n", |
| 159 | + " <th>2</th>\n", |
| 160 | + " <td>甄昊元</td>\n", |
| 161 | + " <td>1</td>\n", |
| 162 | + " <td>65506</td>\n", |
| 163 | + " <td>5543</td>\n", |
| 164 | + " <td>311</td>\n", |
| 165 | + " <td>https://www.zhihu.com/api/v4/answers/1705222926</td>\n", |
| 166 | + " <td><p>第一条规则:不要威胁我共,你进一步,我共也会进一步,模糊地带都是这么搞没的;</p><...</td>\n", |
| 167 | + " </tr>\n", |
| 168 | + " <tr>\n", |
| 169 | + " <th>3</th>\n", |
| 170 | + " <td>远方青木</td>\n", |
| 171 | + " <td>-1</td>\n", |
| 172 | + " <td>452933</td>\n", |
| 173 | + " <td>1514</td>\n", |
| 174 | + " <td>56</td>\n", |
| 175 | + " <td>https://www.zhihu.com/api/v4/answers/1708705645</td>\n", |
| 176 | + " <td><p>自2021年1月31日起,中国不再承认英国颁发的BNO护照为有效证件。</p><fig...</td>\n", |
| 177 | + " </tr>\n", |
| 178 | + " <tr>\n", |
| 179 | + " <th>4</th>\n", |
| 180 | + " <td>观察者网</td>\n", |
| 181 | + " <td>-1</td>\n", |
| 182 | + " <td>630076</td>\n", |
| 183 | + " <td>1783</td>\n", |
| 184 | + " <td>130</td>\n", |
| 185 | + " <td>https://www.zhihu.com/api/v4/answers/1705612787</td>\n", |
| 186 | + " <td><h2><b><a href=\"https://link.zhihu.com/?target...</td>\n", |
| 187 | + " </tr>\n", |
| 188 | + " <tr>\n", |
| 189 | + " <th>...</th>\n", |
| 190 | + " <td>...</td>\n", |
| 191 | + " <td>...</td>\n", |
| 192 | + " <td>...</td>\n", |
| 193 | + " <td>...</td>\n", |
| 194 | + " <td>...</td>\n", |
| 195 | + " <td>...</td>\n", |
| 196 | + " <td>...</td>\n", |
| 197 | + " </tr>\n", |
| 198 | + " <tr>\n", |
| 199 | + " <th>298</th>\n", |
| 200 | + " <td>Lda</td>\n", |
| 201 | + " <td>-1</td>\n", |
| 202 | + " <td>22</td>\n", |
| 203 | + " <td>1</td>\n", |
| 204 | + " <td>2</td>\n", |
| 205 | + " <td>https://www.zhihu.com/api/v4/answers/1704749826</td>\n", |
| 206 | + " <td><p>以前香港只有居民没有国民。</p><p>今后香港居民要被分化为国民和外国民两种存在了。...</td>\n", |
| 207 | + " </tr>\n", |
| 208 | + " <tr>\n", |
| 209 | + " <th>299</th>\n", |
| 210 | + " <td>知乎用户</td>\n", |
| 211 | + " <td>1</td>\n", |
| 212 | + " <td>2</td>\n", |
| 213 | + " <td>0</td>\n", |
| 214 | + " <td>2</td>\n", |
| 215 | + " <td>https://www.zhihu.com/api/v4/answers/1705839209</td>\n", |
| 216 | + " <td><p>我爸爸上班被老总吼,回家打我一巴掌。</p></td>\n", |
| 217 | + " </tr>\n", |
| 218 | + " <tr>\n", |
| 219 | + " <th>300</th>\n", |
| 220 | + " <td>脱腻屎塔克</td>\n", |
| 221 | + " <td>1</td>\n", |
| 222 | + " <td>6</td>\n", |
| 223 | + " <td>0</td>\n", |
| 224 | + " <td>25</td>\n", |
| 225 | + " <td>https://www.zhihu.com/api/v4/answers/1705807033</td>\n", |
| 226 | + " <td><p>有个问题,香港来内地不是一直使用港澳通行证吗,为什么要用b n o .再来,谁是b n...</td>\n", |
| 227 | + " </tr>\n", |
| 228 | + " <tr>\n", |
| 229 | + " <th>301</th>\n", |
| 230 | + " <td>pxgpprr</td>\n", |
| 231 | + " <td>-1</td>\n", |
| 232 | + " <td>0</td>\n", |
| 233 | + " <td>0</td>\n", |
| 234 | + " <td>2</td>\n", |
| 235 | + " <td>https://www.zhihu.com/api/v4/answers/1705356879</td>\n", |
| 236 | + " <td><p>象征性而已啦,这护照的意义是帮助持有者在英国居住,又不是离开香港,人家换个香港护照离境...</td>\n", |
| 237 | + " </tr>\n", |
| 238 | + " <tr>\n", |
| 239 | + " <th>302</th>\n", |
| 240 | + " <td>吴名仕</td>\n", |
| 241 | + " <td>1</td>\n", |
| 242 | + " <td>2059</td>\n", |
| 243 | + " <td>1</td>\n", |
| 244 | + " <td>0</td>\n", |
| 245 | + " <td>https://www.zhihu.com/api/v4/answers/1704486736</td>\n", |
| 246 | + " <td><p>这相当于宣布领取BNO的居民为不受欢迎人士,限期离境。</p></td>\n", |
| 247 | + " </tr>\n", |
| 248 | + " </tbody>\n", |
| 249 | + "</table>\n", |
| 250 | + "<p>303 rows × 7 columns</p>\n", |
| 251 | + "</div>" |
| 252 | + ], |
| 253 | + "text/plain": [ |
| 254 | + " name gender follower_count voteup_count comment_count \\\n", |
| 255 | + "0 李珀河 -1 23 1574 203 \n", |
| 256 | + "1 洛神赋天下第一 1 11766 2261 241 \n", |
| 257 | + "2 甄昊元 1 65506 5543 311 \n", |
| 258 | + "3 远方青木 -1 452933 1514 56 \n", |
| 259 | + "4 观察者网 -1 630076 1783 130 \n", |
| 260 | + ".. ... ... ... ... ... \n", |
| 261 | + "298 Lda -1 22 1 2 \n", |
| 262 | + "299 知乎用户 1 2 0 2 \n", |
| 263 | + "300 脱腻屎塔克 1 6 0 25 \n", |
| 264 | + "301 pxgpprr -1 0 0 2 \n", |
| 265 | + "302 吴名仕 1 2059 1 0 \n", |
| 266 | + "\n", |
| 267 | + " url \\\n", |
| 268 | + "0 https://www.zhihu.com/api/v4/answers/1704574740 \n", |
| 269 | + "1 https://www.zhihu.com/api/v4/answers/1705038141 \n", |
| 270 | + "2 https://www.zhihu.com/api/v4/answers/1705222926 \n", |
| 271 | + "3 https://www.zhihu.com/api/v4/answers/1708705645 \n", |
| 272 | + "4 https://www.zhihu.com/api/v4/answers/1705612787 \n", |
| 273 | + ".. ... \n", |
| 274 | + "298 https://www.zhihu.com/api/v4/answers/1704749826 \n", |
| 275 | + "299 https://www.zhihu.com/api/v4/answers/1705839209 \n", |
| 276 | + "300 https://www.zhihu.com/api/v4/answers/1705807033 \n", |
| 277 | + "301 https://www.zhihu.com/api/v4/answers/1705356879 \n", |
| 278 | + "302 https://www.zhihu.com/api/v4/answers/1704486736 \n", |
| 279 | + "\n", |
| 280 | + " content \n", |
| 281 | + "0 <p>香港特区政府3日表示将安排专机接载滞留秘鲁的香港居民由利马飞往伦敦,再转乘已预留机位的... \n", |
| 282 | + "1 <p>英国就别跳了,马上自身难保国将不国,大英帝国很大概率从此不复存在,就这样还有闲工夫关心... \n", |
| 283 | + "2 <p>第一条规则:不要威胁我共,你进一步,我共也会进一步,模糊地带都是这么搞没的;</p><... \n", |
| 284 | + "3 <p>自2021年1月31日起,中国不再承认英国颁发的BNO护照为有效证件。</p><fig... \n", |
| 285 | + "4 <h2><b><a href=\"https://link.zhihu.com/?target... \n", |
| 286 | + ".. ... \n", |
| 287 | + "298 <p>以前香港只有居民没有国民。</p><p>今后香港居民要被分化为国民和外国民两种存在了。... \n", |
| 288 | + "299 <p>我爸爸上班被老总吼,回家打我一巴掌。</p> \n", |
| 289 | + "300 <p>有个问题,香港来内地不是一直使用港澳通行证吗,为什么要用b n o .再来,谁是b n... \n", |
| 290 | + "301 <p>象征性而已啦,这护照的意义是帮助持有者在英国居住,又不是离开香港,人家换个香港护照离境... \n", |
| 291 | + "302 <p>这相当于宣布领取BNO的居民为不受欢迎人士,限期离境。</p> \n", |
| 292 | + "\n", |
| 293 | + "[303 rows x 7 columns]" |
| 294 | + ] |
| 295 | + }, |
| 296 | + "execution_count": 4, |
| 297 | + "metadata": {}, |
| 298 | + "output_type": "execute_result" |
| 299 | + } |
| 300 | + ], |
| 301 | + "source": [ |
# Show the scraped answers as the cell's output.
file
| 303 | + ] |
| 304 | + }, |
| 305 | + { |
| 306 | + "cell_type": "code", |
| 307 | + "execution_count": 5, |
| 308 | + "metadata": {}, |
| 309 | + "outputs": [], |
| 310 | + "source": [ |
# Write the scraped answers to CSV; "utf-8-sig" is UTF-8 with a leading BOM.
# NOTE(review): the filename is spelled "zhuhu" — presumably a typo for
# "zhihu"; left unchanged in case something else reads this path.
file.to_csv("zhuhu_441839927.csv", encoding="utf-8-sig")
| 312 | + ] |
| 313 | + } |
| 314 | + ], |
| 315 | + "metadata": { |
| 316 | + "kernelspec": { |
| 317 | + "display_name": "Python 3", |
| 318 | + "language": "python", |
| 319 | + "name": "python3" |
| 320 | + }, |
| 321 | + "language_info": { |
| 322 | + "codemirror_mode": { |
| 323 | + "name": "ipython", |
| 324 | + "version": 3 |
| 325 | + }, |
| 326 | + "file_extension": ".py", |
| 327 | + "mimetype": "text/x-python", |
| 328 | + "name": "python", |
| 329 | + "nbconvert_exporter": "python", |
| 330 | + "pygments_lexer": "ipython3", |
| 331 | + "version": "3.8.2" |
| 332 | + } |
| 333 | + }, |
| 334 | + "nbformat": 4, |
| 335 | + "nbformat_minor": 4 |
| 336 | +} |
0 commit comments