|
1327 | 1327 | },
|
1328 | 1328 | {
|
1329 | 1329 | "cell_type": "code",
|
1330 |
| - "execution_count": 27, |
1331 |
| - "metadata": { |
1332 |
| - "collapsed": true |
1333 |
| - }, |
| 1330 | + "execution_count": 1, |
| 1331 | + "metadata": {}, |
| 1332 | + "outputs": [], |
| 1333 | + "source": [ |
| 1334 | + "# This cell is not contained in the book but\n", |
| 1335 | + "# added for convenience so that the notebook\n", |
| 1336 | + "# can be executed starting here, without\n", |
| 1337 | + "# executing prior code in this notebook\n", |
| 1338 | + "\n", |
| 1339 | + "import os\n", |
| 1340 | + "import gzip\n", |
| 1341 | + "\n", |
| 1342 | + "\n", |
| 1343 | + "if not os.path.isfile('movie_data.csv'):  # CSV not extracted yet\n", |
| 1344 | + "    if not os.path.isfile('movie_data.csv.gz'):\n", |
| 1345 | + "        print('Please place a copy of the movie_data.csv.gz '\n", |
| 1346 | + "              'in this directory. You can obtain it by '\n", |
| 1347 | + "              'a) executing the code in the beginning of this '\n", |
| 1348 | + "              'notebook or b) by downloading it from GitHub: '\n", |
| 1349 | + "              'https://github.com/rasbt/python-machine-learning-'\n", |
| 1350 | + "              'book-2nd-edition/blob/master/code/ch08/movie_data.csv.gz')\n", |
| 1351 | + "    else:  # archive is present: decompress it into movie_data.csv\n", |
| 1352 | + "        with gzip.open('movie_data.csv.gz', 'rb') as in_f, \\\n", |
| 1353 | + "                open('movie_data.csv', 'wb') as out_f:\n", |
| 1354 | + "            out_f.write(in_f.read())" |
| 1355 | + ] |
| 1356 | + }, |
| 1357 | + { |
| 1358 | + "cell_type": "code", |
| 1359 | + "execution_count": 2, |
| 1360 | + "metadata": {}, |
1334 | 1361 | "outputs": [],
|
1335 | 1362 | "source": [
|
1336 | 1363 | "import numpy as np\n",
|
1337 | 1364 | "import re\n",
|
1338 | 1365 | "from nltk.corpus import stopwords\n",
|
1339 | 1366 | "\n",
|
| 1367 | + "\n", |
| 1368 | + "# `stop` is defined the same way as earlier in this chapter.\n", |
| 1369 | + "# It is repeated here for convenience, so that this section\n", |
| 1370 | + "# can be run standalone without executing prior code\n", |
| 1371 | + "# in this notebook\n", |
| 1372 | + "stop = stopwords.words('english')\n", |
| 1373 | + "\n", |
| 1374 | + "\n", |
1340 | 1375 | "def tokenizer(text):\n",
|
1341 | 1376 | " text = re.sub('<[^>]*>', '', text)\n",
|
1342 | 1377 | " emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
|
|
1356 | 1391 | },
|
1357 | 1392 | {
|
1358 | 1393 | "cell_type": "code",
|
1359 |
| - "execution_count": 28, |
| 1394 | + "execution_count": 3, |
1360 | 1395 | "metadata": {},
|
1361 | 1396 | "outputs": [
|
1362 | 1397 | {
|
|
1366 | 1401 | " 1)"
|
1367 | 1402 | ]
|
1368 | 1403 | },
|
1369 |
| - "execution_count": 28, |
| 1404 | + "execution_count": 3, |
1370 | 1405 | "metadata": {},
|
1371 | 1406 | "output_type": "execute_result"
|
1372 | 1407 | }
|
|
1377 | 1412 | },
|
1378 | 1413 | {
|
1379 | 1414 | "cell_type": "code",
|
1380 |
| - "execution_count": 29, |
1381 |
| - "metadata": { |
1382 |
| - "collapsed": true |
1383 |
| - }, |
| 1415 | + "execution_count": 4, |
| 1416 | + "metadata": {}, |
1384 | 1417 | "outputs": [],
|
1385 | 1418 | "source": [
|
1386 | 1419 | "def get_minibatch(doc_stream, size):\n",
|
|
1397 | 1430 | },
|
1398 | 1431 | {
|
1399 | 1432 | "cell_type": "code",
|
1400 |
| - "execution_count": 30, |
| 1433 | + "execution_count": 5, |
1401 | 1434 | "metadata": {},
|
1402 |
| - "outputs": [ |
1403 |
| - { |
1404 |
| - "name": "stderr", |
1405 |
| - "output_type": "stream", |
1406 |
| - "text": [ |
1407 |
| - "/Users/sebastian/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n", |
1408 |
| - " DeprecationWarning)\n" |
1409 |
| - ] |
1410 |
| - } |
1411 |
| - ], |
| 1435 | + "outputs": [], |
1412 | 1436 | "source": [
|
1413 | 1437 | "from sklearn.feature_extraction.text import HashingVectorizer\n",
|
1414 | 1438 | "from sklearn.linear_model import SGDClassifier\n",
|
1415 | 1439 | "\n",
|
| 1440 | + "\n", |
1416 | 1441 | "vect = HashingVectorizer(decode_error='ignore', \n",
|
1417 | 1442 | " n_features=2**21,\n",
|
1418 | 1443 | " preprocessor=None, \n",
|
1419 |
| - " tokenizer=tokenizer)\n", |
1420 |
| - "\n", |
1421 |
| - "clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n", |
1422 |
| - "doc_stream = stream_docs(path='movie_data.csv')" |
| 1444 | + " tokenizer=tokenizer)" |
1423 | 1445 | ]
|
1424 | 1446 | },
|
1425 | 1447 | {
|
|
1428 | 1450 | "source": [
|
1429 | 1451 | "**Note**\n",
|
1430 | 1452 | "\n",
|
1431 |
| - "- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19. The `n_iter` parameter is used here deriberately, because some people still use scikit-learn 0.18.\n" |
| 1453 | + "- You can replace `SGDClassifier(n_iter, ...)` by `SGDClassifier(max_iter, ...)` in scikit-learn >= 0.19." |
1432 | 1454 | ]
|
1433 | 1455 | },
|
1434 | 1456 | {
|
1435 | 1457 | "cell_type": "code",
|
1436 |
| - "execution_count": 31, |
| 1458 | + "execution_count": 6, |
| 1459 | + "metadata": {}, |
| 1460 | + "outputs": [], |
| 1461 | + "source": [ |
| 1462 | + "from distutils.version import LooseVersion as Version\n", |
| 1463 | + "from sklearn import __version__ as sklearn_version\n", |
| 1464 | + "\n", |
| 1465 | + "\n", |
| 1466 | + "if Version(sklearn_version) < '0.19':  # `max_iter` replaces `n_iter` from 0.19 on\n", |
| 1467 | + "    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n", |
| 1468 | + "else:\n", |
| 1469 | + "    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)\n", |
| 1470 | + "\n", |
| 1471 | + "\n", |
| 1472 | + "doc_stream = stream_docs(path='movie_data.csv')" |
| 1473 | + ] |
| 1474 | + }, |
| 1475 | + { |
| 1476 | + "cell_type": "code", |
| 1477 | + "execution_count": 7, |
1437 | 1478 | "metadata": {},
|
1438 | 1479 | "outputs": [
|
1439 | 1480 | {
|
1440 | 1481 | "name": "stderr",
|
1441 | 1482 | "output_type": "stream",
|
1442 | 1483 | "text": [
|
1443 | 1484 | "0% [##############################] 100% | ETA: 00:00:00\n",
|
1444 |
| - "Total time elapsed: 00:00:31\n" |
| 1485 | + "Total time elapsed: 00:00:28\n" |
1445 | 1486 | ]
|
1446 | 1487 | }
|
1447 | 1488 | ],
|
|
1461 | 1502 | },
|
1462 | 1503 | {
|
1463 | 1504 | "cell_type": "code",
|
1464 |
| - "execution_count": 32, |
| 1505 | + "execution_count": 8, |
1465 | 1506 | "metadata": {},
|
1466 | 1507 | "outputs": [
|
1467 | 1508 | {
|
|
1480 | 1521 | },
|
1481 | 1522 | {
|
1482 | 1523 | "cell_type": "code",
|
1483 |
| - "execution_count": 33, |
1484 |
| - "metadata": { |
1485 |
| - "collapsed": true |
1486 |
| - }, |
| 1524 | + "execution_count": 9, |
| 1525 | + "metadata": {}, |
1487 | 1526 | "outputs": [],
|
1488 | 1527 | "source": [
|
1489 | 1528 | "clf = clf.partial_fit(X_test, y_test)"
|
|
1792 | 1831 | },
|
1793 | 1832 | {
|
1794 | 1833 | "cell_type": "code",
|
1795 |
| - "execution_count": 1, |
| 1834 | + "execution_count": 10, |
1796 | 1835 | "metadata": {},
|
1797 | 1836 | "outputs": [
|
1798 | 1837 | {
|
1799 | 1838 | "name": "stdout",
|
1800 | 1839 | "output_type": "stream",
|
1801 | 1840 | "text": [
|
1802 | 1841 | "[NbConvertApp] Converting notebook ch08.ipynb to script\n",
|
1803 |
| - "[NbConvertApp] Writing 24613 bytes to ch08.py\n" |
| 1842 | + "[NbConvertApp] Writing 11500 bytes to ch08.txt\n" |
1804 | 1843 | ]
|
1805 | 1844 | }
|
1806 | 1845 | ],
|
|
1816 | 1855 | "language": "python",
|
1817 | 1856 | "name": "python3"
|
1818 | 1857 | },
|
1819 |
| - "language_info": { |
1820 |
| - "codemirror_mode": { |
1821 |
| - "name": "ipython", |
1822 |
| - "version": 3 |
1823 |
| - }, |
1824 |
| - "file_extension": ".py", |
1825 |
| - "mimetype": "text/x-python", |
1826 |
| - "name": "python", |
1827 |
| - "nbconvert_exporter": "python", |
1828 |
| - "pygments_lexer": "ipython3", |
1829 |
| - "version": "3.6.1" |
| 1858 | + "toc": { |
| 1859 | + "nav_menu": {}, |
| 1860 | + "number_sections": true, |
| 1861 | + "sideBar": true, |
| 1862 | + "skip_h1_title": false, |
| 1863 | + "title_cell": "Table of Contents", |
| 1864 | + "title_sidebar": "Contents", |
| 1865 | + "toc_cell": false, |
| 1866 | + "toc_position": {}, |
| 1867 | + "toc_section_display": true, |
| 1868 | + "toc_window_display": false |
1830 | 1869 | }
|
1831 | 1870 | },
|
1832 | 1871 | "nbformat": 4,
|
|
0 commit comments