Skip to content

Commit 5cbb7ca

Browse files
committed
make out-of-core section stand-alone executable in notebook
1 parent b810a64 commit 5cbb7ca

File tree

2 files changed

+96
-47
lines changed

2 files changed

+96
-47
lines changed

code/ch08/ch08.ipynb

Lines changed: 85 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1327,16 +1327,51 @@
13271327
},
13281328
{
13291329
"cell_type": "code",
1330-
"execution_count": 27,
1331-
"metadata": {
1332-
"collapsed": true
1333-
},
1330+
"execution_count": 1,
1331+
"metadata": {},
1332+
"outputs": [],
1333+
"source": [
1334+
"# This cell is not contained in the book but\n",
1335+
"# added for convenience so that the notebook\n",
1336+
"# can be executed starting here, without\n",
1337+
"# executing prior code in this notebook\n",
1338+
"\n",
1339+
"import os\n",
1340+
"import gzip\n",
1341+
"\n",
1342+
"\n",
1343+
"if not os.path.isfile('movie_data.csv'):\n",
1344+
" if not os.path.isfile('movie_data.csv.gz'):\n",
1345+
" print('Please place a copy of the movie_data.csv.gz '\n",
1346+
" 'in this directory. You can obtain it by '\n",
1347+
" 'a) executing the code in the beginning of this '\n",
1348+
" 'notebook or b) by downloading it from GitHub: '\n",
1349+
" 'https://github.com/rasbt/python-machine-learning-'\n",
1350+
" 'book-2nd-edition/blob/master/code/ch08/movie_data.csv.gz')\n",
1351+
" else:\n",
1352+
" with gzip.open('movie_data.csv.gz', 'rb') as in_f, \\\n",
1353+
" open('movie_data.csv', 'wb') as out_f:\n",
1354+
" out_f.write(in_f.read())"
1355+
]
1356+
},
1357+
{
1358+
"cell_type": "code",
1359+
"execution_count": 2,
1360+
"metadata": {},
13341361
"outputs": [],
13351362
"source": [
13361363
"import numpy as np\n",
13371364
"import re\n",
13381365
"from nltk.corpus import stopwords\n",
13391366
"\n",
1367+
"\n",
1368+
"# `stop` is defined the same way as earlier in this chapter.\n",
1369+
"# Added it here for convenience, so that this section\n",
1370+
"# can be run as standalone without executing prior code\n",
1371+
"# in this notebook\n",
1372+
"stop = stopwords.words('english')\n",
1373+
"\n",
1374+
"\n",
13401375
"def tokenizer(text):\n",
13411376
" text = re.sub('<[^>]*>', '', text)\n",
13421377
" emoticons = re.findall('(?::|;|=)(?:-)?(?:\\)|\\(|D|P)', text.lower())\n",
@@ -1356,7 +1391,7 @@
13561391
},
13571392
{
13581393
"cell_type": "code",
1359-
"execution_count": 28,
1394+
"execution_count": 3,
13601395
"metadata": {},
13611396
"outputs": [
13621397
{
@@ -1366,7 +1401,7 @@
13661401
" 1)"
13671402
]
13681403
},
1369-
"execution_count": 28,
1404+
"execution_count": 3,
13701405
"metadata": {},
13711406
"output_type": "execute_result"
13721407
}
@@ -1377,10 +1412,8 @@
13771412
},
13781413
{
13791414
"cell_type": "code",
1380-
"execution_count": 29,
1381-
"metadata": {
1382-
"collapsed": true
1383-
},
1415+
"execution_count": 4,
1416+
"metadata": {},
13841417
"outputs": [],
13851418
"source": [
13861419
"def get_minibatch(doc_stream, size):\n",
@@ -1397,29 +1430,18 @@
13971430
},
13981431
{
13991432
"cell_type": "code",
1400-
"execution_count": 30,
1433+
"execution_count": 5,
14011434
"metadata": {},
1402-
"outputs": [
1403-
{
1404-
"name": "stderr",
1405-
"output_type": "stream",
1406-
"text": [
1407-
"/Users/sebastian/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
1408-
" DeprecationWarning)\n"
1409-
]
1410-
}
1411-
],
1435+
"outputs": [],
14121436
"source": [
14131437
"from sklearn.feature_extraction.text import HashingVectorizer\n",
14141438
"from sklearn.linear_model import SGDClassifier\n",
14151439
"\n",
1440+
"\n",
14161441
"vect = HashingVectorizer(decode_error='ignore', \n",
14171442
" n_features=2**21,\n",
14181443
" preprocessor=None, \n",
1419-
" tokenizer=tokenizer)\n",
1420-
"\n",
1421-
"clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
1422-
"doc_stream = stream_docs(path='movie_data.csv')"
1444+
" tokenizer=tokenizer)"
14231445
]
14241446
},
14251447
{
@@ -1428,20 +1450,39 @@
14281450
"source": [
14291451
"**Note**\n",
14301452
"\n",
1431-
"- You can replace `Perceptron(n_iter, ...)` by `Perceptron(max_iter, ...)` in scikit-learn >= 0.19. The `n_iter` parameter is used here deriberately, because some people still use scikit-learn 0.18.\n"
1453+
"- You can replace `SGDClassifier(n_iter, ...)` by `SGDClassifier(max_iter, ...)` in scikit-learn >= 0.19."
14321454
]
14331455
},
14341456
{
14351457
"cell_type": "code",
1436-
"execution_count": 31,
1458+
"execution_count": 6,
1459+
"metadata": {},
1460+
"outputs": [],
1461+
"source": [
1462+
"from distutils.version import LooseVersion as Version\n",
1463+
"from sklearn import __version__ as sklearn_version\n",
1464+
"\n",
1465+
"\n",
1466+
"if Version(sklearn_version) < '0.19':\n",
1467+
" clf = SGDClassifier(loss='log', random_state=1, n_iter=1)\n",
1468+
"else:\n",
1469+
" clf = SGDClassifier(loss='log', random_state=1, max_iter=1)\n",
1470+
"\n",
1471+
"\n",
1472+
"doc_stream = stream_docs(path='movie_data.csv')"
1473+
]
1474+
},
1475+
{
1476+
"cell_type": "code",
1477+
"execution_count": 7,
14371478
"metadata": {},
14381479
"outputs": [
14391480
{
14401481
"name": "stderr",
14411482
"output_type": "stream",
14421483
"text": [
14431484
"0% [##############################] 100% | ETA: 00:00:00\n",
1444-
"Total time elapsed: 00:00:31\n"
1485+
"Total time elapsed: 00:00:28\n"
14451486
]
14461487
}
14471488
],
@@ -1461,7 +1502,7 @@
14611502
},
14621503
{
14631504
"cell_type": "code",
1464-
"execution_count": 32,
1505+
"execution_count": 8,
14651506
"metadata": {},
14661507
"outputs": [
14671508
{
@@ -1480,10 +1521,8 @@
14801521
},
14811522
{
14821523
"cell_type": "code",
1483-
"execution_count": 33,
1484-
"metadata": {
1485-
"collapsed": true
1486-
},
1524+
"execution_count": 9,
1525+
"metadata": {},
14871526
"outputs": [],
14881527
"source": [
14891528
"clf = clf.partial_fit(X_test, y_test)"
@@ -1792,15 +1831,15 @@
17921831
},
17931832
{
17941833
"cell_type": "code",
1795-
"execution_count": 1,
1834+
"execution_count": 10,
17961835
"metadata": {},
17971836
"outputs": [
17981837
{
17991838
"name": "stdout",
18001839
"output_type": "stream",
18011840
"text": [
18021841
"[NbConvertApp] Converting notebook ch08.ipynb to script\n",
1803-
"[NbConvertApp] Writing 24613 bytes to ch08.py\n"
1842+
"[NbConvertApp] Writing 11500 bytes to ch08.txt\n"
18041843
]
18051844
}
18061845
],
@@ -1816,17 +1855,17 @@
18161855
"language": "python",
18171856
"name": "python3"
18181857
},
1819-
"language_info": {
1820-
"codemirror_mode": {
1821-
"name": "ipython",
1822-
"version": 3
1823-
},
1824-
"file_extension": ".py",
1825-
"mimetype": "text/x-python",
1826-
"name": "python",
1827-
"nbconvert_exporter": "python",
1828-
"pygments_lexer": "ipython3",
1829-
"version": "3.6.1"
1858+
"toc": {
1859+
"nav_menu": {},
1860+
"number_sections": true,
1861+
"sideBar": true,
1862+
"skip_h1_title": false,
1863+
"title_cell": "Table of Contents",
1864+
"title_sidebar": "Contents",
1865+
"toc_cell": false,
1866+
"toc_position": {},
1867+
"toc_section_display": true,
1868+
"toc_window_display": false
18301869
}
18311870
},
18321871
"nbformat": 4,

code/ch08/ch08.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,12 @@
2323
from sklearn.feature_extraction.text import HashingVectorizer
2424
from sklearn.linear_model import SGDClassifier
2525
from sklearn.decomposition import LatentDirichletAllocation
26+
from distutils.version import LooseVersion as Version
27+
from sklearn import __version__ as sklearn_version
28+
29+
30+
# Added version check: older scikit-learn releases take `n_iter`, newer ones take `max_iter`
31+
2632

2733
# *Python Machine Learning 2nd Edition* by [Sebastian Raschka](https://sebastianraschka.com), Packt Publishing Ltd. 2017
2834
#
@@ -577,7 +583,11 @@ def get_minibatch(doc_stream, size):
577583
preprocessor=None,
578584
tokenizer=tokenizer)
579585

580-
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
586+
if Version(sklearn_version) < '0.19':
587+
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
588+
else:
589+
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
590+
581591
doc_stream = stream_docs(path='movie_data.csv')
582592

583593

0 commit comments

Comments
 (0)