rasbt
diff --git a/‎.travis/install_travis_env.sh
Lines changed: 1 addition & 0 deletions b/‎.travis/install_travis_env.sh
Lines changed: 1 addition & 0 deletions
diff --git a/‎code/ch08/ch08.ipynb
Lines changed: 95 additions & 44 deletions b/‎code/ch08/ch08.ipynb
Lines changed: 95 additions & 44 deletions
diff --git a/‎code/ch08/movie_data.csv.zip renamed to ‎code/ch08/movie_data.csv.gz
25.3 MB b/‎code/ch08/movie_data.csv.zip renamed to ‎code/ch08/movie_data.csv.gz
25.3 MB
@@ -60,4 +60,5 @@ else
 fi
 
 python -c "import tensorflow; print('tensorflow %s' % tensorflow.__version__)"
+python -c "import os; print(os.environ)"
 
@@ -148,7 +148,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 22,
    "metadata": {
     "collapsed": true
    },
@@ -185,16 +185,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "0%                          100%\n",
-      "[##############################] | ETA: 00:00:00\n",
-      "Total time elapsed: 00:01:28\n"
+      "0% [##############################] 100% | ETA: 00:00:00\n",
+      "Total time elapsed: 00:02:21\n"
      ]
     }
    ],
@@ -233,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 24,
    "metadata": {
     "collapsed": true
    },
@@ -254,7 +253,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 25,
    "metadata": {
     "collapsed": true
    },
@@ -265,13 +264,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
@@ -283,17 +295,17 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>\"Two Hands\" is a good addition to the Australi...</td>\n",
+       "      <td>In 1974, the teenager Martha Moxley (Maggie Gr...</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>It's New Year's eve, a cop-killer (in the form...</td>\n",
+       "      <td>OK... so... I really like Kris Kristofferson a...</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>I figured that any horror film with Orson Well...</td>\n",
+       "      <td>***SPOILER*** Do not read this, if you think a...</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -302,12 +314,12 @@
       ],
       "text/plain": [
        "                                              review  sentiment\n",
-       "0  \"Two Hands\" is a good addition to the Australi...          1\n",
-       "1  It's New Year's eve, a cop-killer (in the form...          0\n",
-       "2  I figured that any horror film with Orson Well...          0"
+       "0  In 1974, the teenager Martha Moxley (Maggie Gr...          1\n",
+       "1  OK... so... I really like Kris Kristofferson a...          0\n",
+       "2  ***SPOILER*** Do not read this, if you think a...          0"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -788,7 +800,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 9,
    "metadata": {
     "collapsed": true
    },
@@ -808,7 +820,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -817,7 +829,7 @@
        "['runners', 'like', 'running', 'and', 'thus', 'they', 'run']"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -828,7 +840,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -837,7 +849,7 @@
        "['runner', 'like', 'run', 'and', 'thu', 'they', 'run']"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -848,14 +860,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...\n",
+      "[nltk_data] Downloading package stopwords to\n",
+      "[nltk_data]     /Users/sebastian/nltk_data...\n",
       "[nltk_data]   Package stopwords is already up-to-date!\n"
      ]
     },
@@ -865,7 +878,7 @@
        "True"
       ]
      },
-     "execution_count": 22,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -878,7 +891,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -887,7 +900,7 @@
        "['runner', 'like', 'run', 'run', 'lot']"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -981,9 +994,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Important Note**\n",
+    "**Important Note about `n_jobs`**\n",
     "\n",
-    "Please note that it is highly recommended to use `n_jobs=-1` (instead of `n_jobs=1`) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the `n_jobs=-1` setting related to pickling the tokenizer and tokenizer_porter functions for multiprocessing on Windows. Another workaround would be to replace those two functions, `[tokenizer, tokenizer_porter]`, with `[str.split]`. However, note that the replacement by the simple str.split would not support stemming.\n"
+    "Please note that it is highly recommended to use `n_jobs=-1` (instead of `n_jobs=1`) in the previous code example to utilize all available cores on your machine and speed up the grid search. However, some Windows users reported issues when running the previous code with the `n_jobs=-1` setting related to pickling the tokenizer and tokenizer_porter functions for multiprocessing on Windows. Another workaround would be to replace those two functions, `[tokenizer, tokenizer_porter]`, with `[str.split]`. However, note that the replacement by the simple `str.split` would not support stemming."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Important Note about the running time**\n",
+    "\n",
+    "Executing the following code cell **may take up to 30-60 min** depending on your machine, since based on the parameter grid we defined, there are `2*2*2*3*5 + 2*2*2*3*5 = 240` models to fit.\n",
+    "\n",
+    "If you do not wish to wait so long, you could reduce the size of the dataset by decreasing the number of training samples, for example, as follows:\n",
+    "\n",
+    "    X_train = df.loc[:2500, 'review'].values\n",
+    "    y_train = df.loc[:2500, 'sentiment'].values\n",
+    "    \n",
+    "However, note that decreasing the training set size to such a small number will likely result in poorly performing models. Alternatively, you can delete parameters from the grid above to reduce the number of models to fit -- for example, by using the following:\n",
+    "\n",
+    "    param_grid = [{'vect__ngram_range': [(1, 1)],\n",
+    "                   'vect__stop_words': [stop, None],\n",
+    "                   'vect__tokenizer': [tokenizer],\n",
+    "                   'clf__penalty': ['l1', 'l2'],\n",
+    "                   'clf__C': [1.0, 10.0]},\n",
+    "                  ]"
    ]
   },
   {
@@ -1255,7 +1291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 27,
    "metadata": {
     "collapsed": true
    },
@@ -1284,17 +1320,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "('\"\"\"Two Hands\"\" is a good addition to the Australian Film Catalogue.<br /><br />It is that curious mix of real life, surreal life, comedy, tragedy and love the Australians have developed on their own.<br /><br />Heath Ledger is basically a good if naive guy. Wanting to get on he falls in with a local \"\"Big\"\" Man Pando (Bryan Brown). But at the very moment he sets a first toe on the dark path to crime he meets Alex (Rose Byrne). Here is the cause of the error to change his life in ways unexpected. After getting on the wrong side of Pando accidentally, things get very bad very quickly and if not for a little otherworldly help this would have been a short sad film. Yes, Jimmy learns a few lessons in life and no one escapes uninjured in one way or another but at the end of it has a feel good feel to it. Although there is a lawless theme through the story, it is not glorified and helps to show how destructive crime can be on normal lives.<br /><br />Heath Ledger is excellent as Jimmy, innocent and savvy at the same time, Rose is hypnotic as Alex and Bryan is marvellous as usual. This is a small ensemble of characters are believable and I found myself caring about the good guys and disliking the baddies.<br /><br />This DVD is usually in the cheap aisle so I would recommend adding it too your DVD collection, it would be money well spent.<br /><br />8 out of 10\"',\n",
+       "('\"In 1974, the teenager Martha Moxley (Maggie Grace) moves to the high-class area of Belle Haven, Greenwich, Connecticut. On the Mischief Night, eve of Halloween, she was murdered in the backyard of her house and her murder remained unsolved. Twenty-two years later, the writer Mark Fuhrman (Christopher Meloni), who is a former LA detective that has fallen in disgrace for perjury in O.J. Simpson trial and moved to Idaho, decides to investigate the case with his partner Stephen Weeks (Andrew Mitchell) with the purpose of writing a book. The locals squirm and do not welcome them, but with the support of the retired detective Steve Carroll (Robert Forster) that was in charge of the investigation in the 70\\'s, they discover the criminal and a net of power and money to cover the murder.<br /><br />\"\"Murder in Greenwich\"\" is a good TV movie, with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a Kennedy. The powerful and rich family used their influence to cover the murder for more than twenty years. However, a snoopy detective and convicted perjurer in disgrace was able to disclose how the hideous crime was committed. The screenplay shows the investigation of Mark and the last days of Martha in parallel, but there is a lack of the emotion in the dramatization. My vote is seven.<br /><br />Title (Brazil): Not Available\"',\n",
        " 1)"
       ]
      },
-     "execution_count": 34,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1305,7 +1341,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 29,
    "metadata": {
     "collapsed": true
    },
@@ -1325,11 +1361,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/sebastian/miniconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:73: DeprecationWarning: n_iter parameter is deprecated in 0.19 and will be removed in 0.21. Use max_iter and tol instead.\n",
+      "  DeprecationWarning)\n"
+     ]
+    }
+   ],
    "source": [
     "from sklearn.feature_extraction.text import HashingVectorizer\n",
     "from sklearn.linear_model import SGDClassifier\n",
@@ -1343,18 +1386,26 @@
     "doc_stream = stream_docs(path='movie_data.csv')"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Note**\n",
+    "\n",
+    "- You can replace `SGDClassifier(n_iter, ...)` by `SGDClassifier(max_iter, ...)` in scikit-learn >= 0.19. The `n_iter` parameter is used here deliberately, because some people still use scikit-learn 0.18.\n"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "0%                          100%\n",
-      "[##############################] | ETA: 00:00:00\n",
-      "Total time elapsed: 00:00:39\n"
+      "0% [##############################] 100% | ETA: 00:00:00\n",
+      "Total time elapsed: 00:00:31\n"
      ]
     }
    ],
@@ -1374,14 +1425,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy: 0.878\n"
+      "Accuracy: 0.867\n"
      ]
     }
    ],
@@ -1393,7 +1444,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 33,
    "metadata": {
     "collapsed": true
    },
Original file line number	Diff line number	Diff line change
`@@ -60,4 +60,5 @@ else`
`60`	`60`	`fi`
`61`	`61`
`62`	`62`	`python -c "import tensorflow; print('tensorflow %s' % tensorflow.__version__)"`
	`63`	`+python -c "import os; print(os.environ)"`
`63`	`64`