deploy: 175959d

JesperDramsch · Feb 11, 2024 · 62c69e1 · 62c69e1
1 parent aba1832
commit 62c69e1
Show file tree

Hide file tree

Showing 12 changed files with 144 additions and 144 deletions.
diff --git a/_images/16f87ceed29993be6fd6d67e6d8c467b5c2efb5382dcf3f246cc29c208139e6e.png b/_images/16f87ceed29993be6fd6d67e6d8c467b5c2efb5382dcf3f246cc29c208139e6e.png
diff --git a/_images/4d9c80cb801cc13c75abdec0fa51a2aed2125b4909a8b506c8ed2136a2bc13d2.png b/_images/4d9c80cb801cc13c75abdec0fa51a2aed2125b4909a8b506c8ed2136a2bc13d2.png
diff --git a/_images/5dd4146ebf31fbe53c566ab5a091ff1d1f709d050d2aefc911ebec8a74b404e3.png b/_images/5dd4146ebf31fbe53c566ab5a091ff1d1f709d050d2aefc911ebec8a74b404e3.png
diff --git a/_images/986d3355a04a331a4f82450cdb3d395e2e4c3f0e6bc69ce6ee11d573526c9fb5.png b/_images/986d3355a04a331a4f82450cdb3d395e2e4c3f0e6bc69ce6ee11d573526c9fb5.png
diff --git a/_images/ef0beddf67764f8f44cb60e7f2657fa94da63b2d1a7297eabf6c38fd95e9053e.png b/_images/ef0beddf67764f8f44cb60e7f2657fa94da63b2d1a7297eabf6c38fd95e9053e.png
diff --git a/_images/ef2b25a1ae4803b6969da817341409d2fc81bfe6ca226484623cf1e93a5284bf.png b/_images/ef2b25a1ae4803b6969da817341409d2fc81bfe6ca226484623cf1e93a5284bf.png
diff --git a/_images/f408cd7c3d2fd283ba309ea8c13579567d53f9d44ec71b781d036709a32b4587.png b/_images/f408cd7c3d2fd283ba309ea8c13579567d53f9d44ec71b781d036709a32b4587.png
diff --git a/_sources/notebooks/5-interpretability.ipynb b/_sources/notebooks/5-interpretability.ipynb
@@ -270,9 +270,10 @@
     "from sklearn.inspection import PartialDependenceDisplay, partial_dependence\n",
     "from matplotlib import pyplot as plt\n",
     "\n",
+    "\n",
     "pd_results = partial_dependence(model, X_train.sample(20), num_features)\n",
     "print(pd_results.keys())\n",
-    "print(f\"Example Values: {pd_results['values'][0]}, Average: {pd_results['average'][0].mean(axis=0)}\")"
+    "print(f\"Example Values: {pd_results['values'][0]}, Average: {pd_results['average'][0][0].mean(axis=0)}\")"
    ]
   },
   {
@@ -467,7 +468,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
    "metadata": {
     "execution": {
      "iopub.execute_input": "2022-12-13T01:42:23.343022Z",
@@ -490,14 +491,22 @@
     }
    ],
    "source": [
-    "pd.Series(rf.named_steps[\"classifier\"].feature_importances_, index=num_features+['F', 'M']).plot.bar()\n",
+    "pd.Series(rf.named_steps[\"classifier\"].feature_importances_, index=num_features+['Female', 'Male']).plot.bar()\n",
     "plt.show()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
-   "source": []
+   "source": [
+    "The tree-based feature importance shows the importances \"as the random forest sees them\", which means we get the `Sex` feature split into male and female from the OneHotEncoding. This also means that these two encoded categorical features are strongly correlated with each other.\n",
+    "\n",
+    "We can clearly see that the `Culmen length` is the most important feature in determining which penguin we're facing. `Culmen depth` seems to be slightly less important than `Flipper length`. `Sex` seems to be entirely unimportant.\n",
+    "\n",
+    "Now we can use the more sophisticated permutation importance. \n",
+    "\n",
+    "Luckily, scikit-learn implements this feature for us and we can just import it:"
+   ]
   },
   {
    "cell_type": "code",
@@ -527,13 +536,19 @@
     "from sklearn.inspection import permutation_importance\n",
     "\n",
     "result = permutation_importance(\n",
-    "    rf, X_test, y_test, n_repeats=10, random_state=42\n",
+    "    rf, X_train, y_train, n_repeats=10, random_state=42\n",
     ")\n",
     "\n",
-    "pd.Series(result.importances_mean, index=features).plot.bar()\n",
+    "fi_rf_train = pd.Series(result.importances_mean, index=features)\n",
+    "fi_rf_train.plot.bar()\n",
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 12,
@@ -560,13 +575,26 @@
    ],
    "source": [
     "result = permutation_importance(\n",
-    "    model, X_test, y_test, n_repeats=10, random_state=42\n",
+    "    model, X_train, y_train, n_repeats=10, random_state=42\n",
     ")\n",
     "\n",
-    "pd.Series(result.importances_mean, index=features).plot.bar()\n",
+    "fi_svm_train = pd.Series(result.importances_mean, index=features)\n",
+    "fi_svm_train.plot.bar()\n",
     "plt.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# permutation_importance returns a Bunch; take .importances_mean (as in the train cells) so each Series holds one value per feature.\n",
+    "fi_rf_test = pd.Series(permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42).importances_mean, index=features)\n",
+    "fi_svm_test = pd.Series(permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42).importances_mean, index=features)\n",
+    "pd.DataFrame({\"RF Train\": fi_rf_train, \"SVM Train\": fi_svm_train, \"RF Test\": fi_rf_test, \"SVM Test\": fi_svm_test}).plot.bar()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/notebooks/0-basic-data-prep-and-model.html b/notebooks/0-basic-data-prep-and-model.html
@@ -984,38 +984,38 @@ <h2>Machine Learning<a class="headerlink" href="#machine-learning" title="Permal
   </thead>
   <tbody>
     <tr>
-      <th>38</th>
-      <td>37.6</td>
-      <td>19.3</td>
-      <td>181.0</td>
+      <th>34</th>
+      <td>36.4</td>
+      <td>17.0</td>
+      <td>195.0</td>
       <td>FEMALE</td>
     </tr>
     <tr>
-      <th>93</th>
-      <td>39.6</td>
-      <td>18.1</td>
-      <td>186.0</td>
+      <th>243</th>
+      <td>52.2</td>
+      <td>17.1</td>
+      <td>228.0</td>
       <td>MALE</td>
     </tr>
     <tr>
-      <th>152</th>
-      <td>46.1</td>
-      <td>13.2</td>
-      <td>211.0</td>
+      <th>312</th>
+      <td>47.6</td>
+      <td>18.3</td>
+      <td>195.0</td>
       <td>FEMALE</td>
     </tr>
     <tr>
-      <th>209</th>
-      <td>45.5</td>
-      <td>15.0</td>
-      <td>220.0</td>
+      <th>285</th>
+      <td>51.3</td>
+      <td>19.9</td>
+      <td>198.0</td>
       <td>MALE</td>
     </tr>
     <tr>
-      <th>161</th>
-      <td>46.8</td>
-      <td>15.4</td>
-      <td>215.0</td>
+      <th>46</th>
+      <td>41.1</td>
+      <td>19.0</td>
+      <td>182.0</td>
       <td>MALE</td>
     </tr>
     <tr>
@@ -1026,39 +1026,39 @@ <h2>Machine Learning<a class="headerlink" href="#machine-learning" title="Permal
       <td>...</td>
     </tr>
     <tr>
-      <th>91</th>
-      <td>41.1</td>
-      <td>18.1</td>
-      <td>205.0</td>
-      <td>MALE</td>
+      <th>44</th>
+      <td>37.0</td>
+      <td>16.9</td>
+      <td>185.0</td>
+      <td>FEMALE</td>
     </tr>
     <tr>
-      <th>183</th>
-      <td>42.8</td>
-      <td>14.2</td>
-      <td>209.0</td>
+      <th>236</th>
+      <td>44.9</td>
+      <td>13.8</td>
+      <td>212.0</td>
       <td>FEMALE</td>
     </tr>
     <tr>
-      <th>286</th>
-      <td>46.6</td>
-      <td>17.8</td>
+      <th>83</th>
+      <td>35.1</td>
+      <td>19.4</td>
       <td>193.0</td>
-      <td>FEMALE</td>
+      <td>MALE</td>
     </tr>
     <tr>
-      <th>337</th>
-      <td>46.8</td>
-      <td>16.5</td>
-      <td>189.0</td>
-      <td>FEMALE</td>
+      <th>31</th>
+      <td>37.2</td>
+      <td>18.1</td>
+      <td>178.0</td>
+      <td>MALE</td>
     </tr>
     <tr>
-      <th>330</th>
-      <td>42.5</td>
-      <td>17.3</td>
-      <td>187.0</td>
-      <td>FEMALE</td>
+      <th>121</th>
+      <td>37.7</td>
+      <td>19.8</td>
+      <td>198.0</td>
+      <td>MALE</td>
     </tr>
   </tbody>
 </table>
@@ -1095,48 +1095,48 @@ <h2>Machine Learning<a class="headerlink" href="#machine-learning" title="Permal
   </thead>
   <tbody>
     <tr>
-      <th>38</th>
+      <th>34</th>
       <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
     <tr>
-      <th>93</th>
-      <td>Adelie Penguin (Pygoscelis adeliae)</td>
+      <th>243</th>
+      <td>Gentoo penguin (Pygoscelis papua)</td>
     </tr>
     <tr>
-      <th>152</th>
-      <td>Gentoo penguin (Pygoscelis papua)</td>
+      <th>312</th>
+      <td>Chinstrap penguin (Pygoscelis antarctica)</td>
     </tr>
     <tr>
-      <th>209</th>
-      <td>Gentoo penguin (Pygoscelis papua)</td>
+      <th>285</th>
+      <td>Chinstrap penguin (Pygoscelis antarctica)</td>
     </tr>
     <tr>
-      <th>161</th>
-      <td>Gentoo penguin (Pygoscelis papua)</td>
+      <th>46</th>
+      <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
     <tr>
       <th>...</th>
       <td>...</td>
     </tr>
     <tr>
-      <th>91</th>
+      <th>44</th>
       <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
     <tr>
-      <th>183</th>
+      <th>236</th>
       <td>Gentoo penguin (Pygoscelis papua)</td>
     </tr>
     <tr>
-      <th>286</th>
-      <td>Chinstrap penguin (Pygoscelis antarctica)</td>
+      <th>83</th>
+      <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
     <tr>
-      <th>337</th>
-      <td>Chinstrap penguin (Pygoscelis antarctica)</td>
+      <th>31</th>
+      <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
     <tr>
-      <th>330</th>
-      <td>Chinstrap penguin (Pygoscelis antarctica)</td>
+      <th>121</th>
+      <td>Adelie Penguin (Pygoscelis adeliae)</td>
     </tr>
   </tbody>
 </table>
@@ -1278,7 +1278,7 @@ <h3>Model Training<a class="headerlink" href="#model-training" title="Permalink
 </div>
 </div>
 <div class="cell_output docutils container">
-<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>0.9900990099009901
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>0.9801980198019802
 </pre></div>
 </div>
 </div>

diff --git a/notebooks/1-model-evaluation.html b/notebooks/1-model-evaluation.html
@@ -1146,8 +1146,8 @@ <h2><span class="section-number">1.3.5. </span>Choosing the appropriate Evaluati
 </div>
 </div>
 <div class="cell_output docutils container">
-<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>{&#39;fit_time&#39;: array([0.00581956, 0.00523829, 0.00519466, 0.00517082, 0.00550675]),
- &#39;score_time&#39;: array([0.00410533, 0.00397706, 0.00397515, 0.00400424, 0.00418091]),
+<div class="output text_plain highlight-myst-ansi notranslate"><div class="highlight"><pre><span></span>{&#39;fit_time&#39;: array([0.00599885, 0.00516748, 0.00515461, 0.00513673, 0.00516844]),
+ &#39;score_time&#39;: array([0.00435352, 0.00395489, 0.00400567, 0.00393963, 0.00396037]),
  &#39;test_MCC&#39;: array([0.37796447, 0.27863911, 0.40824829, 0.02424643, 0.08625819]),
  &#39;test_ACC&#39;: array([0.73333333, 0.7       , 0.76666667, 0.66666667, 0.62068966])}
 </pre></div>