Add files via upload

hiteshvaidya · web-flow · commit 51844d966188 · 2018-06-17T16:20:16.000+05:30
diff --git a/Sentiment Analysis.ipynb b/Sentiment Analysis.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -62,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -93,7 +93,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -130,7 +130,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -159,7 +159,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -191,7 +191,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -372,7 +372,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -387,9 +387,42 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00OSTKZWM?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_1\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B013YDFH3Y?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00U8KSNB0/ref=cm_cr_ryp_prd_ttl_sol_22\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00PEJQU9M?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B00K15KRV6/ref=cm_cr_ryp_prd_ttl_sol_22\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00G197Q4M/ref=cm_cr_ryp_prd_ttl_sol_26\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B0193D539M?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B01BO6BYMQ/ref=cm_cr_ryp_prd_ttl_sol_1\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00JEMZYM4/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B00QF5QJR2/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B01CJU9BBM/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00JFNDLRC/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n",
+      "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00EZHM9JE?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n",
+      "  ' that document to Beautiful Soup.' % decoded_markup\n"
+     ]
+    }
+   ],
    "source": [
     "    # Preprocess text data in training set and validation set\n",
     "    x_train_cleaned = []\n",
@@ -415,7 +448,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -507,7 +540,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -529,7 +562,7 @@
        "          verbose=0, warm_start=False)"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -633,24 +666,64 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
+   "source": [
+    "    # Transform the cleaned training text with tfidf (x_train_input is reused by the later model cells), fit a one-vs-rest LinearSVC on it, then predict labels for the cleaned test text\n",
+    "    x_train_input = tfidf.transform(x_train_cleaned)\n",
+    "    svr_lin = LinearSVC(multi_class='ovr',C=1.0,loss='squared_hinge', dual=False)\n",
+    "    svr_lin.fit(x_train_input, y_train)\n",
+    "    y_svr_lin_predicted = svr_lin.predict(tfidf.transform(x_test_cleaned))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    modelEvaluation(y_svr_lin_predicted, y_test)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Functions for Model Evaluation\n",
+    "\n",
+    "scikit-learn provides several functions for model evaluation. To learn more about them, see the links below:\n",
+    "- [accuracy score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score)\n",
+    "- [precision, recall, F-score and support](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html)\n",
+    "- [f1_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score)\n",
+    "- [confusion matrix](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy of this SVM = 0.8917012568575658\n"
+      "Accuracy of this SVM = 0.9409305970799106\n",
+      "Fscore of this SVM = (0.9412812101129703, 0.9409305970799106, 0.9384909185837339, None)\n",
+      "F-1 score of this SVM = 0.9384909185837339\n",
+      "confusion matrix = [[ 7477    33   628]\n",
+      " [  306  1775   655]\n",
+      " [  373    40 23164]]\n"
      ]
     }
    ],
    "source": [
-    "    #x_train_subset = tfidf.transform(x_train_cleaned[:100])\n",
-    "    x_train_input = tfidf.transform(x_train_cleaned)\n",
-    "    svr_lin = LinearSVC(multi_class='ovr',C=1.0,loss='squared_hinge', dual=False)\n",
-    "    svr_lin.fit(x_train_input, y_train)\n",
-    "    y_predicted = svr_lin.predict(tfidf.transform(x_test_cleaned))\n",
-    "    print \"Accuracy of this SVM = \" + str(metrics.accuracy_score(y_test, y_predicted))"
+    "    print \"Accuracy of this SVM = \" + str(metrics.accuracy_score(y_test, y_svr_lin_predicted))\n",
+    "    print \"Fscore of this SVM = \" + str(metrics.precision_recall_fscore_support(y_test, y_svr_lin_predicted, average='weighted'))  # tuple: (precision, recall, F-score, support); pos_label only applies to average='binary'\n",
+    "    print \"F-1 score of this SVM = \" + str(metrics.f1_score(y_test, y_svr_lin_predicted, average='weighted'))  # support-weighted mean of the per-class F1 scores\n",
+    "    print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_svr_lin_predicted))  # rows = true class, columns = predicted class"
    ]
   },
   {
@@ -664,22 +737,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    rand = RandomForestClassifier(random_state=0)  # fixed seed so a fresh Restart & Run All rebuilds the same forest\n",
+    "    rand.fit(x_train_input, y_train)  # x_train_input: TF-IDF matrix built in the SVM cell above\n",
+    "    y_rand_predicted = rand.predict(tfidf.transform(x_test_cleaned))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    modelEvaluation(y_rand_predicted, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy of Random Forest = 0.9395082871324489\n"
+      "Accuracy of Random Forest = 0.9386665118574207\n",
+      "Fscore of this SVM = (0.9391700836892655, 0.9386665118574207, 0.9362244481117618, None)\n",
+      "F-1 score of this SVM = 0.9362244481117618\n",
+      "confusion matrix = [[ 7486    33   619]\n",
+      " [  338  1759   639]\n",
+      " [  446    38 23093]]\n"
      ]
     }
    ],
    "source": [
-    "    rand = RandomForestClassifier()\n",
-    "    rand.fit(x_train_input, y_train)\n",
-    "    y_predicted = rand.predict(tfidf.transform(x_test_cleaned))\n",
-    "    print \"Accuracy of Random Forest = \" + str(rand.score(tfidf.transform(x_test_cleaned), y_test))"
+    "    print \"Accuracy of Random Forest = \" + str(metrics.accuracy_score(y_test, y_rand_predicted))  # same value as rand.score(...), without re-transforming and re-predicting the test set\n",
+    "    print \"Fscore of Random Forest = \" + str(metrics.precision_recall_fscore_support(y_test, y_rand_predicted, average='weighted'))  # tuple: (precision, recall, F-score, support); pos_label only applies to average='binary'\n",
+    "    print \"F-1 score of Random Forest = \" + str(metrics.f1_score(y_test, y_rand_predicted, average='weighted'))  # support-weighted mean of the per-class F1 scores\n",
+    "    print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_rand_predicted))  # rows = true class, columns = predicted class"
    ]
   },
   {
@@ -693,22 +791,71 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "    decTree = DecisionTreeClassifier(random_state=0)  # fixed seed: split search permutes features, so unseeded runs can differ\n",
+    "    decTree.fit(x_train_input, y_train)  # x_train_input: TF-IDF matrix built in the SVM cell above\n",
+    "    y_decTree_predicted = decTree.predict(tfidf.transform(x_test_cleaned))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Accuracy of Decision Tree = 0.9263591768018344\n"
+      "\n",
+      "Accuracy on validation set: 0.9262\n",
+      "\n",
+      "Classification report : \n",
+      "             precision    recall  f1-score   support\n",
+      "\n",
+      "          0       0.90      0.90      0.90      8138\n",
+      "          1       0.78      0.70      0.73      2736\n",
+      "          2       0.95      0.96      0.96     23577\n",
+      "\n",
+      "avg / total       0.92      0.93      0.93     34451\n",
+      "\n",
+      "\n",
+      "Confusion Matrix : \n",
+      "[[ 7291   244   603]\n",
+      " [  299  1902   535]\n",
+      " [  555   306 22716]]\n"
      ]
     }
    ],
    "source": [
-    "    decTree = DecisionTreeClassifier()\n",
-    "    decTree.fit(x_train_input, y_train)\n",
-    "    y_predicted = decTree.predict(tfidf.transform(x_test_cleaned))\n",
-    "    print \"Accuracy of Decision Tree = \" + str(decTree.score(tfidf.transform(x_test_cleaned), y_test))"
+    "    modelEvaluation(y_decTree_predicted, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Accuracy of Decision Tree = 0.9262140431337261\n",
+      "Fscore of this SVM = (0.9247698369985567, 0.9262140431337261, 0.9252945198875524, None)\n",
+      "F-1 score of this SVM = 0.9252945198875524\n",
+      "confusion matrix = [[ 7291   244   603]\n",
+      " [  299  1902   535]\n",
+      " [  555   306 22716]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "    print \"Accuracy of Decision Tree = \" + str(metrics.accuracy_score(y_test, y_decTree_predicted))  # same value as decTree.score(...), without re-transforming and re-predicting the test set\n",
+    "    print \"Fscore of Decision Tree = \" + str(metrics.precision_recall_fscore_support(y_test, y_decTree_predicted, average='weighted'))  # tuple: (precision, recall, F-score, support); pos_label only applies to average='binary'\n",
+    "    print \"F-1 score of Decision Tree = \" + str(metrics.f1_score(y_test, y_decTree_predicted, average='weighted'))  # support-weighted mean of the per-class F1 scores\n",
+    "    print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_decTree_predicted))  # rows = true class, columns = predicted class"
    ]
   }
  ],