|
13 | 13 | },
|
14 | 14 | {
|
15 | 15 | "cell_type": "code",
|
16 |
| - "execution_count": 1, |
| 16 | + "execution_count": 2, |
17 | 17 | "metadata": {},
|
18 | 18 | "outputs": [],
|
19 | 19 | "source": [
|
|
62 | 62 | },
|
63 | 63 | {
|
64 | 64 | "cell_type": "code",
|
65 |
| - "execution_count": 8, |
| 65 | + "execution_count": 3, |
66 | 66 | "metadata": {},
|
67 | 67 | "outputs": [],
|
68 | 68 | "source": [
|
|
93 | 93 | },
|
94 | 94 | {
|
95 | 95 | "cell_type": "code",
|
96 |
| - "execution_count": 3, |
| 96 | + "execution_count": 4, |
97 | 97 | "metadata": {},
|
98 | 98 | "outputs": [],
|
99 | 99 | "source": [
|
|
130 | 130 | },
|
131 | 131 | {
|
132 | 132 | "cell_type": "code",
|
133 |
| - "execution_count": 4, |
| 133 | + "execution_count": 5, |
134 | 134 | "metadata": {},
|
135 | 135 | "outputs": [],
|
136 | 136 | "source": [
|
|
159 | 159 | },
|
160 | 160 | {
|
161 | 161 | "cell_type": "code",
|
162 |
| - "execution_count": 37, |
| 162 | + "execution_count": 6, |
163 | 163 | "metadata": {},
|
164 | 164 | "outputs": [],
|
165 | 165 | "source": [
|
|
191 | 191 | },
|
192 | 192 | {
|
193 | 193 | "cell_type": "code",
|
194 |
| - "execution_count": 9, |
| 194 | + "execution_count": 7, |
195 | 195 | "metadata": {},
|
196 | 196 | "outputs": [
|
197 | 197 | {
|
|
372 | 372 | },
|
373 | 373 | {
|
374 | 374 | "cell_type": "code",
|
375 |
| - "execution_count": 14, |
| 375 | + "execution_count": 8, |
376 | 376 | "metadata": {},
|
377 | 377 | "outputs": [],
|
378 | 378 | "source": [
|
|
387 | 387 | },
|
388 | 388 | {
|
389 | 389 | "cell_type": "code",
|
390 |
| - "execution_count": 16, |
| 390 | + "execution_count": 9, |
391 | 391 | "metadata": {},
|
392 |
| - "outputs": [], |
| 392 | + "outputs": [ |
| 393 | + { |
| 394 | + "name": "stderr", |
| 395 | + "output_type": "stream", |
| 396 | + "text": [ |
| 397 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00OSTKZWM?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_1\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 398 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 399 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B013YDFH3Y?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 400 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 401 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00U8KSNB0/ref=cm_cr_ryp_prd_ttl_sol_22\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 402 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 403 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00PEJQU9M?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 404 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 405 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B00K15KRV6/ref=cm_cr_ryp_prd_ttl_sol_22\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 406 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 407 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00G197Q4M/ref=cm_cr_ryp_prd_ttl_sol_26\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 408 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 409 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B0193D539M?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 410 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 411 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B01BO6BYMQ/ref=cm_cr_ryp_prd_ttl_sol_1\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 412 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 413 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00JEMZYM4/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 414 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 415 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B00QF5QJR2/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 416 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 417 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/dp/B01CJU9BBM/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 418 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 419 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"https://www.amazon.com/gp/product/B00JFNDLRC/ref=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 420 | + " ' that document to Beautiful Soup.' % decoded_markup\n", |
| 421 | + "/home/hitesh/Documents/sentiment_analysis/local/lib/python2.7/site-packages/bs4/__init__.py:282: UserWarning: \"http://www.amazon.com/gp/product/B00EZHM9JE?redirect=true&ref_=cm_cr_ryp_prd_ttl_sol_0\" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.\n", |
| 422 | + " ' that document to Beautiful Soup.' % decoded_markup\n" |
| 423 | + ] |
| 424 | + } |
| 425 | + ], |
393 | 426 | "source": [
|
394 | 427 | " # Preprocess text data in training set and validation set\n",
|
395 | 428 | " x_train_cleaned = []\n",
|
|
415 | 448 | },
|
416 | 449 | {
|
417 | 450 | "cell_type": "code",
|
418 |
| - "execution_count": 42, |
| 451 | + "execution_count": 10, |
419 | 452 | "metadata": {},
|
420 | 453 | "outputs": [
|
421 | 454 | {
|
|
507 | 540 | },
|
508 | 541 | {
|
509 | 542 | "cell_type": "code",
|
510 |
| - "execution_count": 40, |
| 543 | + "execution_count": 11, |
511 | 544 | "metadata": {},
|
512 | 545 | "outputs": [
|
513 | 546 | {
|
|
529 | 562 | " verbose=0, warm_start=False)"
|
530 | 563 | ]
|
531 | 564 | },
|
532 |
| - "execution_count": 40, |
| 565 | + "execution_count": 11, |
533 | 566 | "metadata": {},
|
534 | 567 | "output_type": "execute_result"
|
535 | 568 | }
|
|
633 | 666 | },
|
634 | 667 | {
|
635 | 668 | "cell_type": "code",
|
636 |
| - "execution_count": 43, |
| 669 | + "execution_count": null, |
637 | 670 | "metadata": {},
|
| 671 | + "outputs": [], |
| 672 | + "source": [ |
| 673 | + " #x_train_subset = tfidf.transform(x_train_cleaned[:100])\n", |
| 674 | + " x_train_input = tfidf.transform(x_train_cleaned)\n", |
| 675 | + " svr_lin = LinearSVC(multi_class='ovr',C=1.0,loss='squared_hinge', dual=False)\n", |
| 676 | + " svr_lin.fit(x_train_input, y_train)\n", |
| 677 | + " y_svr_lin_predicted = svr_lin.predict(tfidf.transform(x_test_cleaned))" |
| 678 | + ] |
| 679 | + }, |
| 680 | + { |
| 681 | + "cell_type": "code", |
| 682 | + "execution_count": 16, |
| 683 | + "metadata": {}, |
| 684 | + "outputs": [], |
| 685 | + "source": [ |
| 686 | + " modelEvaluation(y_svr_lin_predicted, y_test)" |
| 687 | + ] |
| 688 | + }, |
| 689 | + { |
| 690 | + "cell_type": "markdown", |
| 691 | + "metadata": {}, |
| 692 | + "source": [ |
| 693 | + "### Functions for Model Evaluation\n", |
| 694 | + "\n", |
| 695 | + "scikit-learn provides several functions for model evaluation. To learn more about them, see the links below:\n", |
| 696 | + "- [accuracy score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score)\n", |
| 697 | + "- [precision, recall, F-score and support](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html)\n", |
| 698 | + "- [f1_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score)\n", |
| 699 | + "- [confusion matrix](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix)" |
| 700 | + ] |
| 701 | + }, |
| 702 | + { |
| 703 | + "cell_type": "code", |
| 704 | + "execution_count": 23, |
| 705 | + "metadata": { |
| 706 | + "scrolled": true |
| 707 | + }, |
638 | 708 | "outputs": [
|
639 | 709 | {
|
640 | 710 | "name": "stdout",
|
641 | 711 | "output_type": "stream",
|
642 | 712 | "text": [
|
643 |
| - "Accuracy of this SVM = 0.8917012568575658\n" |
| 713 | + "Accuracy of this SVM = 0.9409305970799106\n", |
| 714 | + "Fscore of this SVM = (0.9412812101129703, 0.9409305970799106, 0.9384909185837339, None)\n", |
| 715 | + "F-1 score of this SVM = 0.9384909185837339\n", |
| 716 | + "confusion matrix = [[ 7477 33 628]\n", |
| 717 | + " [ 306 1775 655]\n", |
| 718 | + " [ 373 40 23164]]\n" |
644 | 719 | ]
|
645 | 720 | }
|
646 | 721 | ],
|
647 | 722 | "source": [
|
648 |
| - " #x_train_subset = tfidf.transform(x_train_cleaned[:100])\n", |
649 |
| - " x_train_input = tfidf.transform(x_train_cleaned)\n", |
650 |
| - " svr_lin = LinearSVC(multi_class='ovr',C=1.0,loss='squared_hinge', dual=False)\n", |
651 |
| - " svr_lin.fit(x_train_input, y_train)\n", |
652 |
| - " y_predicted = svr_lin.predict(tfidf.transform(x_test_cleaned))\n", |
653 |
| - " print \"Accuracy of this SVM = \" + str(metrics.accuracy_score(y_test, y_predicted))" |
| 723 | + " print \"Accuracy of this SVM = \" + str(metrics.accuracy_score(y_test, y_svr_lin_predicted))\n", |
| 724 | + " print \"Fscore of this SVM = \" + str(metrics.precision_recall_fscore_support(y_test, y_svr_lin_predicted, pos_label=2, average='weighted'))\n", |
| 725 | + " print \"F-1 score of this SVM = \" + str(metrics.f1_score(y_test, y_svr_lin_predicted, pos_label=2, average='weighted'))\n", |
| 726 | + " print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_svr_lin_predicted))" |
654 | 727 | ]
|
655 | 728 | },
|
656 | 729 | {
|
|
664 | 737 | },
|
665 | 738 | {
|
666 | 739 | "cell_type": "code",
|
667 |
| - "execution_count": 44, |
| 740 | + "execution_count": null, |
| 741 | + "metadata": {}, |
| 742 | + "outputs": [], |
| 743 | + "source": [ |
| 744 | + " rand = RandomForestClassifier()\n", |
| 745 | + " rand.fit(x_train_input, y_train)\n", |
| 746 | + " y_rand_predicted = rand.predict(tfidf.transform(x_test_cleaned))" |
| 747 | + ] |
| 748 | + }, |
| 749 | + { |
| 750 | + "cell_type": "code", |
| 751 | + "execution_count": 25, |
| 752 | + "metadata": {}, |
| 753 | + "outputs": [], |
| 754 | + "source": [ |
| 755 | + " modelEvaluation(y_rand_predicted, y_test)" |
| 756 | + ] |
| 757 | + }, |
| 758 | + { |
| 759 | + "cell_type": "code", |
| 760 | + "execution_count": 26, |
668 | 761 | "metadata": {},
|
669 | 762 | "outputs": [
|
670 | 763 | {
|
671 | 764 | "name": "stdout",
|
672 | 765 | "output_type": "stream",
|
673 | 766 | "text": [
|
674 |
| - "Accuracy of Random Forest = 0.9395082871324489\n" |
| 767 | + "Accuracy of Random Forest = 0.9386665118574207\n", |
| 768 | + "Fscore of this SVM = (0.9391700836892655, 0.9386665118574207, 0.9362244481117618, None)\n", |
| 769 | + "F-1 score of this SVM = 0.9362244481117618\n", |
| 770 | + "confusion matrix = [[ 7486 33 619]\n", |
| 771 | + " [ 338 1759 639]\n", |
| 772 | + " [ 446 38 23093]]\n" |
675 | 773 | ]
|
676 | 774 | }
|
677 | 775 | ],
|
678 | 776 | "source": [
|
679 |
| - " rand = RandomForestClassifier()\n", |
680 |
| - " rand.fit(x_train_input, y_train)\n", |
681 |
| - " y_predicted = rand.predict(tfidf.transform(x_test_cleaned))\n", |
682 |
| - " print \"Accuracy of Random Forest = \" + str(rand.score(tfidf.transform(x_test_cleaned), y_test))" |
| 777 | + " print \"Accuracy of Random Forest = \" + str(rand.score(tfidf.transform(x_test_cleaned), y_test))\n", |
| 778 | + " print \"Fscore of this SVM = \" + str(metrics.precision_recall_fscore_support(y_test, y_predicted, pos_label=2, average='weighted'))\n", |
| 779 | + " print \"F-1 score of this SVM = \" + str(metrics.f1_score(y_test, y_predicted, pos_label=2, average='weighted'))\n", |
| 780 | + " print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_predicted))" |
683 | 781 | ]
|
684 | 782 | },
|
685 | 783 | {
|
|
693 | 791 | },
|
694 | 792 | {
|
695 | 793 | "cell_type": "code",
|
696 |
| - "execution_count": 18, |
| 794 | + "execution_count": null, |
| 795 | + "metadata": {}, |
| 796 | + "outputs": [], |
| 797 | + "source": [ |
| 798 | + " decTree = DecisionTreeClassifier()\n", |
| 799 | + " decTree.fit(x_train_input, y_train)\n", |
| 800 | + " y_decTree_predicted = decTree.predict(tfidf.transform(x_test_cleaned))" |
| 801 | + ] |
| 802 | + }, |
| 803 | + { |
| 804 | + "cell_type": "code", |
| 805 | + "execution_count": 29, |
697 | 806 | "metadata": {},
|
698 | 807 | "outputs": [
|
699 | 808 | {
|
700 | 809 | "name": "stdout",
|
701 | 810 | "output_type": "stream",
|
702 | 811 | "text": [
|
703 |
| - "Accuracy of Decision Tree = 0.9263591768018344\n" |
| 812 | + "\n", |
| 813 | + "Accuracy on validation set: 0.9262\n", |
| 814 | + "\n", |
| 815 | + "Classification report : \n", |
| 816 | + " precision recall f1-score support\n", |
| 817 | + "\n", |
| 818 | + " 0 0.90 0.90 0.90 8138\n", |
| 819 | + " 1 0.78 0.70 0.73 2736\n", |
| 820 | + " 2 0.95 0.96 0.96 23577\n", |
| 821 | + "\n", |
| 822 | + "avg / total 0.92 0.93 0.93 34451\n", |
| 823 | + "\n", |
| 824 | + "\n", |
| 825 | + "Confusion Matrix : \n", |
| 826 | + "[[ 7291 244 603]\n", |
| 827 | + " [ 299 1902 535]\n", |
| 828 | + " [ 555 306 22716]]\n" |
704 | 829 | ]
|
705 | 830 | }
|
706 | 831 | ],
|
707 | 832 | "source": [
|
708 |
| - " decTree = DecisionTreeClassifier()\n", |
709 |
| - " decTree.fit(x_train_input, y_train)\n", |
710 |
| - " y_predicted = decTree.predict(tfidf.transform(x_test_cleaned))\n", |
711 |
| - " print \"Accuracy of Decision Tree = \" + str(decTree.score(tfidf.transform(x_test_cleaned), y_test))" |
| 833 | + " modelEvaluation(y_decTree_predicted, y_test)" |
| 834 | + ] |
| 835 | + }, |
| 836 | + { |
| 837 | + "cell_type": "code", |
| 838 | + "execution_count": 30, |
| 839 | + "metadata": {}, |
| 840 | + "outputs": [ |
| 841 | + { |
| 842 | + "name": "stdout", |
| 843 | + "output_type": "stream", |
| 844 | + "text": [ |
| 845 | + "Accuracy of Decision Tree = 0.9262140431337261\n", |
| 846 | + "Fscore of this SVM = (0.9247698369985567, 0.9262140431337261, 0.9252945198875524, None)\n", |
| 847 | + "F-1 score of this SVM = 0.9252945198875524\n", |
| 848 | + "confusion matrix = [[ 7291 244 603]\n", |
| 849 | + " [ 299 1902 535]\n", |
| 850 | + " [ 555 306 22716]]\n" |
| 851 | + ] |
| 852 | + } |
| 853 | + ], |
| 854 | + "source": [ |
| 855 | + " print \"Accuracy of Decision Tree = \" + str(decTree.score(tfidf.transform(x_test_cleaned), y_test))\n", |
| 856 | + " print \"Fscore of this SVM = \" + str(metrics.precision_recall_fscore_support(y_test, y_decTree_predicted, pos_label=2, average='weighted'))\n", |
| 857 | + " print \"F-1 score of this SVM = \" + str(metrics.f1_score(y_test, y_decTree_predicted, pos_label=2, average='weighted'))\n", |
| 858 | + " print \"confusion matrix = \" + str(metrics.confusion_matrix(y_test, y_decTree_predicted))" |
712 | 859 | ]
|
713 | 860 | }
|
714 | 861 | ],
|
|
0 commit comments