# -*- coding: utf-8 -*-
"""
Created on Fri Sep 13 15:35:02 2019
@author: Gopi
"""
import requests # Importing requests to extract content from a url
from bs4 import BeautifulSoup as bs # BeautifulSoup is for web scraping; used to extract specific content from HTML
import re
#import nltk
#from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# creating an empty list to hold all the reviews
iphone_reviews = []
for i in range(1, 20):
    ip = []
    #url = "https://www.amazon.in/Apple-MacBook-Air-13-3-inch-Integrated/product-reviews/B073Q5R6VR/ref=cm_cr_arp_d_paging_btm_2?showViewpoints=1&pageNumber="+str(i)
    url = "https://www.amazon.in/All-New-Kindle-reader-Glare-Free-Touchscreen/product-reviews/B0186FF45G/ref=cm_cr_getr_d_paging_btm_3?showViewpoints=1&pageNumber=" + str(i)  # the page number must be appended, otherwise every iteration fetches page 1
    response = requests.get(url)
    soup = bs(response.content, "html.parser")  # creating a soup object to iterate over the extracted content
    reviews = soup.findAll("span", attrs={"class": "a-size-base review-text"})  # extracting the content under specific tags
    for review in reviews:
        ip.append(review.text)
    iphone_reviews = iphone_reviews + ip  # adding this page's reviews to the list that accumulates all the reviews
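
# A minimal sketch of a more defensive fetch, assuming Amazon may throttle or
# block the default requests User-Agent; the helper name fetch_review_page and
# the header value are illustrative and not part of the original script.
import time

def fetch_review_page(base_url, page_no):
    # base_url is assumed to end in "pageNumber="
    headers = {"User-Agent": "Mozilla/5.0"}  # browser-like UA string (assumed)
    response = requests.get(base_url + str(page_no), headers=headers)
    time.sleep(2)  # pause between requests to stay polite to the server
    return bs(response.content, "html.parser")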
# writing the reviews to a text file
with open("iphone.txt", "w", encoding='utf8') as output:
    output.write(str(iphone_reviews))

# joining all the reviews into a single paragraph
ip_rev_string = " ".join(iphone_reviews)

# removing unwanted symbols, in case any exist
ip_rev_string = re.sub("[^A-Za-z]+", " ", ip_rev_string).lower()
ip_rev_string = re.sub("[0-9]+", " ", ip_rev_string)

# splitting the combined string into the individual words contained in the reviews
ip_reviews_words = ip_rev_string.split(" ")
# loading a custom stop-word list from a file (one word per line)
with open("E:\\Bokey\\Text Mining\\sw.txt", "r") as sw:
    stop_words = sw.read().split("\n")

# quick demonstration of filtering with a list comprehension
temp = ["this", "is", "awesome", "Data", "Science"]
[i for i in temp if i not in ["is"]]

# removing the stop words from the review words
ip_reviews_words = [w for w in ip_reviews_words if w not in stop_words]
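
# An alternative sketch using NLTK's built-in English stop-word list instead of
# the local sw.txt file; assumes nltk is installed and its corpus is downloaded.
#import nltk
#nltk.download('stopwords')  # one-time download of the stop-word corpus
#from nltk.corpus import stopwords
#stop_words = set(stopwords.words('english'))
#ip_reviews_words = [w for w in ip_reviews_words if w not in stop_words]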
# joining the filtered words back into a single paragraph
ip_rev_string = " ".join(ip_reviews_words)

# WordCloud operates on string input; that is why the entire set of reviews is
# combined into a single paragraph
# simple word cloud
wordcloud_ip = WordCloud(
    background_color='black',
    width=1800,
    height=1400
).generate(ip_rev_string)
plt.imshow(wordcloud_ip)
plt.axis("off")  # hide the axes so only the cloud is shown
plt.show()
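
# An alternative sketch: WordCloud can also filter stop words itself through
# its stopwords parameter; the STOPWORDS set ships with the wordcloud package.
from wordcloud import STOPWORDS
wordcloud_alt = WordCloud(
    background_color='black',
    width=1800,
    height=1400,
    stopwords=STOPWORDS
).generate(ip_rev_string)
plt.imshow(wordcloud_alt)
plt.axis("off")
plt.show()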
# positive words: choose the path where the +ve word list is stored on your system
with open("E:\\Bokey\\Bharani_Assignment\\Twitter\\positive-words.txt", "r") as pos:
    poswords = pos.read().split("\n")
poswords = poswords[36:]  # dropping the header lines at the top of the lexicon file

# negative words: choose the path where the -ve word list is stored on your system
with open("E:\\Bokey\\Bharani_Assignment\\Twitter\\negative-words.txt", "r") as neg:
    negwords = neg.read().split("\n")
negwords = negwords[37:]  # dropping the header lines at the top of the lexicon file
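
# A minimal lexicon-based scoring sketch (not in the original flow): count how
# many review words appear in each list and take the difference as a crude
# polarity score; a positive value suggests an overall positive tone.
pos_count = sum(1 for w in ip_reviews_words if w in poswords)
neg_count = sum(1 for w in ip_reviews_words if w in negwords)
print("positive:", pos_count, "negative:", neg_count, "score:", pos_count - neg_count)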
# negative word cloud
# keeping only the review words that are present in negwords
ip_neg_in_neg = " ".join([w for w in ip_reviews_words if w in negwords])
wordcloud_neg_in_neg = WordCloud(
    background_color='black',
    width=1800,
    height=1400
).generate(ip_neg_in_neg)
plt.imshow(wordcloud_neg_in_neg)
plt.axis("off")
plt.show()

# positive word cloud
# keeping only the review words that are present in poswords
ip_pos_in_pos = " ".join([w for w in ip_reviews_words if w in poswords])
wordcloud_pos_in_pos = WordCloud(
    background_color='black',
    width=1800,
    height=1400
).generate(ip_pos_in_pos)
plt.imshow(wordcloud_pos_in_pos)
plt.axis("off")
plt.show()
# unique words across all the reviews
iphone_unique_words = list(set(" ".join(iphone_reviews).split(" ")))
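
# A short illustrative sketch of the most frequent review words, using
# collections.Counter from the standard library (not in the original script).
from collections import Counter
word_counts = Counter(ip_reviews_words)
print(word_counts.most_common(20))  # the 20 most frequent review words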
################# IMDB reviews extraction ########################
# This is a time-consuming process: the program operates the web page while
# extracting, so the time library is used to sleep and let each specific page
# load before extracting from it.
# Selenium must be installed for Python: pip install selenium
import time  # used to pause the program for a few seconds between clicks
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import ElementNotVisibleException
from bs4 import BeautifulSoup as bs

browser = webdriver.Chrome()
#page = "http://www.imdb.com/title/tt0944947/reviews?ref_=tt_urv"
page = "http://www.imdb.com/title/tt6294822/reviews?ref_=tt_urv"
browser.get(page)

reviews = []
while True:
    try:
        # keep clicking the "Load More" button until it is gone
        button = browser.find_element(By.XPATH, '//*[@id="load-more-trigger"]')
        button.click()
        time.sleep(5)  # give the next batch of reviews time to load
        ps = browser.page_source
        soup = bs(ps, "html.parser")
        rev = soup.findAll("div", attrs={"class": "text"})
        reviews.extend(r.text for r in rev)  # store the review text, not the Tag objects
    except NoSuchElementException:
        break
    except ElementNotVisibleException:
        break
##### If only a few recent reviews are needed, press Ctrl+C to stop the loop
##### midway; whatever data has been extracted so far is still kept. #######
len(reviews)             # total number of reviews collected
len(list(set(reviews)))  # number of distinct reviews (the loop re-reads earlier ones)

cleaned_reviews = re.sub('[^A-Za-z0-9 ]+', '', " ".join(reviews))
with open("reviews.txt", "w", encoding='utf8') as f:
    f.write(cleaned_reviews)

with open("The_Post.txt", "w", encoding='utf8') as fp:
    fp.write(str(reviews))

len(soup.find_all("p"))  # number of paragraph tags in the final page source
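
# A closing sketch that reuses the earlier word-cloud pipeline on the IMDB
# reviews: lower-case the cleaned text, drop the same stop words, and render a
# cloud. Purely illustrative; the original script stops after writing the files.
imdb_words = [w for w in cleaned_reviews.lower().split(" ") if w not in stop_words]
wordcloud_imdb = WordCloud(
    background_color='black',
    width=1800,
    height=1400
).generate(" ".join(imdb_words))
plt.imshow(wordcloud_imdb)
plt.axis("off")
plt.show()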