-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalysis.py
More file actions
74 lines (59 loc) · 2.3 KB
/
analysis.py
File metadata and controls
74 lines (59 loc) · 2.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re

import nltk
import pandas as pd

# Corpora required by clean_text(). Each resource must be requested with its
# own call: nltk.download()'s second positional parameter is download_dir,
# so the original nltk.download("punkt", "stopwords") downloaded punkt into a
# directory named "stopwords" and never fetched the stopwords corpus at all.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from functools import lru_cache


@lru_cache(maxsize=1)
def _get_vader_analyzer():
    """Build the VADER analyzer once; construction loads the lexicon from disk,
    so re-creating it per call (as the original code did) is pure waste."""
    return SentimentIntensityAnalyzer()


def analyze_sentiment(text):
    """Return the VADER compound polarity score for *text*.

    Parameters
    ----------
    text : str
        The text to score (typically a cleaned review).

    Returns
    -------
    float
        Compound sentiment score in [-1.0, 1.0]; negative values indicate
        negative sentiment, positive values positive sentiment.
    """
    return _get_vader_analyzer().polarity_scores(text)["compound"]
def clean_text(text):
    """Normalize raw review text for sentiment scoring.

    Lowercases, strips URLs, digit runs and punctuation, drops English
    stop words, and lemmatizes the surviving tokens.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-joined cleaned tokens.
    """
    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    without_digits = re.sub(r"\d+", "", without_urls)
    # Keep only letters, digits and whitespace (digits were already removed
    # above, so this effectively strips punctuation).
    letters_only = re.sub(r"[^a-zA-Z0-9\s]", "", without_digits)

    english_stops = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in english_stops
    ]
    return " ".join(kept_tokens)
def analyze_csv(file_path, column="Review"):
    """Load a CSV of reviews, clean the text, and score sentiment.

    Parameters
    ----------
    file_path : str or path-like
        Path to a CSV file containing a review-text column.
    column : str, default "Review"
        Name of the column holding the raw review text. The default keeps
        the original hard-coded behavior; other column names are now
        supported without changing existing callers.

    Returns
    -------
    tuple
        ``(df, raw_html, cleaned_html)`` where *df* carries the cleaned
        text in *column* plus a new ``"Sentiment"`` column of VADER
        compound scores, and the two strings are HTML renderings of the
        review column before and after cleaning.
    """
    df = pd.read_csv(file_path)
    # Snapshot the raw column as HTML before clean_text mutates it in place.
    raw_html = df[column].to_frame().to_html()
    df[column] = df[column].apply(clean_text)
    df["Sentiment"] = df[column].apply(analyze_sentiment)
    cleaned_html = df[column].to_frame().to_html()
    return df, raw_html, cleaned_html
def calculate_summary_statistics(df):
    """Summarize a scored review DataFrame.

    Expects columns ``"Review"`` and ``"Sentiment"``. Raises if the frame
    is empty (``idxmax``/``idxmin`` need at least one row).

    Returns
    -------
    tuple
        ``(mean sentiment, counts-per-score dict, that same dict again,
        number of rows, review with the highest score, review with the
        lowest score)``.
    """
    scores = df["Sentiment"]
    score_histogram = scores.value_counts().sort_index().to_dict()
    return (
        scores.mean(),
        score_histogram,
        # Historical quirk preserved: callers receive the counts dict twice.
        score_histogram,
        len(df),
        df.loc[scores.idxmax(), "Review"],
        df.loc[scores.idxmin(), "Review"],
    )
def get_top_comments(df, positive=True, num_comments=5):
    """Return up to *num_comments* review strings with the strongest sentiment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"Review"`` and ``"Sentiment"`` columns.
    positive : bool, default True
        True selects reviews with Sentiment > 0 (highest first);
        False selects reviews with Sentiment < 0 (lowest first).
    num_comments : int, default 5
        Maximum number of reviews to return.

    Returns
    -------
    list of str
        Review texts ordered by sentiment strength.
    """
    if positive:
        mask = df["Sentiment"] > 0
    else:
        mask = df["Sentiment"] < 0
    # Positive mode wants the largest scores first (descending);
    # negative mode wants the smallest first (ascending).
    ranked = df.loc[mask].sort_values(by="Sentiment", ascending=not positive)
    return list(ranked["Review"].iloc[:num_comments])