This repository has been archived by the owner on Jul 1, 2023. It is now read-only.
Commit df9bfe4 (1 parent: 27c78d5)

Uploaded the two Python scripts and the stop word list.

Showing 3 changed files with 1,366 additions and 0 deletions.
@@ -0,0 +1,113 @@
import openpyxl
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
import string
import os

# Download the NLTK data the script relies on (no-ops if already present)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'))

# Get the sheet named "Categories"
ws_categories = wb['Categories']

# Get the sheet named "Dictionary"
ws_dictionary = wb['Dictionary']

# Create a dictionary to store the categories and their keywords
categories = {}

# Load the set of stop words, one word per line
with open(os.path.join(dir_path, 'Stopwords List Expanded.txt'), 'r') as f:
    stop_words = set(word.strip() for word in f)

# Loop through the rows of the sheet "Categories"
feedback_counts = {}
for row in ws_categories.rows:

    # Get the feedback and the category, skipping blank rows
    feedback = row[0].value
    category = row[1].value
    if not feedback or not category:
        continue

    # Increment the feedback count for the category
    if category not in feedback_counts:
        feedback_counts[category] = 1
    else:
        feedback_counts[category] += 1

    # If the category does not exist in the dictionary, create it
    if category not in categories:
        categories[category] = []

    # Tokenise the feedback (str() guards against non-text cells), keeping
    # lower-cased alphanumeric tokens that do not start with a digit
    tokens = [word.lower() for word in word_tokenize(str(feedback))
              if word.isalnum() and not word[0].isdigit()]

    # Remove stop words from the list of tokens (already lower-cased)
    tokens = [word for word in tokens if word not in stop_words]

    # Remove duplicate words; note this discards the original word order,
    # so the n-grams built below are combinations rather than true phrases
    tokens = list(set(tokens))

    # Create 4-grams and 3-grams from the list of tokens
    for n in (4, 3):
        for gram in ngrams(tokens, n):
            categories[category].append(' '.join(gram))

    # Tag the tokens with part-of-speech tags
    tagged_tokens = nltk.pos_tag(tokens)

    # Add the adjectives (J*), adverbs (R*), verbs (V*) and nouns (N*)
    # to the list of keywords for the category
    for word, tag in tagged_tokens:
        if tag.startswith(('J', 'R', 'V', 'N')):
            categories[category].append(word)

# Loop through the categories
for category in categories:

    # Get the list of keywords for the category
    keywords = categories[category]

    # Create a frequency distribution of the keywords
    fdist = FreqDist(keywords)

    # Get the total number of feedbacks assigned to the category
    total_feedbacks = feedback_counts[category]

    # Get the 3- and 4-word keywords sorted by frequency, highest first
    unique_keywords = [word for word, freq in sorted(fdist.items(), key=lambda x: x[1], reverse=True)
                       if word not in string.punctuation
                       and word not in stop_words
                       and len(word.split()) in (3, 4)]

    # Initialise a list to store the common n-grams
    common_ngrams = []

    # Loop through 2- to 4-grams of the keyword list
    for n in range(2, 5):
        ngrams_freq = FreqDist(ngrams(keywords, n))
        for ngram, freq in ngrams_freq.items():
            # NB: any frequency satisfies this threshold; raise it to keep
            # only n-grams shared by a larger share of feedbacks
            if freq / total_feedbacks > 0:
                # The original kept the full n-gram only when n == 5, which
                # never occurs inside range(2, 5), so every n-gram was
                # truncated and 4-word keywords could never survive the
                # filter below; keep the full n-gram instead
                common_ngrams.append(' '.join(ngram))

    # Keep only the unique keywords that also appear as common n-grams
    unique_keywords = [word for word in unique_keywords if word in common_ngrams]

    # Write the category and its keywords to the sheet "Dictionary"
    keywords_str = ', '.join(unique_keywords)
    ws_dictionary.append([category, keywords_str])

# Save and close the Excel file
wb.save(os.path.join(dir_path, 'Dictionary_Output.xlsx'))
wb.close()
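
A quick way to sanity-check the keyword-extraction steps above is to run them on a single hand-written feedback string. The sketch below is illustrative only: the sample sentence and the three-word stop list are made up, and the exact printed output depends on the NLTK tagger.

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

feedback = "The delivery driver was extremely late and very rude"  # made-up sample
stop_words = {'the', 'was', 'and'}                                 # tiny stand-in stop list

# Tokenise exactly as the script does: lower-case, alphanumeric, not digit-led
tokens = [w.lower() for w in word_tokenize(feedback)
          if w.isalnum() and not w[0].isdigit()]
tokens = [w for w in tokens if w not in stop_words]

# Keep adjectives (J*), adverbs (R*), verbs (V*) and nouns (N*),
# mirroring the part-of-speech filter in the script
keywords = [w for w, tag in nltk.pos_tag(tokens)
            if tag.startswith(('J', 'R', 'V', 'N'))]
print(keywords)  # e.g. ['delivery', 'driver', 'extremely', 'late', 'very', 'rude']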
@@ -0,0 +1,93 @@
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import openpyxl
import os

# Download the VADER lexicon the analyser relies on (a no-op if already present)
nltk.download('vader_lexicon', quiet=True)

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'Dictionary_Output.xlsx'))

# Get the "Dictionary" sheet
ws_dictionary = wb['Dictionary']

# Create a dictionary of custom categories and their associated keywords
custom_categories = {}

# Loop through the rows in the "Dictionary" sheet, skipping the header row
for row in ws_dictionary.iter_rows(min_row=2, values_only=True):
    category = row[0]
    if row[1] is None:
        continue
    n_grams = [n_gram.strip() for n_gram in row[1].split(",")]
    if n_grams:
        custom_categories[category] = n_grams

# Load the Excel file to analyse into a pandas dataframe
df = pd.read_excel(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'), sheet_name='Raw')

# Convert the 'Feedback' column to string type
df['Feedback'] = df['Feedback'].astype(str)

# Create a function to categorise text based on n-grams
def categorize_text(text, custom_categories):
    if pd.isna(text):
        return []
    categories = []
    for category, n_grams in custom_categories.items():
        for n_gram in n_grams:
            words = [word.strip() for word in n_gram.split(" ")]
            # A feedback matches when every word of the n-gram appears
            # somewhere in the lower-cased text (a substring test)
            if all(word in text.lower() for word in words):
                if category not in categories:
                    categories.append(category)
    # Fall back to a catch-all bucket when nothing matched
    if len(categories) == 0:
        categories.append("Z.Unknown")
    return categories

# Apply the categorize_text function to the dataframe to create a new column
df['categories'] = df['Feedback'].apply(lambda x: categorize_text(x, custom_categories))

# Explode the 'categories' column to get a separate row for each category
df = df.explode('categories')

# Create an instance of the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Score each feedback with VADER's compound sentiment (-1 to +1)
df['sentiment'] = df['Feedback'].apply(lambda x: sid.polarity_scores(x)['compound'])

# Create a pivot table to summarise the data by category
pivot_table = df.groupby('categories').agg({'sentiment': ['count', 'mean']})

# Add a row for the total count and the (unweighted) mean of the category means
total_count = pivot_table['sentiment']['count'].sum()
total_mean_sentiment = pivot_table['sentiment']['mean'].mean()
pivot_table.loc['Total'] = [total_count, total_mean_sentiment]

# Format the pivot table
pivot_table.columns = ['Count', 'Mean Sentiment']
pivot_table.index.name = None

# Colour the mean sentiment by sign. The original applied two separate
# applymap passes (red-if-negative, then green-if-positive); the second
# pass always overwrote the first, which is why it was noted as "not
# working fully as intended". A single function avoids that.
def color_sentiment(val):
    if val < 0:
        return 'color: red'
    if val > 0:
        return 'color: green'
    return 'color: black'

# Apply formatting to the mean sentiment column
styled_table = pivot_table.style.applymap(color_sentiment, subset=pd.IndexSlice[:, ['Mean Sentiment']])

# Save the file to the current directory (writer.close(), via the context
# manager, replaces the private writer._save() used originally)
output_file_path = os.path.join(dir_path, 'Category_Analysis_Output.xlsx')
with pd.ExcelWriter(output_file_path) as writer:
    styled_table.to_excel(writer, sheet_name='summary')
    df.to_excel(writer, sheet_name='raw')
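
As a rough illustration of how the matching in categorize_text and the VADER compound score behave together, here is a self-contained sketch; the category dictionary and feedback strings are invented for the example, and the lexicon download mirrors the setup above.

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon', quiet=True)
sid = SentimentIntensityAnalyzer()

# Made-up dictionary in the same shape the script reads from Dictionary_Output.xlsx
custom_categories = {'Delivery': ['late delivery', 'parcel never arrived']}

def matches(text, n_gram):
    # Same test as categorize_text: every word of the n-gram must occur
    # somewhere in the lower-cased feedback. Note this is a substring
    # check, so 'late' would also match inside 'plated'.
    return all(word in text.lower() for word in n_gram.split())

for feedback in ['The delivery was three days late', 'Great service, thank you!']:
    cats = [c for c, grams in custom_categories.items()
            if any(matches(feedback, g) for g in grams)] or ['Z.Unknown']
    print(f"{feedback!r} -> {cats}, compound={sid.polarity_scores(feedback)['compound']:.2f}")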