
Commit df9bfe4: Add files via upload
Uploaded the two Python scripts and the stop-word list.
JamesWC-Work authored Jul 1, 2023
1 parent 27c78d5 commit df9bfe4
Showing 3 changed files with 1,366 additions and 0 deletions.
113 changes: 113 additions & 0 deletions CoxJames_CategoryAnalysis_NgramGeneration.py
@@ -0,0 +1,113 @@
import openpyxl
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
import string
import requests
import os

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'))

# Get the sheet named "Categories"
ws_categories = wb['Categories']

# Get the sheet named "Dictionary"
ws_dictionary = wb['Dictionary']

# Create a dictionary to store the categories and their keywords
categories = {}

# Define the set of stop words
with open(os.path.join(dir_path, 'Stopwords List Expanded.txt'), 'r') as f:
    stopwords_list = f.readlines()
    stop_words = set([word.strip() for word in stopwords_list])

# Loop through the rows of the sheet "Categories"
# NB: iteration starts at row 1; if the sheet has a header row, use
# ws_categories.iter_rows(min_row=2) instead
feedback_counts = {}
for row in ws_categories.rows:

    # Get the feedback and the category
    feedback = row[0].value
    category = row[1].value

    # Skip rows with a missing feedback or category
    if feedback is None or category is None:
        continue

    # Increment the feedback count for the category
    if category not in feedback_counts:
        feedback_counts[category] = 1
    else:
        feedback_counts[category] += 1

    # If the category does not exist in the dictionary, create it
    if category not in categories:
        categories[category] = []

    # Tokenise the feedback, keeping lower-cased alphanumeric tokens that do not start with a digit
    tokens = [word.lower() for word in word_tokenize(str(feedback)) if word.isalnum() and not word[0].isdigit()]

    # Remove stop words from the list of tokens (tokens are already lower-cased)
    tokens = [word for word in tokens if word not in stop_words]

    # Remove duplicate words from the list of tokens (NB: set() does not preserve word order)
    tokens = list(set(tokens))

    # Create 4-grams and 3-grams from the list of tokens
    for n in (4, 3):
        for gram in ngrams(tokens, n):
            categories[category].append(' '.join(gram))

    # Tag the tokens with part-of-speech tags
    tagged_tokens = nltk.pos_tag(tokens)

    # Add the adjectives (J), adverbs (R), verbs (V), and nouns (N) to the category's keywords
    for word, tag in tagged_tokens:
        if tag.startswith(('J', 'R', 'V', 'N')):
            categories[category].append(word)

# Loop through the categories
for category in categories:

    # Get the list of keywords for the category
    keywords = categories[category]

    # Create a frequency distribution of the keywords
    fdist = FreqDist(keywords)

    # Get the total number of feedbacks assigned to the category
    total_feedbacks = feedback_counts[category]

    # Get the 3-word and 4-word keywords, sorted by descending frequency
    unique_keywords = [word for word, freq in sorted(fdist.items(), key=lambda x: x[1], reverse=True)
                       if word not in string.punctuation
                       and word not in stop_words
                       and len(word.split()) in (3, 4)]

    # Initialize a list to store the common n-grams
    common_ngrams = []

    # Loop through all possible n-grams (n = 2..4) over the category's keyword entries
    for n in range(2, 5):
        ngrams_freq = FreqDist(ngrams(keywords, n))
        for ngram, freq in ngrams_freq.items():
            # NB: this ratio is positive for any n-gram that occurs at all, so
            # no real frequency threshold is applied here
            if freq / total_feedbacks > 0:
                # The original `if n == 5:` branch was unreachable (n stops at 4),
                # so the last element of the n-gram is always dropped
                common_ngrams.append(' '.join(ngram[:-1]))

    # Keep only the unique keywords that also appear among the common n-grams
    unique_keywords = [word for word in unique_keywords if word in common_ngrams]

    # Write the list of unique keywords to the sheet "Dictionary"
    keywords_str = ', '.join(unique_keywords)
    ws_dictionary.append([category, keywords_str])

# Save and close the Excel file
wb.save(os.path.join(dir_path, 'Dictionary_Output.xlsx'))
wb.close()
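
Before running the script above, note two assumptions: the "Categories" sheet is read as column A = feedback text and column B = category label, and the NLTK tokenizer and tagger data must already be installed. A minimal one-time setup sketch:

import nltk

nltk.download('punkt')                       # tokenizer model behind word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag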
93 changes: 93 additions & 0 deletions CoxJames_FeedbackCategorisation+SentimentAnalysis.py
@@ -0,0 +1,93 @@
import pandas as pd
import numpy as np
import matplotlib
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from IPython.display import display
import openpyxl
import os

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'Dictionary_Output.xlsx'))

# Get the "Dictionary" sheet
ws_dictionary = wb['Dictionary']

# Create a dictionary of custom categories and their associated keywords
custom_categories = {}

# Loop through the rows in the "Dictionary" sheet
for row in ws_dictionary.iter_rows(min_row=2, values_only=True):
    category = row[0]
    if row[1] is None:
        continue
    n_grams = [n_gram.strip() for n_gram in row[1].split(",")]
    if len(n_grams) > 0:
        custom_categories[category] = n_grams
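
# Example (hypothetical row): ("Billing", "invoice wrong amount, payment failed")
# yields custom_categories["Billing"] == ["invoice wrong amount", "payment failed"]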

# load the Excel file to analyse into a pandas dataframe
df = pd.read_excel(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'), sheet_name='Raw')

# convert the 'Feedback' column to string type
df['Feedback'] = df['Feedback'].astype(str)

# create a function to categorise text based on n-grams
def categorize_text(text, custom_categories):
    if pd.isna(text):
        return []
    categories = []
    for category, n_grams in custom_categories.items():
        for n_gram in n_grams:
            words = [word.strip() for word in n_gram.split(" ")]
            # NB: substring containment, not whole-word matching, so "art"
            # would also match inside "start"
            if all(word in text.lower() for word in words):
                if category not in categories:
                    categories.append(category)
    if len(categories) == 0:
        categories.append("Z.Unknown")
    return categories
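
# Example (hypothetical inputs): with custom_categories = {"Stability": ["app crashing"]},
# categorize_text("The app keeps crashing on launch", custom_categories) returns
# ["Stability"], because both "app" and "crashing" occur in the lower-cased text.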

# apply the categorize_text function to the dataframe to create a new column
df['categories'] = df['Feedback'].apply(lambda x: categorize_text(x, custom_categories))

# explode the 'categories' column to get a separate row for each category
df = df.explode('categories')

# create an instance of the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# apply the polarity_scores function to create a 'sentiment' column holding the compound score
df['sentiment'] = df['Feedback'].apply(lambda x: sid.polarity_scores(x)['compound'])

# create a pivot table to summarize the data by category
pivot_table = df.groupby('categories').agg({'sentiment': ['count', 'mean']})

# add a row for the total count and mean sentiment
total_count = pivot_table['sentiment']['count'].sum()
total_mean_sentiment = pivot_table['sentiment']['mean'].mean()
pivot_table.loc['Total'] = [total_count, total_mean_sentiment]

# format the pivot table
pivot_table.columns = ['Count', 'Mean Sentiment']
pivot_table.index.name = None

# define functions to apply a font colour based on value (NB: not working fully as intended)
def color_negative_red(val):
    color = 'red' if val < 0 else 'black'
    return f'color: {color}'

def color_positive_green(val):
    color = 'green' if val > 0 else 'black'
    return f'color: {color}'

# apply formatting to the 'Mean Sentiment' column
styled_table = (pivot_table.style
                .applymap(color_negative_red, subset=pd.IndexSlice[:, ['Mean Sentiment']])
                .applymap(color_positive_green, subset=pd.IndexSlice[:, ['Mean Sentiment']]))

# Save the file to the current directory
output_file_path = os.path.join(dir_path, 'Category_Analysis_Output.xlsx')
writer = pd.ExcelWriter(output_file_path)
styled_table.to_excel(writer, sheet_name='summary')
df.to_excel(writer, sheet_name='raw')
writer.close()  # close() writes and closes the file; the private _save() should not be called
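
The sentiment step assumes NLTK's VADER lexicon is available locally. A minimal setup-and-usage sketch (the example sentence is illustrative only):

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # one-time download backing SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("Great service, very helpful staff")
print(scores['compound'])  # the compound score is the value the script keeps per feedback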
1,160 changes: 1,160 additions & 0 deletions Stopwords List Expanded.txt
