
Commit df9bfe4: Add files via upload
Uploaded the two Python scripts and the stop-word list.
JamesWC-Work authored Jul 1, 2023
1 parent 27c78d5 commit df9bfe4
Showing 3 changed files with 1,366 additions and 0 deletions.
113 changes: 113 additions & 0 deletions CoxJames_CategoryAnalysis_NgramGeneration.py
@@ -0,0 +1,113 @@
import openpyxl
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
import string
import requests
import os

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'))

# Get the sheet named "Categories"
ws_categories = wb['Categories']

# Get the sheet named "Dictionary"
ws_dictionary = wb['Dictionary']

# Create a dictionary to store the categories and their keywords
categories = {}

# Define the set of stop words
with open(os.path.join(dir_path, 'Stopwords List Expanded.txt'), 'r') as f:
    stopwords_list = f.readlines()
    stop_words = set([word.strip() for word in stopwords_list])

# Loop through the rows of the sheet "Categories"
# NB: iteration starts at row 1; if the sheet has a header row, use
# ws_categories.iter_rows(min_row=2) instead
feedback_counts = {}
for row in ws_categories.rows:

    # Get the feedback and the category
    feedback = row[0].value
    category = row[1].value

    # Skip rows with a missing feedback or category
    if feedback is None or category is None:
        continue

    # Increment the feedback count for the category
    if category not in feedback_counts:
        feedback_counts[category] = 1
    else:
        feedback_counts[category] += 1

    # If the category does not exist in the dictionary, create it
    if category not in categories:
        categories[category] = []

    # Tokenise the feedback, keeping lower-cased alphanumeric tokens that do not start with a digit
    tokens = [word.lower() for word in word_tokenize(str(feedback)) if word.isalnum() and not word[0].isdigit()]

    # Remove stop words from the list of tokens (tokens are already lower-cased)
    tokens = [word for word in tokens if word not in stop_words]

    # Remove duplicate words from the list of tokens (NB: set() does not preserve word order)
    tokens = list(set(tokens))

    # Create 4-grams and 3-grams from the list of tokens
    for n in (4, 3):
        for gram in ngrams(tokens, n):
            categories[category].append(' '.join(gram))

    # Tag the tokens with part-of-speech tags
    tagged_tokens = nltk.pos_tag(tokens)

    # Add the adjectives (J), adverbs (R), verbs (V), and nouns (N) to the category's keywords
    for word, tag in tagged_tokens:
        if tag.startswith(('J', 'R', 'V', 'N')):
            categories[category].append(word)

# Loop through the categories
for category in categories:

    # Get the list of keywords for the category
    keywords = categories[category]

    # Create a frequency distribution of the keywords
    fdist = FreqDist(keywords)

    # Get the total number of feedbacks assigned to the category
    total_feedbacks = feedback_counts[category]

    # Get the 3-word and 4-word keywords, sorted by descending frequency
    unique_keywords = [word for word, freq in sorted(fdist.items(), key=lambda x: x[1], reverse=True)
                       if word not in string.punctuation
                       and word not in stop_words
                       and len(word.split()) in (3, 4)]

    # Initialize a list to store the common n-grams
    common_ngrams = []

    # Loop through all possible n-grams (n = 2..4) over the category's keyword entries
    for n in range(2, 5):
        ngrams_freq = FreqDist(ngrams(keywords, n))
        for ngram, freq in ngrams_freq.items():
            # NB: this ratio is positive for any n-gram that occurs at all, so
            # no real frequency threshold is applied here
            if freq / total_feedbacks > 0:
                # The original `if n == 5:` branch was unreachable (n stops at 4),
                # so the last element of the n-gram is always dropped
                common_ngrams.append(' '.join(ngram[:-1]))

    # Keep only the unique keywords that also appear among the common n-grams
    unique_keywords = [word for word in unique_keywords if word in common_ngrams]

    # Write the list of unique keywords to the sheet "Dictionary"
    keywords_str = ', '.join(unique_keywords)
    ws_dictionary.append([category, keywords_str])

# Save and close the Excel file
wb.save(os.path.join(dir_path, 'Dictionary_Output.xlsx'))
wb.close()
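
Before running the script above, note two assumptions: the "Categories" sheet is read as column A = feedback text and column B = category label, and the NLTK tokenizer and tagger data must already be installed. A minimal one-time setup sketch:

import nltk

nltk.download('punkt')                       # tokenizer model behind word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model behind nltk.pos_tag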
93 changes: 93 additions & 0 deletions CoxJames_FeedbackCategorisation+SentimentAnalysis.py
@@ -0,0 +1,93 @@
import pandas as pd
import numpy as np
import matplotlib
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from IPython.display import display
import openpyxl
import os

# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))

# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'Dictionary_Output.xlsx'))

# Get the "Dictionary" sheet
ws_dictionary = wb['Dictionary']

# Create a dictionary of custom categories and their associated keywords
custom_categories = {}

# Loop through the rows in the "Dictionary" sheet
for row in ws_dictionary.iter_rows(min_row=2, values_only=True):
    category = row[0]
    if row[1] is None:
        continue
    n_grams = [n_gram.strip() for n_gram in row[1].split(",")]
    if len(n_grams) > 0:
        custom_categories[category] = n_grams
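
# Example (hypothetical row): ("Billing", "invoice wrong amount, payment failed")
# yields custom_categories["Billing"] == ["invoice wrong amount", "payment failed"]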

# load the Excel file to analyse into a pandas dataframe
df = pd.read_excel(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'), sheet_name='Raw')

# convert the 'Feedback' column to string type
df['Feedback'] = df['Feedback'].astype(str)

# create a function to categorise text based on n-grams
def categorize_text(text, custom_categories):
    if pd.isna(text):
        return []
    categories = []
    for category, n_grams in custom_categories.items():
        for n_gram in n_grams:
            words = [word.strip() for word in n_gram.split(" ")]
            # NB: substring containment, not whole-word matching, so "art"
            # would also match inside "start"
            if all(word in text.lower() for word in words):
                if category not in categories:
                    categories.append(category)
    if len(categories) == 0:
        categories.append("Z.Unknown")
    return categories
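
# Example (hypothetical inputs): with custom_categories = {"Stability": ["app crashing"]},
# categorize_text("The app keeps crashing on launch", custom_categories) returns
# ["Stability"], because both "app" and "crashing" occur in the lower-cased text.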

# apply the categorize_text function to the dataframe to create a new column
df['categories'] = df['Feedback'].apply(lambda x: categorize_text(x, custom_categories))

# explode the 'categories' column to get a separate row for each category
df = df.explode('categories')

# create an instance of the SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# apply the polarity_scores function to create a 'sentiment' column holding the compound score
df['sentiment'] = df['Feedback'].apply(lambda x: sid.polarity_scores(x)['compound'])

# create a pivot table to summarize the data by category
pivot_table = df.groupby('categories').agg({'sentiment': ['count', 'mean']})

# add a row for the total count and mean sentiment
total_count = pivot_table['sentiment']['count'].sum()
total_mean_sentiment = pivot_table['sentiment']['mean'].mean()
pivot_table.loc['Total'] = [total_count, total_mean_sentiment]

# format the pivot table
pivot_table.columns = ['Count', 'Mean Sentiment']
pivot_table.index.name = None

# define functions to apply a font colour based on value (NB: not working fully as intended)
def color_negative_red(val):
    color = 'red' if val < 0 else 'black'
    return f'color: {color}'

def color_positive_green(val):
    color = 'green' if val > 0 else 'black'
    return f'color: {color}'

# apply formatting to the 'Mean Sentiment' column
styled_table = (pivot_table.style
                .applymap(color_negative_red, subset=pd.IndexSlice[:, ['Mean Sentiment']])
                .applymap(color_positive_green, subset=pd.IndexSlice[:, ['Mean Sentiment']]))

# Save the file to the current directory
output_file_path = os.path.join(dir_path, 'Category_Analysis_Output.xlsx')
writer = pd.ExcelWriter(output_file_path)
styled_table.to_excel(writer, sheet_name='summary')
df.to_excel(writer, sheet_name='raw')
writer.close()  # close() writes and closes the file; the private _save() should not be called
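
The sentiment step assumes NLTK's VADER lexicon is available locally. A minimal setup-and-usage sketch (the example sentence is illustrative only):

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # one-time download backing SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
scores = sid.polarity_scores("Great service, very helpful staff")
print(scores['compound'])  # the compound score is the value the script keeps per feedback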
1,160 changes: 1,160 additions & 0 deletions Stopwords List Expanded.txt
