# CoxJames_CategoryAnalysis_NgramGeneration.py
import openpyxl
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.util import ngrams
import string
import os
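# Note: word_tokenize and nltk.pos_tag rely on NLTK data packages. If these are
# not already installed locally, they can be fetched once (assuming the default
# NLTK data path):
#   nltk.download('punkt')
#   nltk.download('averaged_perceptron_tagger')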
# Get the directory path of the Python script
dir_path = os.path.dirname(os.path.realpath(__file__))
# Load the Excel file in the same directory as the Python script
wb = openpyxl.load_workbook(os.path.join(dir_path, 'INSERT EXCEL FILE NAME'))
# Get the sheet named "Categories"
ws_categories = wb['Categories']
# Get the sheet named "Dictionary"
ws_dictionary = wb['Dictionary']
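# Expected workbook layout (inferred from the code below): the "Categories" sheet
# holds one feedback comment per row, with the feedback text in column A and its
# assigned category in column B; keyword rows are appended to the "Dictionary" sheet.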
# Create a dictionary to store the categories and their keywords
categories = {}
# Define the set of stop words
with open(os.path.join(dir_path, 'Stopwords List Expanded.txt'), 'r') as f:
    stopwords_list = f.readlines()
stop_words = set(word.strip() for word in stopwords_list)
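# The stop-word file is assumed to hold one word per line; each line is stripped
# of surrounding whitespace before being added to the set.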
# Loop through the rows of the sheet "Categories"
feedback_counts = {}
for row in ws_categories.rows:
    # Get the feedback and the category
    feedback = row[0].value
    category = row[1].value
    # Increment the feedback count for the category
    if category not in feedback_counts:
        feedback_counts[category] = 1
    else:
        feedback_counts[category] += 1
    # If the category does not exist in the dictionary, create it
    if category not in categories:
        categories[category] = []
    # Tokenise the feedback: keep alphanumeric words that do not start with a digit
    tokens = [word.lower() for word in word_tokenize(feedback) if word.isalnum() and not word[0].isdigit()]
    # Remove stop words from the list of tokens (the tokens are already lower-cased)
    tokens = [word for word in tokens if word not in stop_words]
    # Remove duplicate words while preserving their order, so the n-grams built
    # below are deterministic
    tokens = list(dict.fromkeys(tokens))
    # Create 4-grams and 3-grams from the list of tokens
    for n in (4, 3):
        for gram in ngrams(tokens, n):
            categories[category].append(' '.join(gram))
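    # For example, ngrams(['friendly', 'helpful', 'staff', 'service'], 3) yields
    # ('friendly', 'helpful', 'staff') and ('helpful', 'staff', 'service'), which
    # are appended as the strings 'friendly helpful staff' and 'helpful staff service'.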
    # Tag the tokens with part-of-speech tags
    tagged_tokens = nltk.pos_tag(tokens)
    # Add the adjectives, adverbs, verbs, and nouns to the list of keywords for the category
    for word, tag in tagged_tokens:
        if tag.startswith(('J', 'R', 'V', 'N')):
            categories[category].append(word)
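    # nltk.pos_tag uses the Penn Treebank tagset, where prefixes 'J' (adjectives),
    # 'R' (adverbs), 'V' (verbs) and 'N' (nouns) cover tags such as JJ, RB, VBD
    # and NNS; this is what the startswith filter above relies on.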
# Loop through the categories
for category in categories:
    # Get the list of keywords for the category
    keywords = categories[category]
    # Create a frequency distribution of the keywords
    fdist = FreqDist(keywords)
    # Get the total number of feedbacks assigned to the category
    total_feedbacks = feedback_counts[category]
    # Get the unique 3-word and 4-word keywords (excluding punctuation and stop
    # words), sorted by frequency with the most frequent first
    unique_keywords = [
        word for word, freq in sorted(fdist.items(), key=lambda x: x[1], reverse=True)
        if word not in string.punctuation
        and word not in stop_words
        and len(word.split()) in (3, 4)
    ]
    # Initialize a list to store the common n-grams
    common_ngrams = []
    # Loop through all possible n-grams for the category
    for n in range(2, 5):
        ngrams_list = list(ngrams(keywords, n))
        ngrams_freq = FreqDist(ngrams_list)
        for ngram, freq in ngrams_freq.items():
            # Any n-gram that occurs at least once passes this check
            if freq / total_feedbacks > 0:
                # Note: n only takes the values 2 to 4 in this loop, so the
                # n == 5 branch never runs and the last word is always dropped
                if n == 5:
                    common_ngrams.append(' '.join(ngram))
                else:
                    common_ngrams.append(' '.join(ngram[:-1]))
    # Keep only the unique keywords that also appear among the common n-grams
    unique_keywords = [word for word in unique_keywords if word in common_ngrams]
    # Write the list of unique keywords to the sheet "Dictionary"
    keywords_str = ', '.join(unique_keywords)
    ws_dictionary.append([category, keywords_str])
# Save and close the Excel file
wb.save(os.path.join(dir_path, 'Dictionary_Output.xlsx'))
wb.close()
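# Optional sanity check (a minimal sketch, not part of the original workflow,
# assuming the output file was written as above): reopen the generated workbook
# and print the rows that were appended to the "Dictionary" sheet.
wb_out = openpyxl.load_workbook(os.path.join(dir_path, 'Dictionary_Output.xlsx'))
for row in wb_out['Dictionary'].iter_rows(values_only=True):
    print(row)
wb_out.close()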