From 8c7fbfbb82515b4dc857dc93fcaeaac286f46942 Mon Sep 17 00:00:00 2001
From: jwen2 <31702182+jwen2@users.noreply.github.com>
Date: Wed, 25 Oct 2017 00:37:19 -0400
Subject: [PATCH] Completed ToolBox 1

---
 frequency.py | 75 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 69 insertions(+), 6 deletions(-)

diff --git a/frequency.py b/frequency.py
index 68be559..c77b57b 100644
--- a/frequency.py
+++ b/frequency.py
@@ -1,16 +1,68 @@
 """ Analyzes the word frequencies in a book downloaded from
-Project Gutenberg """
+Project Gutenberg
+
+John Wen
+"""
 
 import string
 
+
+def remove_punct(text):
+    """ Helper function that lowercases a string, strips all punctuation,
+    and returns the remaining words as a list.
+    """
+    cleanedlist = []
+    words = text.lower().split()
+    for word in words:
+        # Delete every punctuation character from the word.
+        for symbol in string.punctuation:
+            word = word.replace(symbol, '')
+        if len(word) > 0:
+            cleanedlist.append(word)
+    return cleanedlist
+
+# print(remove_punct('so people said. They judged it was him, anyway; said this drownded man'))
+
 
 def get_word_list(file_name):
     """ Reads the specified project Gutenberg book.  Header comments,
         punctuation, and whitespace are stripped away.  The function
         returns a list of the words used in the book as a list.
         All words are converted to lower case.
     """
-    pass
+    f = open(file_name, 'r')
+    lines = f.readlines()
+    f.close()
+    # Skip the Project Gutenberg header: discard everything up to and
+    # including the 'START OF THIS PROJECT GUTENBERG EBOOK' marker line.
+    curr_line = 0
+    while lines[curr_line].find('START OF THIS PROJECT GUTENBERG EBOOK') == -1:
+        curr_line += 1
+    lines = lines[curr_line + 1:]
+    # Strip trailing whitespace from each line, then strip punctuation and
+    # split into lowercase words using the helper function above.
+    cleanedwhitespace = []
+    cleanedlist = []
+    for line in lines:
+        cleanedwhitespace.append(line.rstrip())
+    for eachstring in cleanedwhitespace:
+        cleanedlist.extend(remove_punct(eachstring))
+    return cleanedlist
+
+
+def highestfreqword(d):
+    """ Returns the (key, value) pair with the highest value in the dictionary.
+
+    >>> highestfreqword({'the': 5, 'apple': 2, 'is': 3, 'large': 15})
+    ('large', 15)
+    >>> highestfreqword({'the': 5, 'apple': 8, 'is': 3, 'large': 2})
+    ('apple', 8)
+
+    Functionality: breaks the dictionary into parallel lists of keys and
+    values, finds the largest value, and maps its index back to the key.
+    """
+    v = list(d.values())
+    k = list(d.keys())
+    i = v.index(max(v))
+    return k[i], v[i]
 
 
 def get_top_n_words(word_list, n):
@@ -21,10 +73,21 @@ def get_top_n_words(word_list, n):
                    punctuation
         n: the number of words to return
         returns: a list of n most frequently occurring words ordered from most
-                 frequently to least frequentlyoccurring
+                 frequently to least frequently occurring.
     """
-    pass
+    # Count the occurrences of each word.
+    d = dict()
+    for word in word_list:
+        d[word] = d.get(word, 0) + 1
+    # Repeatedly pull out and remove the most frequent remaining word.
+    newdictionary = d
+    listoftups = []
+    while n > 0:
+        word, count = highestfreqword(newdictionary)
+        listoftups.append((word, count))
+        newdictionary.pop(word)
+        n = n - 1
+    return listoftups
 
 
 if __name__ == "__main__":
-    print("Running WordFrequency Toolbox")
-    print(string.punctuation)
+    print(get_top_n_words(get_word_list('pg32325.txt'), 100))
+    import doctest
+    doctest.testmod(verbose=False)
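
For comparison only (not part of this patch), the same count-and-rank step can be sketched with collections.Counter from the standard library, which avoids the repeated dictionary scans that highestfreqword performs inside the while loop. The function name get_top_n_words_counter is illustrative; get_word_list and the book file pg32325.txt are taken from the patch above.

# Alternative sketch, not the patch's implementation: Counter.most_common
# does the counting and ranking that get_top_n_words does by hand.
from collections import Counter


def get_top_n_words_counter(word_list, n):
    """Return the n most frequent words as (word, count) tuples,
    ordered from most to least frequent."""
    return Counter(word_list).most_common(n)


if __name__ == "__main__":
    # Assumes frequency.py (as modified by this patch) and the same book file.
    from frequency import get_word_list
    print(get_top_n_words_counter(get_word_list('pg32325.txt'), 100))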
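
Likewise, a minimal sketch of the punctuation stripping: str.translate with a str.maketrans deletion table removes every character in string.punctuation in one pass, which matches the effect of remove_punct's character-by-character replace loop. The name remove_punct_translate is illustrative only.

# Alternative sketch, not the patch's implementation: one-pass punctuation removal.
import string

# Translation table that deletes every punctuation character.
_strip_punct = str.maketrans('', '', string.punctuation)


def remove_punct_translate(text):
    """Lowercase the text, delete punctuation, and return the words as a list."""
    return text.lower().translate(_strip_punct).split()


# Example: remove_punct_translate("They judged it was him, anyway;")
# -> ['they', 'judged', 'it', 'was', 'him', 'anyway']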