forked from sd17fall/TextMining
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcomparing_authorby_frquency.py
More file actions
131 lines (108 loc) · 4.54 KB
/
comparing_authorby_frquency.py
File metadata and controls
131 lines (108 loc) · 4.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Author: Shreya Rangarajan
Date: 10/12/17
Description: Compare text frequency of two books from from Jane Austen
and two books Nathaniel Hawthorne
"""
import requests
import numpy as np
import matplotlib.pyplot as plt
# Get books from Gutenberg
pride_and_prejudice = requests.get('http://www.gutenberg.org/files/1342/1342-0.txt').text
emma = requests.get('http://www.gutenberg.org/files/158/158-0.txt').text
scarlet_letter = requests.get('http://www.gutenberg.org/cache/epub/33/pg33.txt').text
seven_gables = requests.get('http://www.gutenberg.org/cache/epub/77/pg77.txt').text
def most_frequent_words(n, text):
""" Computes the most frequent words in a text as well as the number of
words in the entire text
n: an integer indication # of most frequent words wanting to output
text: a text requested from the Gutenberg database
returns: tuples with the first element as the word and the second
element as how frequently it occurs in the text and the numbers of words
in the text
>>> most_frequent_words(5,pride_and_prejudice)
([('the', 4205), ('to', 4121), ('of', 3662), ('and', 3309), ('a', 1945)], 124592)
>>> most_frequent_words(7, scarlet_letter)
([('the', 5029), ('of', 3332), ('and', 2642), ('a', 2028), ('to', 1993), ('in', 1386), ('with', 996)], 86639)
"""
frequency = {}
split_text = text.split()
for word in split_text:
frequency[word] = 0
for word in split_text:
frequency[word] += 1
sort_top_words = sorted(frequency.items(), key=lambda x:x[1])
sort_top_words.reverse()
return sort_top_words[:n],len(split_text)
def compare_authors(text1,text2):
""" Takes two texts, determines the 500 most frequent words in each text
and returns how many of most frequent words are the same for both
texts.
text1: a unique text requested from Gutenberg database
text2: another unique text requested from Gutenberg database
returns: the count of how many "most frequent words" are similar for
the 2 texts
>>> compare_authors(emma, pride_and_prejudice)
375
>>> compare_authors(seven_gables, scarlet_letter)
366
"""
most_freq_word_txt1 = most_frequent_words(500,text1)[0]
most_freq_word_txt2 = most_frequent_words(500,text2)[0]
words_txt1 = [x[0] for x in most_freq_word_txt1]
words_txt2 = [x[0] for x in most_freq_word_txt2]
count = 0
for word in words_txt1:
if word in words_txt2:
count += 1
return count
def frequency_table(textlist):
""" Takes a list of texts and returns the titles of the texts along with
the count of how many "most frequent words" are similar to both texts
#>>> frequency_table([emma, pride_and_prejudice])
#The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen
#The Project Gutenberg EBook of Emma, by Jane Austen
#375
Disclaimer: Doctest "expected" and "got" give the same results, but does not
pass.
"""
running_text_list = textlist
for text in textlist:
running_text_list.remove(text)
for compare_text in textlist:
compare1 = compare_authors(compare_text, text)
print(compare_text.split('\n', 1)[0].replace('\ufeff',''))
print(text.split('\n', 1)[0].replace('\ufeff',''))
print(compare1)
def lexical_diversity(text):
""" Computes and returns the lexial diversity of a text
text: a text requested form the Gutenberg database
returns: lexical diversity of the text
>>> lexical_diversity(emma)
0.11230976330254647
>>> lexical_diversity(scarlet_letter)
0.17630628239014762
"""
split_text = text.split()
total_num_uniqu_words = len(set(split_text))
lex_div = total_num_uniqu_words/len(split_text)
return lex_div
def plot_lexical_diversity(textlist, title_list):
""" Plots the lexical diversity of each text in a list against their title
textlist: list of texts from Gutenberg database
title_list: list of book titles that correspond to the textlist
returns: a plot of the lexical diversity title_list on x axis and lexica
"""
lex_div_list = []
for text in textlist:
lex_div_list.append(lexical_diversity(text))
x = np.array([0,1,2,3])
plt.xticks(x, title_list)
plt.plot(x, lex_div_list)
plt.title('Lexical Diveristy of 4 different books')
plt.xlabel('Book Name')
plt.ylabel('Units')
plt.show()
if __name__ == "__main__":
import doctest
doctest.testmod()