-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathgutenberg.py
More file actions
117 lines (98 loc) · 3.67 KB
/
gutenberg.py
File metadata and controls
117 lines (98 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
"""Various useful functions for working with Project Gutenberg texts.
Gutenberg Index and URLs
========================
http://www.gutenberg.org/dirs/GUTINDEX.ALL
def get_etext_url(number):
[[TODO: DOES NOT WORK]]
Get the url for an etext given its number.
This is non-trivial and follows instructions at start of GUTINDEX.ALL
baseUrl = 'http://www.gutenberg.org/dirs/'
ss = ''
if number > 10000:
ss = str(number)
for char in ss[:-1]:
pass
if number <= 10000:
raise 'Cannot deal with etext numbers less than 10000'
return ss
"""
import os
from io import StringIO
import re
# Phrases used to locate the end of the header. Each one is spliced into a
# regular expression and matched case-insensitively, so regex syntax (such
# as the '.*' below) is allowed.
headerEndPhrases = [
    "Project Gutenberg's Etext of",
    "This etext was prepared by",
    "END.*THE SMALL PRINT",
    "START OF THIS PROJECT GUTENBERG",
]

# Phrase that opens the notes section.
notesStartPhrases = [
    "Executive Director's Notes:",
]

# Phrase used to locate the end of the notes section.
notesEndPhrases = [
    "David Reed",
]

# Phrases used to locate the start of the footer.
footerStartPhrases = [
    "End of Project Gutenberg",
    "End of The Project Gutenberg",
]
class GutenbergCleaner(object):
    """Clean up Gutenberg texts by removing all the header and footer bumpf.

    Usage: init and then run extract_text.

    TODO: deal with 'Produced by ' which occurs in both header and footer (and
    so cannot be dealt with by the usual methods).
    """

    def __init__(self, etext_str):
        """Store the etext with its line endings normalised.

        Procedure followed by extract_text:
            1. strip out header and footer bumpf
            2. are there notes? If so strip them out

        @param etext_str: the complete etext as a string.
        """
        # normalize the line endings to save us grief later
        self.etextStr = etext_str.replace('\r\n', '\n')
        # Set by extract_text once the end of the notes has been searched for.
        self.hasNotes = False

    @classmethod
    def make_re_from_phrase(cls, phrase):
        """Compile a regex matching a phrase and its surrounding paragraph.

        The match covers text that looks like:

            ... phrase ....
            more text
            [blank]
            [blank]+

        @param phrase: regex fragment to look for; it is inserted into the
            pattern unescaped, so regex syntax in it is honoured.
        @return: compiled pattern (case-insensitive, multi-line).
        """
        # Raw strings keep '\w' and '\s' as regex escapes instead of invalid
        # string escapes; the resulting pattern matches exactly as before.
        #
        # Further lines of the paragraph: each needs a word character with at
        # least one character on either side, so blank or whitespace-only
        # lines end the paragraph.
        paragraph_text = r'(^.+\w.+\n)*'
        # [[TODO: check slowdown due to inclusion of '^.*' at start
        pattern = r'^.*' + phrase + r'.*\n' + paragraph_text + r'\s+'
        return re.compile(pattern, re.I | re.M)  # make it case insensitive

    def _find_max(self, phrase, string):
        """Return the largest end offset of any paragraph matching phrase.

        @param phrase: regex fragment passed to make_re_from_phrase.
        @param string: text to search.
        @return: end index of the furthest match, or 0 if nothing matches.
        """
        regex = self.make_re_from_phrase(phrase)
        ends = [match.end() for match in regex.finditer(string)]
        return max(ends) if ends else 0

    def _find_min(self, phrase, string):
        """Return the smallest start offset of any paragraph matching phrase.

        @param phrase: regex fragment passed to make_re_from_phrase.
        @param string: text to search.
        @return: start index of the earliest match, or len(string) if nothing
            matches.
        """
        regex = self.make_re_from_phrase(phrase)
        starts = [match.start() for match in regex.finditer(string)]
        return min(starts) if starts else len(string)

    def extract_text(self):
        """Extract the core text.

        Records the computed offsets on the instance (notesEnd, headerEnd,
        footerStart) and whether notes were found (hasNotes).

        @return: the text between the end of the header (or the end of the
            notes, when notes were found) and the start of the footer, with
            trailing whitespace removed.
        """
        self.notesEnd = self.get_notes_end()
        self.headerEnd = self.get_header_end()
        self.footerStart = self.get_footer_start()
        self.hasNotes = self.notesEnd > 0
        # When notes were found their end takes precedence over the header
        # end, whichever of the two offsets is larger.
        start_index = self.notesEnd if self.hasNotes else self.headerEnd
        return self.etextStr[start_index:self.footerStart].rstrip()

    def get_notes_end(self):
        """Return the offset just past the notes, or 0 if no notes."""
        return max(
            self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases
        )

    def get_header_end(self):
        """Return the offset just past the header, or 0 if none is found."""
        return max(
            self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases
        )

    def get_footer_start(self):
        """Return the offset where the footer starts.

        This is the length of the etext when no footer phrase matches.
        """
        return min(
            self._find_min(phrase, self.etextStr)
            for phrase in footerStartPhrases
        )