DH-Code-Analysis-Assignment/gutenberg.py at master · dh-fall-2018/DH-Code-Analysis-Assignment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python
"""Various useful functionality related to Project Gutenberg

Gutenberg Index and URLs
========================

http://www.gutenberg.org/dirs/GUTINDEX.ALL

def get_etext_url(number):
    [[TODO: DOES NOT WORK]]
    Get the url for an etext given its number.
    This is non-trivial and follows instructions at start of GUTINDEX.ALL
    baseUrl = 'http://www.gutenberg.org/dirs/'
    ss = ''
    if number > 10000:
        ss = str(number)
        for char in ss[:-1]:
            pass
    if number <= 10000:
        raise 'Cannot deal with etext numbers less than 10000'
    return ss
"""
import os
from io import StringIO

import re
# Marker phrases used by GutenbergCleaner to locate boilerplate boundaries.
# Each phrase is spliced verbatim into a regular expression (see
# GutenbergCleaner.make_re_from_phrase), so regex syntax such as '.*' is
# allowed inside a phrase.

# Phrases whose enclosing paragraph marks the end of the header.
headerEndPhrases = [
    "Project Gutenberg's Etext of",
    'This etext was prepared by',
    'END.*THE SMALL PRINT',
    'START OF THIS PROJECT GUTENBERG',
]

# Phrase introducing the notes section.
notesStartPhrases = [
    "Executive Director's Notes:",
]

# Phrase whose enclosing paragraph marks the end of the notes section.
notesEndPhrases = [
    'David Reed',
]

# Phrases whose enclosing paragraph marks the start of the footer.
footerStartPhrases = [
    'End of Project Gutenberg',
    'End of The Project Gutenberg',
]

class GutenbergCleaner(object):
    '''Clean up Gutenberg texts by removing header, notes and footer boilerplate.

    Usage: construct with the complete etext as a string, then call
    extract_text().

    The boundaries are located with the module-level phrase lists
    (headerEndPhrases, notesEndPhrases, footerStartPhrases).

    TODO: deal with 'Produced by ' which occurs in both header and footer (and
    so cannot be dealt with by the usual methods).
    '''

    def __init__(self, etext_str):
        """
        @param etext_str: the complete etext as a string

        Procedure (carried out by extract_text):
            1. strip out header and footer boilerplate
            2. are there notes? If so strip them out
        """
        # normalize the line endings to save us grief later
        self.etextStr = etext_str.replace('\r\n', '\n')
        # set by extract_text once the notes section has been searched for
        self.hasNotes = False

    @classmethod
    def make_re_from_phrase(cls, phrase):
        """
        Make a regular expression that matches a phrase and its surrounding
        paragraph, i.e. text that looks like:

        ... phrase ....
        more text
        [blank]
        [blank]+

        The paragraph may also simply run to the end of the string, so a
        marker on the very last line (with or without a final newline) is
        still matched.

        @param phrase: regular expression fragment to look for; it is
            inserted into the pattern verbatim (not escaped)
        @return: compiled pattern (case insensitive, multiline)
        """
        # A continuation line must contain a word character with at least one
        # character on either side, so blank / whitespace-only lines end the
        # paragraph.
        paragraph_text = r'(^.+\w.+\n)*'
        # [[TODO: check slowdown due to inclusion of '^.*' at start
        # The phrase line ends at a newline or at end-of-string; the paragraph
        # is followed by trailing whitespace or by end-of-string.
        pattern = (r'^.*' + phrase + r'.*(?:\n|\Z)'
                   + paragraph_text + r'(?:\s+|\Z)')
        return re.compile(pattern, re.I | re.M)  # make it case insensitive

    def _find_max(self, phrase, string):
        """Return the largest end offset of any paragraph matching phrase.

        Returns 0 when the phrase does not occur in string.
        """
        regex = self.make_re_from_phrase(phrase)
        ends = [match.end() for match in regex.finditer(string)]
        return max(ends) if ends else 0

    def _find_min(self, phrase, string):
        """Return the smallest start offset of any paragraph matching phrase.

        Returns len(string) when the phrase does not occur in string.
        """
        regex = self.make_re_from_phrase(phrase)
        starts = [match.start() for match in regex.finditer(string)]
        return min(starts) if starts else len(string)

    def extract_text(self):
        """Extract the core text.

        Records the computed offsets on the instance (notesEnd, headerEnd,
        footerStart) and updates hasNotes.

        @return: the text between the end of the header/notes and the start
            of the footer, with trailing whitespace removed
        """
        self.notesEnd = self.get_notes_end()
        self.headerEnd = self.get_header_end()
        self.footerStart = self.get_footer_start()
        self.hasNotes = self.notesEnd > 0
        # Start after whichever of header and notes ends later, so a notes
        # phrase that happens to occur inside the header cannot pull the
        # start back into the header.
        start_index = max(self.headerEnd, self.notesEnd)
        return self.etextStr[start_index:self.footerStart].rstrip()

    def get_notes_end(self):
        "Return offset just past the notes section; 0 if no notes"
        indices = [self._find_max(phrase, self.etextStr)
                   for phrase in notesEndPhrases]
        # the leading 0 keeps max() well-defined if the phrase list is empty
        return max([0] + indices)

    def get_header_end(self):
        "Return offset just past the header; 0 if no header phrase is found"
        indices = [self._find_max(phrase, self.etextStr)
                   for phrase in headerEndPhrases]
        return max([0] + indices)

    def get_footer_start(self):
        "Return offset where the footer starts; len(text) if no footer"
        indices = [self._find_min(phrase, self.etextStr)
                   for phrase in footerStartPhrases]
        # the leading len() keeps min() well-defined if the phrase list is empty
        return min([len(self.etextStr)] + indices)