AutoComment-NLP/nlp.py at master · e32wong/AutoComment-NLP · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python

import subprocess
import re
import os

# Clean sentence
def cleanSentence(sentences):
    """Normalize free-form English text into clean, capitalized sentences.

    The text is processed in this order:

    1. A standalone lowercase "i" becomes "I".
    2. Runs of two or more dots (plus any whitespace before them) collapse
       into a single ".".
    3. Every ":" (plus any whitespace before it) becomes ".".
    4. Contractions are expanded: "can't", "won't", "shan't" (lowercase or
       with a capital first letter), then the generic "n't", "'ll", "I'm"
       and "'ve".
    5. A "." is appended when the text ends in a letter or one of ";()/".
    6. The text is split into sentences that end in ".", "?" or "!"
       followed by whitespace or the end of the text. Sentences containing
       "http." or "https." (a URL scheme after step 3) are dropped, and the
       first character of every remaining sentence is upper-cased.

    Args:
        sentences: The raw text, holding one or more sentences.

    Returns:
        The cleaned text. When step 6 finds no complete sentence, the text
        is returned as it stands after step 5. When it does find sentences,
        only those sentences are returned: trailing text without a
        terminator is not included, and the result can be "" if every
        sentence contained a URL.
    """

    # No need to process empty sentences
    if sentences == "":
        return ""

    # Replace the "i" with "I"
    sentences = re.sub(r'\bi\b', "I", sentences)

    # Replace "..." with "."
    sentences = re.sub(r'\s*\.{2,}', ".", sentences)

    # Replace ":" with "."
    sentences = re.sub(r'\s*:', ".", sentences)

    # Replace "n't" with "not"
    # http://www.learnenglish.de/grammar/shortforms.html
    # The irregular forms must be expanded before the generic "n't" rule.
    # Both the lowercase and the capitalized spelling are handled, otherwise
    # a sentence-initial "Can't" would reach the generic rule and turn into
    # "Ca not" (likewise "Wo not" / "Sha not").
    irregularForms = (
        ("can't", "cannot"),
        ("won't", "will not"),
        ("shan't", "shall not"),
    )
    for contraction, expansion in irregularForms:
        sentences = re.sub(contraction + r"\b", expansion, sentences)
        sentences = re.sub(contraction.capitalize() + r"\b",
                           expansion.capitalize(), sentences)
    sentences = re.sub(r"n't\b", " not", sentences)
    # Replace "'ll" with " will"
    sentences = re.sub(r"'ll\b", " will", sentences)
    # Replace "I'm" with "I am"
    sentences = re.sub(r"I'm\b", "I am", sentences)
    # Replace "'ve" with " have"
    sentences = re.sub(r"'ve\b", " have", sentences)

    # Ensure last word of the paragraph has a valid ending character
    if re.search(r'[a-zA-Z;()/]$', sentences):
        sentences = sentences + "."

    # Split sentences and make first word in each sentence capitalize
    # Split sentence using space after the dot
    listOfSentences = re.findall(r'.+?[.?!](?:\s|$)+', sentences)
    if listOfSentences:
        # Detect URLs in a sentence so the sentence can be removed.
        # Note we substituted ":" with "." previously, so "http://" now
        # reads "http.//".
        patternURL = re.compile(r'(http\.|https\.)')
        keptSentences = []
        for thisSentence in listOfSentences:
            if patternURL.search(thisSentence):
                continue

            # Make first letter upper case and keep the sentence
            keptSentences.append(thisSentence[0:1].upper() + thisSentence[1:])
        sentences = "".join(keptSentences)

    return sentences

# main #
########

# Driver: walks every file in inputDir, prints its first two lines (minus
# their first two characters) and then runs the external NLP parse script.

# Directory whose files are processed one by one.
inputDir = "/home/edmund/research/autocomment/posts/testPreNLP/"

# Directory containing this script; the working directory is reset to it
# after every parser run.
originalDir = os.path.dirname(os.path.realpath(__file__))
# The parser script path below is relative, so it is launched from nlpDir.
nlpDir = "/home/edmund/research/mate-tools/srl/"
nlpCommand = "./scripts/parse_full.sh"
print("Current dir: " + originalDir)

# process every single snippet
for f in os.listdir(inputDir):

    snippetPath = os.path.join(inputDir, f)

    # os.listdir also returns sub-directories, which cannot be opened as files
    if not os.path.isfile(snippetPath):
        continue

    # Read the snippet and close it before the external parser is started
    with open(snippetPath, "r") as mapping:
        content = mapping.readlines()

    # Both a title line and a body line are required
    if len(content) < 2:
        print("Skipping " + snippetPath + ": expected at least two lines")
        continue

    commentTitle = content[0]
    commentBody = content[1]

    # The first two characters of each line are skipped
    # (presumably a marker prefix - TODO confirm against the file format)
    print(commentTitle[2:])
    print(commentBody[2:])

    # NOTE(review): the parser is always run on the fixed test.txt file,
    # not on the snippet that was just read - confirm this is intended.
    os.chdir(nlpDir)
    try:
        subprocess.call([nlpCommand, "/home/edmund/research/mate-tools/srl/test.txt"])
    finally:
        # Restore the working directory even if launching the parser fails
        os.chdir(originalDir)