-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathprocess_transcript.py
109 lines (84 loc) · 4.13 KB
/
process_transcript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import re
class Dialogue:
    """A processed conversation: its identifier plus its list of utterances."""

    def __init__(self, conversation_id, num_utterances, utterances):
        """Store the conversation data.

        Args:
            conversation_id: Identifier of the conversation.
            num_utterances: Number of utterances held in ``utterances``.
            utterances: The utterance objects making up the conversation.
        """
        self.conversation_id = conversation_id
        self.num_utterances = num_utterances
        self.utterances = utterances

    def __str__(self):
        """Return a two-line summary: the conversation id and utterance count."""
        # Formatting (rather than '+' concatenation) keeps this working when
        # conversation_id is not a str, e.g. an integer conversation number.
        return "Conversation: {}\nNumber of Utterances: {}".format(
            self.conversation_id, self.num_utterances)
class Utterance:
    """A single utterance: who spoke, what was said, and its dialogue-act label."""

    def __init__(self, speaker, text, da_label):
        """Store the utterance data.

        Args:
            speaker: Identifier of the speaker.
            text: The utterance text.
            da_label: The dialogue-act label attached to the utterance.
        """
        self.speaker = speaker
        self.text = text
        self.da_label = da_label

    def __str__(self):
        """Return ``"<speaker> <text> <da_label>"`` separated by single spaces."""
        # Formatting (rather than '+' concatenation) avoids a TypeError when a
        # field is None or otherwise not a str.
        return "{} {} {}".format(self.speaker, self.text, self.da_label)
def process_transcript(transcript, excluded_tags=None, excluded_chars=None):
    """Convert a transcript into a Dialogue of cleaned Utterance objects.

    For every utterance in ``transcript.utterances`` the words returned by
    ``utt.text_words(filter_disfluency=True)`` are filtered, joined into a
    single string, and wrapped in an ``Utterance`` together with the
    utterance's ``caller`` and ``damsl_act_tag()``. Utterances labelled '+'
    are then merged into earlier ones by ``concatenate``.

    Args:
        transcript: Object exposing ``utterances``; each utterance must
            provide ``text_words(filter_disfluency=True)``,
            ``damsl_act_tag()``, ``caller`` and ``conversation_no``.
        excluded_tags: Container of dialogue-act tags whose utterances are
            dropped. ``None`` (the default) excludes nothing.
        excluded_chars: Container of characters used to detect annotation
            words to drop. ``None`` (the default) excludes nothing.

    Returns:
        A ``Dialogue`` whose id is ``str(conversation_no)`` of the first
        transcript utterance.

    Raises:
        IndexError: If ``transcript.utterances`` is empty.
    """
    # Treat a missing filter as "exclude nothing"; leaving these as None
    # would raise TypeError on the first membership test below.
    excluded_tags = () if excluded_tags is None else excluded_tags
    excluded_chars = () if excluded_chars is None else excluded_chars

    utterances = []
    for utt in transcript.utterances:
        # Drop the word annotations that filter_disfluency leaves behind
        # (e.g. <laughter>, per the original author's note).
        kept_words = []
        for word in utt.text_words(filter_disfluency=True):
            if all(char not in excluded_chars for char in word):
                # No excluded characters at all: keep the word untouched.
                kept_words.append(word)
            elif '#' in word:
                # '#' is sometimes appended to words: strip it, keep the rest.
                # (Equality-based test; 'char is "#"' relied on interning.)
                kept_words.append(word.replace('#', ""))
            elif len(word) > 1:
                # Keep hyphenated words, but drop interruptions such as
                # 'spi-,' by checking the first, last and second-to-last char.
                if (word[0] not in excluded_chars
                        and word[-1] not in excluded_chars
                        and word[-2] not in excluded_chars):
                    kept_words.append(word)

        # Collapse runs of spaces and strip the ends; a word that was only
        # '#' becomes empty above and would otherwise leave a stray space.
        utterance_text = re.sub(' +', ' ', " ".join(kept_words)).strip()

        # Skip utterances that ended up empty (e.g. it was just <laughter>).
        if not utterance_text:
            continue

        # Look the tag up once and skip utterances with an excluded tag.
        da_label = utt.damsl_act_tag()
        if da_label in excluded_tags:
            continue

        utterances.append(Utterance(utt.caller, utterance_text, da_label))

    # Merge multi-line utterances carrying the '+' label.
    utterances = concatenate(utterances)

    conversation_id = str(transcript.utterances[0].conversation_no)
    return Dialogue(conversation_id, len(utterances), utterances)
def concatenate(utterances):
    """Merge '+'-labelled continuation utterances into the speaker's previous one.

    The list is walked from last to first. The text of every utterance whose
    ``da_label`` is '+' is collected per speaker and the utterance itself is
    removed; the next earlier utterance by the same speaker that is not
    labelled '+' gets the collected text appended after a single space.

    Continuations with no earlier non-'+' utterance from the same speaker
    have nothing to attach to and are dropped.

    Args:
        utterances: List of objects with ``speaker``, ``text`` and
            ``da_label`` attributes. The list and the surviving objects are
            modified in place.

    Returns:
        The same ``utterances`` list, with continuations merged and removed.
    """
    # Continuation text waiting for its owner, keyed by speaker. A dict
    # supports any speaker id, not only 'A' and 'B'.
    pending = {}
    kept = []

    for utt in reversed(utterances):
        if utt.da_label == '+':
            # Iterating backwards, so earlier text is prepended to what has
            # already been collected for this speaker.
            tail = pending.get(utt.speaker)
            pending[utt.speaker] = (utt.text + " " + tail) if tail else utt.text
            continue

        tail = pending.pop(utt.speaker, None)
        if tail:
            utt.text = utt.text + " " + tail
        kept.append(utt)

    # 'kept' was built back-to-front; restore order and update the caller's
    # list in place instead of calling list.remove() during iteration.
    kept.reverse()
    utterances[:] = kept
    return utterances