-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata.py
46 lines (37 loc) · 1.22 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Marker placed in front of every answer string built by load_twitter_text
# and load_opensubtitles_text.
BEGIN_TAG = "▶"
# Marker appended after every answer string built by those same loaders.
END_TAG = "◀"
# NOTE(review): not referenced in the visible part of this file; presumably a
# padding / empty-slot token used by callers -- confirm before relying on it.
EMPTY_TOKEN = "◌"
# NOTE(review): not referenced in the visible part of this file; presumably the
# out-of-vocabulary token used by callers -- confirm before relying on it.
UNK_TOKEN = "<unk>"
import transformer.Constants as Constants
def load_conv_text():
    """Read question/answer pairs from ``conv3.txt``.

    Every line of the file is split on ``"||"``; the first field is the
    question and the second field is the answer. Both are stripped of
    surrounding whitespace. Each answer is wrapped as
    ``Constants.BOS_WORD + ' ' + answer + ' ' + Constants.EOS_WORD``.

    Returns:
        A ``(questions, answers)`` pair of lists, index-aligned.

    Raises:
        IndexError: if a line does not contain the ``"||"`` separator.
    """
    questions, answers = [], []
    with open('conv3.txt') as conv_file:
        for raw_line in conv_file:
            fields = raw_line.split("||")
            # Only the first two fields are used; anything after a second
            # "||" on the line is discarded.
            prompt, reply = fields[0], fields[1]
            questions.append(prompt.strip())
            answers.append(
                ' '.join((Constants.BOS_WORD, reply.strip(), Constants.EOS_WORD))
            )
    return questions, answers
# Maximum number of characters kept from each question and from each answer
# by load_twitter_text and load_opensubtitles_text; longer text is truncated.
MAX_LEN = 200
def load_twitter_text():
    """Read question/answer pairs from ``data/twitter/chat.txt``.

    The file is consumed two lines at a time via ``pairwise``: the first
    line of each pair is the question, the second is the answer. Both are
    stripped and truncated to ``MAX_LEN`` characters; the answer is then
    wrapped as ``"{BEGIN_TAG} answer {END_TAG}"``.

    Returns:
        ``(questions, answers)`` as two index-aligned tuples, or an empty
        tuple when the file yields no pair.
    """
    with open('data/twitter/chat.txt') as chat_file:
        lines = list(chat_file)

    pairs = []
    for prompt, reply in pairwise(lines):
        question = prompt.strip()[:MAX_LEN]
        answer = reply.strip()[:MAX_LEN]
        pairs.append((question, f"{BEGIN_TAG} {answer} {END_TAG}"))

    # Transpose [(q, a), ...] into (all_questions, all_answers).
    return tuple(zip(*pairs))
def pairwise(it):
    """Yield consecutive, non-overlapping pairs from an iterable.

    ``pairwise([a, b, c, d])`` yields ``(a, b)`` and then ``(c, d)``. If the
    iterable has an odd number of items, the trailing unpaired item is
    dropped. Note this is NOT the sliding-window behaviour of
    ``itertools.pairwise``.

    Args:
        it: any iterable.

    Yields:
        2-tuples of adjacent items, each item appearing in at most one pair.
    """
    it = iter(it)
    # Passing the same iterator twice makes zip pull two items per step and
    # stop cleanly on exhaustion. Calling next() directly in the generator
    # body would leak StopIteration, which Python 3.7+ (PEP 479) turns into
    # a RuntimeError.
    yield from zip(it, it)
def load_opensubtitles_text():
    """Read question/answer pairs from ``dataset/movie_lines_selected_10k.txt``.

    The file is consumed two lines at a time via ``pairwise``: the first
    line of each pair is the question, the second is the answer. Both are
    stripped and truncated to ``MAX_LEN`` characters; the answer is then
    wrapped as ``"{BEGIN_TAG} answer {END_TAG}"``.

    Returns:
        ``(questions, answers)`` as two index-aligned tuples, or an empty
        tuple when the file yields no pair.
    """
    # Read as text instead of bytes: str() on a bytes line returns its repr
    # ("b'...\\n'"), which put the b'' wrapper and escaped newline into the
    # data and made strip() ineffective.
    # NOTE(review): UTF-8 is assumed -- confirm the file's real encoding.
    # errors='replace' keeps the old "never fail on odd bytes" behaviour of
    # the binary read by substituting U+FFFD for undecodable bytes.
    with open('dataset/movie_lines_selected_10k.txt',
              encoding='utf-8', errors='replace') as f:
        pairs = [
            (q.strip()[:MAX_LEN],
             f"{BEGIN_TAG} {a.strip()[:MAX_LEN]} {END_TAG}")
            for q, a in pairwise(f)
        ]
    # Transpose [(q, a), ...] into (all_questions, all_answers).
    return tuple(zip(*pairs))