forked from jasonwei20/eda_nlp
sst1_clean.py
from utils import *

def get_label(decimal):
    # map a sentiment score in [0, 1] to one of the five SST-1 classes
    if decimal >= 0 and decimal <= 0.2:
        return 0
    elif decimal > 0.2 and decimal <= 0.4:
        return 1
    elif decimal > 0.4 and decimal <= 0.6:
        return 2
    elif decimal > 0.6 and decimal <= 0.8:
        return 3
    elif decimal > 0.8 and decimal <= 1:
        return 4
    else:
        return -1

def get_label_binary(decimal):
    # map a sentiment score to a binary label; neutral scores in (0.4, 0.6] map to -1 and are dropped later
    if decimal >= 0 and decimal <= 0.4:
        return 0
    elif decimal > 0.6 and decimal <= 1:
        return 1
    else:
        return -1

def get_split(split_num):
    # SST split codes: 1 = train, 2 = test, 3 = dev (folded into train here)
    if split_num == 1 or split_num == 3:
        return 'train'
    elif split_num == 2:
        return 'test'
    return None
if __name__ == "__main__":

    data_path = 'raw/sst_1/stanfordSentimentTreebank/datasetSentences.txt'
    labels_path = 'raw/sst_1/stanfordSentimentTreebank/sentiment_labels.txt'
    split_path = 'raw/sst_1/stanfordSentimentTreebank/datasetSplit.txt'
    dictionary_path = 'raw/sst_1/stanfordSentimentTreebank/dictionary.txt'

    sentence_lines = open(data_path, 'r').readlines()
    labels_lines = open(labels_path, 'r').readlines()
    split_lines = open(split_path, 'r').readlines()
    dictionary_lines = open(dictionary_path, 'r').readlines()
    print(len(sentence_lines))
    print(len(split_lines))
    print(len(labels_lines))
    print(len(dictionary_lines))
    # create dictionary mapping phrase id to binary label
    id_to_label = {}
    for line in labels_lines[1:]:
        parts = line[:-1].split("|")
        _id = parts[0]
        score = float(parts[1])
        label = get_label_binary(score)
        id_to_label[_id] = label
    print(len(id_to_label), "id to labels read in")

    # create dictionary mapping phrase text to its label (via the phrase id)
    phrase_to_label = {}
    for line in dictionary_lines:
        parts = line[:-1].split("|")
        phrase = parts[0]
        _id = parts[1]
        label = id_to_label[_id]
        phrase_to_label[phrase] = label
    print(len(phrase_to_label), "phrase to label read in")

    # create dictionary mapping sentence id to split
    id_to_split = {}
    for line in split_lines[1:]:
        parts = line[:-1].split(",")
        _id = parts[0]
        split_num = float(parts[1])
        split = get_split(split_num)
        id_to_split[_id] = split
    print(len(id_to_split), "id to split read in")
    train_writer = open('datasets/sst2/train_orig.txt', 'w')
    test_writer = open('datasets/sst2/test.txt', 'w')

    # write each sentence to its split with its binary label, skipping neutral (-1) sentences
    for sentence_line in sentence_lines[1:]:
        parts = sentence_line[:-1].split('\t')
        _id = parts[0]
        sentence = get_only_chars(parts[1])
        split = id_to_split[_id]
        if parts[1] in phrase_to_label:
            label = phrase_to_label[parts[1]]
            if label in {0, 1}:
                if split == 'train':
                    train_writer.write(str(label) + '\t' + sentence + '\n')
                elif split == 'test':
                    test_writer.write(str(label) + '\t' + sentence + '\n')

    train_writer.close()
    test_writer.close()
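
# Usage sketch (an assumption, not stated in the original file): run
#   python sst1_clean.py
# from the repo root after downloading the Stanford Sentiment Treebank into
# raw/sst_1/ and creating datasets/sst2/. The script then writes tab-separated
# "<label>\t<sentence>" lines to datasets/sst2/train_orig.txt and
# datasets/sst2/test.txt.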