preprocess.py
import numpy as np
import json, re, nltk, string
from nltk.corpus import wordnet
from gensim.models import Word2Vec
np.random.seed(1337)
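# Note: nltk.word_tokenize (used below) requires the Punkt tokenizer models to be
# installed; if tokenization fails with a LookupError, download the resource it
# names (e.g. nltk.download("punkt")).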
def clean_word_list(item):
    # 1. Remove \r
    current_title = item["issue_title"].replace("\r", " ")
    current_desc = item["description"].replace("\r", " ")
    # 2. Remove URLs
    current_desc = re.sub(
        r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
        "",
        current_desc,
    )
    # 3. Remove stack trace (only when the "Stack trace:" marker is present;
    # str.find returns -1 otherwise, which would wrongly drop the last character)
    start_loc = current_desc.find("Stack trace:")
    if start_loc != -1:
        current_desc = current_desc[:start_loc]
    # 4. Remove hex code
    current_desc = re.sub(r"(\w+)0x\w+", "", current_desc)
    current_title = re.sub(r"(\w+)0x\w+", "", current_title)
    # 5. Change to lower case
    current_desc = current_desc.lower()
    current_title = current_title.lower()
    # 6. Tokenize
    current_desc_tokens = nltk.word_tokenize(current_desc)
    current_title_tokens = nltk.word_tokenize(current_title)
    # 7. Strip leading/trailing punctuation marks
    current_desc_filter = [
        word.strip(string.punctuation) for word in current_desc_tokens
    ]
    current_title_filter = [
        word.strip(string.punctuation) for word in current_title_tokens
    ]
    # 8. Join the lists
    current_data = current_title_filter + current_desc_filter
    current_data = [x for x in current_data if x]  # list(filter(None, current_data))
    return current_data
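# Illustrative example (hypothetical input, not part of the original script):
# clean_word_list({"issue_title": "Crash on startup",
#                  "description": "See http://example.com\r\nStack trace:\n..."})
# returns ["crash", "on", "startup", "see"]: title and description are cleaned,
# tokenized, and concatenated, with the URL and stack trace removed.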
def preprocess_dataset(dataset_name):
    print("Preprocessing {0} dataset: Start".format(dataset_name))
    # The JSON file location containing the data for deep learning model training
    open_bugs_json = "./data/{0}/deep_data.json".format(dataset_name)

    # Word2vec parameters
    min_word_frequency_word2vec = 5
    embed_size_word2vec = 200
    context_window_word2vec = 5

    # The bugs are loaded from the JSON file and the preprocessing is performed
    with open(open_bugs_json) as data_file:
        text = data_file.read()
        # Fix json files for mozilla core and mozilla firefox
        text = text.replace('" : NULL', '" : "NULL"')
        data = json.loads(text, strict=False)

    all_data = []
    for item in data:
        current_data = clean_word_list(item)
        all_data.append(current_data)

    print("Preprocessing {0} dataset: Word2Vec model".format(dataset_name))
    # A vocabulary is constructed and the word2vec model is learned using the
    # preprocessed data. The word2vec model provides a semantic word
    # representation for every word in the vocabulary.
    # Note: `size` is the gensim < 4.0 keyword; gensim >= 4.0 renamed it to
    # `vector_size`.
    wordvec_model = Word2Vec(
        all_data,
        min_count=min_word_frequency_word2vec,
        size=embed_size_word2vec,
        window=context_window_word2vec,
    )
    # Save the word2vec model so it can be reused by the training scripts
    wordvec_model.save("./data/{0}/word2vec.model".format(dataset_name))

    # The data used for training and testing the classifier is loaded and the
    # preprocessing is performed
    for min_train_samples_per_class in [0, 5, 10, 20]:
        print(
            "Preprocessing {0} dataset: Classifier data {1}".format(
                dataset_name, min_train_samples_per_class
            )
        )
        closed_bugs_json = "./data/{0}/classifier_data_{1}.json".format(
            dataset_name, min_train_samples_per_class
        )
        with open(closed_bugs_json) as data_file:
            text = data_file.read()
            # Fix json files for mozilla core and mozilla firefox
            text = text.replace('" : NULL', '" : "NULL"')
            data = json.loads(text, strict=False)

        all_data = []
        all_owner = []
        for item in data:
            current_data = clean_word_list(item)
            all_data.append(current_data)
            all_owner.append(item["owner"])

        # Save the preprocessed data and owner arrays so they can be reused by
        # the training scripts (newer numpy versions may require wrapping these
        # ragged lists with np.array(..., dtype=object) before saving)
        np.save(
            "./data/{0}/all_data_{1}.npy".format(
                dataset_name, min_train_samples_per_class
            ),
            all_data,
        )
        np.save(
            "./data/{0}/all_owner_{1}.npy".format(
                dataset_name, min_train_samples_per_class
            ),
            all_owner,
        )
def preprocess_all_datasets():
    preprocess_dataset("google_chromium")
    preprocess_dataset("mozilla_core")
    preprocess_dataset("mozilla_firefox")
def read_json_and_clean(filename):
    # The bugs are loaded from the JSON file and the preprocessing is performed
    with open(filename) as data_file:
        text = data_file.read()
        # Fix json files for mozilla core and mozilla firefox
        text = text.replace('" : NULL', '" : "NULL"')
        data = json.loads(text, strict=False)

    all_data = []
    for item in data:
        current_data = clean_word_list(item)
        all_data.append(current_data)
    return all_data
def wordvec_all_datasets_merged():
    print("Preprocessing all datasets merged: Word2Vec model")
    # The JSON file locations containing the data for deep learning model training
    open_bugs_json_gc = "./data/{0}/deep_data.json".format("google_chromium")
    open_bugs_json_mc = "./data/{0}/deep_data.json".format("mozilla_core")
    open_bugs_json_mf = "./data/{0}/deep_data.json".format("mozilla_firefox")

    # The bugs are loaded from the JSON files and the preprocessing is performed
    all_data_gc = read_json_and_clean(open_bugs_json_gc)
    all_data_mc = read_json_and_clean(open_bugs_json_mc)
    all_data_mf = read_json_and_clean(open_bugs_json_mf)
    all_data_merged = all_data_gc + all_data_mc + all_data_mf

    # Word2vec parameters
    min_word_frequency_word2vec = 5
    embed_size_word2vec = 200
    context_window_word2vec = 5

    # A vocabulary is constructed and the word2vec model is learned using the
    # preprocessed data. The word2vec model provides a semantic word
    # representation for every word in the vocabulary.
    wordvec_model = Word2Vec(
        all_data_merged,
        min_count=min_word_frequency_word2vec,
        size=embed_size_word2vec,  # `vector_size` in gensim >= 4.0
        window=context_window_word2vec,
    )
    # Save the word2vec model so it can be reused by the training scripts
    wordvec_model.save("./data/merged/word2vec.model")
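# Possible entry point (an assumption, not part of the original file): run the
# per-dataset preprocessing and then build the merged word2vec model. It assumes
# the ./data/<dataset>/ folders with the JSON files referenced above exist.
if __name__ == "__main__":
    preprocess_all_datasets()
    wordvec_all_datasets_merged()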