-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathdriver.py
221 lines (195 loc) · 6.76 KB
/
driver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import argparse
import os
import subprocess
import time
from util import util
def clean_cache():
print("Cleaning cache...")
try:
subprocess.run(
"lsof /dev/nvidia* | grep 'chenyuan' | grep 'python' | grep 'nvidia0' | awk '{print $2}' | xargs -I {} kill {}",
shell=True,
timeout=10,
)
except Exception as e:
print("Error when cleaning cache")
print(e)
def main() -> int:
parser = argparse.ArgumentParser()
parser.add_argument(
"--car", type=str, default="torch2cuda.py"
) # car is something that is driven by a driver
parser.add_argument(
"--tf", action="store_true", default=False
) # tensorflow or torch
parser.add_argument("--mode", type=str, default="race") # mode to be passed to car
parser.add_argument("--input", type=str, default=None) # source directory
parser.add_argument("--output", type=str, default="trace.txt") # output filename
parser.add_argument(
"--cont", action="store_true", default=False
) # continue on a trace in the middle
args = parser.parse_args()
tasks = util.readAllTasksFromDir(args.input)
nTasks = len(tasks)
if not args.cont:
catchesLog = open("catches.log", "w")
catchesLog.write(args.input + " " + str(nTasks) + "\n")
catchesLog.close()
trace = open(args.output, "w")
trace.write(args.input + " " + str(nTasks) + "\n")
trace.close()
trace = open(args.output, "r")
cur: int = 0
# Support for manually re-booting in the middle
for line in trace.readlines():
id, api, label, state = util.parseResultSummary(line)
if id == None:
continue
cur = id + 1
trace.close()
trace = open(args.output, "a")
lastFailApi: str = ""
continuousTimeouts: int = 0
cnt_OOM = 0
while cur < nTasks:
proc = None
try:
print("\nDriver: Car Reboot", file=trace)
trace.flush()
api, label, src = util.parseTask(tasks[cur])
if continuousTimeouts >= 5:
while api == lastFailApi:
cur += 1
print(
"\nTitanFuzzTestcase",
id,
api,
label,
"TimeoutSkipped",
"no detail",
file=trace,
)
trace.flush()
api, label, src = util.parseTask(tasks[cur])
continuousTimeouts = 0
continue
cmdLine = [
"python",
"-u",
args.car,
"--mode",
args.mode,
"--input",
args.input,
"--start",
str(cur),
"--singleapi",
]
if args.tf:
cmdLine.append("--tf")
proc = subprocess.Popen(
cmdLine, stdin=subprocess.PIPE, stdout=trace, stderr=subprocess.STDOUT
)
# Monitor the size of output file to see whether subprocess is stuck
noNewOutputSecs = 0
oldFileSize = 0
while proc.poll() == None and noNewOutputSecs < 20:
time.sleep(1)
fileSize = os.path.getsize(args.output)
if oldFileSize != fileSize:
if noNewOutputSecs >= 10:
print("Got some output, timer reset")
noNewOutputSecs = 0
else:
noNewOutputSecs += 1
if noNewOutputSecs >= 10:
print(noNewOutputSecs, "seconds without new output")
oldFileSize = fileSize
metTimeout = False
if noNewOutputSecs >= 20:
print("Timeout, killed")
metTimeout = True
while proc.poll() is None:
proc.kill()
# Try to find how many testcases have been covered
trace.close()
trace = open(args.output, "r")
lastId: int = 0
cur_OOM = 0
for line in trace.readlines():
if "CUDA error: out of memory" in line:
cur_OOM += 1
id, api, label, state = util.parseResultSummary(line)
if id == None:
continue
lastId = id
trace.close()
if cur_OOM > cnt_OOM + 20:
cnt_OOM = cur_OOM
clean_cache()
trace = open(args.output, "a")
reason = ""
if metTimeout: # Increase cursor and print timeout catch
cur = max(lastId + 2, cur + 1) # Skip the one causing timeout
reason = "TimeoutFail"
continuousTimeouts += 1
print("continuousTimeouts", continuousTimeouts)
elif proc.returncode != 233: # FrameworkCrashCatch
cur = max(lastId + 2, cur + 1)
reason = "FrameworkCrashCatch"
continuousTimeouts += 1
else: # no problem
cur = lastId + 1
continuousTimeouts = 0
continue
if cur >= nTasks:
break
failId = cur - 1
failTask = tasks[failId]
failApi = failTask[0]
failLabel = failTask[1]
print(
"\nTitanFuzzTestcase",
failId,
failApi,
failLabel,
reason,
"no detail",
file=trace,
)
trace.flush()
lastFailApi = failApi
except KeyboardInterrupt:
proc.kill() # Safely kill subprocess
exit(-1)
# Statistics
trace.close()
trace = open(args.output, "r")
resultStat = {}
caughtApiCount = 0
testedCount = 0
lastCaughtApi = ""
catches = []
for line in trace.readlines():
id, api, label, state = util.parseResultSummary(line)
if id == None:
continue
testedCount += 1
if state not in resultStat:
resultStat[state] = 0
resultStat[state] += 1
if "Catch" in state and api != lastCaughtApi:
caughtApiCount += 1
lastCaughtApi = api
catches.append([api, label, state])
trace.close()
trace = open(args.output, "a")
print("==== driven race finished ====", file=trace)
print("total", len(tasks), file=trace)
print("tested", testedCount, file=trace)
for reason, count in resultStat.items():
print(reason, count, file=trace)
trace.close()
return 0
if __name__ == "__main__":
exit(main())