llmperf.py

import argparse
import asyncio
import json
import math
from timeit import default_timer as timer

import openai_perf
import tgi_perf
import triton_perf
import vllm_perf
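
# openai_perf, tgi_perf, vllm_perf and triton_perf are presumably the
# engine-specific companion modules of this repo; each is expected to expose
# the *_measurer factory functions used by the run_* dispatchers below.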


def read_prompt_from_file(file_path):
    """Return the full contents of the prompt file as a single string."""
    with open(file_path, 'r') as file:
        return file.read()


def run_test_n_times(test, n):
    """Run a synchronous measurer n times, printing each value and the average."""
    total = 0
    for i in range(n):
        value = test()
        total += value
        print(f"Iteration {i}: {value}")
    print(f"Average: {total / n}")


async def async_run_test_n_times(test, n):
    """Async variant of run_test_n_times for coroutine-based measurers."""
    total = 0
    for i in range(n):
        value = await test()
        total += value
        print(f"Iteration {i}: {value}")
    print(f"Average: {total / n}")


async def send_request_periodically(request, qps, t, total):
    """Fire `qps` requests every `t` seconds until about `total` requests have
    been issued, then return the aggregate result rate (tokens per second,
    assuming each request resolves to a token count)."""
    tasks = []
    start = timer()
    for _ in range(math.floor(total / qps)):
        for _ in range(qps):
            task = asyncio.create_task(request())
            tasks.append(task)
        await asyncio.sleep(t)
    results = await asyncio.gather(*tasks)
    total_tokens = sum(results)
    elapsed = timer() - start
    return total_tokens / elapsed


async def send_sampled_request_periodically(request, samples, qps, t, total):
    """Like send_request_periodically, but feeds each request the next entry
    from `samples` (which must therefore hold at least `total` entries)."""
    tasks = []
    start = timer()
    i = 0
    for _ in range(math.floor(total / qps)):
        for _ in range(qps):
            task = asyncio.create_task(request(samples[i]))
            tasks.append(task)
            i += 1
        await asyncio.sleep(t)
    results = await asyncio.gather(*tasks)
    total_tokens = sum(results)
    elapsed = timer() - start
    return total_tokens / elapsed
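

# A minimal usage sketch for the pacing helpers above. `fake_request` is a
# hypothetical stand-in for an engine measurer: any coroutine that resolves
# to a token count works.
#
#   async def fake_request():
#       await asyncio.sleep(0.1)  # pretend the server answered in 100 ms
#       return 128                # pretend 128 tokens came back
#
#   # 4 requests per 1-second window, 20 requests in total:
#   print(asyncio.run(send_request_periodically(fake_request, 4, 1, 20)))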


def run_ttft(args):
    """Measure Time To First Token with the selected engine."""
    prompt = read_prompt_from_file(args.prompt_file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.ttft_measurer(prompt, args)
    elif args.engine == "openai":
        measurer = openai_perf.ttft_measurer(prompt, args)
    elif args.engine == "tgi":
        measurer = tgi_perf.ttft_measurer(prompt, args)
    elif args.engine == "triton":
        measurer = triton_perf.ttft_measurer(prompt, args)
    else:
        print(f"TTFT test not implemented for {args.engine}")
        return
    run_test_n_times(measurer, args.iterations)


def run_tpot(args):
    """Measure Time Per Output Token with the selected engine."""
    prompt = read_prompt_from_file(args.prompt_file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.tpot_measurer(prompt, args)
    elif args.engine == "openai":
        measurer = openai_perf.tpot_measurer(prompt, args)
    elif args.engine == "tgi":
        measurer = tgi_perf.tpot_measurer(prompt, args)
    elif args.engine == "triton":
        measurer = triton_perf.tpot_measurer(prompt, args)
    else:
        print(f"TPOT test not implemented for {args.engine}")
        return
    asyncio.run(async_run_test_n_times(measurer, args.iterations))


def run_static_batch(args):
    """Measure throughput for a single static batch (vLLM only)."""
    prompt = read_prompt_from_file(args.prompt_file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.static_batch_measurer(prompt, args)
    else:
        print(f"Static batch test not implemented for {args.engine}")
        return
    run_test_n_times(measurer, args.iterations)


def run_rate_throughput(args):
    """Measure throughput while sending a fixed prompt at a constant rate."""
    prompt = read_prompt_from_file(args.prompt_file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.rate_throughput_measurer(prompt, args)
    elif args.engine == "openai":
        measurer = openai_perf.rate_throughput_measurer(prompt, args)
    elif args.engine == "tgi":
        measurer = tgi_perf.rate_throughput_measurer(prompt, args)
    elif args.engine == "triton":
        measurer = triton_perf.rate_throughput_measurer(prompt, args)
    else:
        print(f"Rate throughput test not implemented for {args.engine}")
        return

    async def wrapper():
        return await send_request_periodically(measurer, args.qps, args.t, args.total_requests)
    asyncio.run(async_run_test_n_times(wrapper, args.iterations))


def run_rate_sampled_throughput(args):
    """Measure throughput while sending dataset-sampled prompts at a constant rate."""
    with open(args.dataset, 'r') as file:
        samples = json.load(file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.sample_rate_throughput_measurer(args)
    elif args.engine == "openai":
        measurer = openai_perf.sample_rate_throughput_measurer(args)
    elif args.engine == "tgi":
        measurer = tgi_perf.sample_rate_throughput_measurer(args)
    elif args.engine == "triton":
        measurer = triton_perf.sample_rate_throughput_measurer(args)
    else:
        print(f"Rate sampled throughput test not implemented for {args.engine}")
        return

    async def wrapper():
        return await send_sampled_request_periodically(measurer, samples, args.qps, args.t, args.total_requests)
    asyncio.run(async_run_test_n_times(wrapper, args.iterations))


def run_rate_sampled_output_throughput(args):
    """Like run_rate_sampled_throughput, but with output sampling parameters."""
    with open(args.dataset, 'r') as file:
        samples = json.load(file)
    measurer = None
    if args.engine == "vllm":
        measurer = vllm_perf.sample_output_rate_throughput_measurer(args)
    elif args.engine == "tgi":
        measurer = tgi_perf.sample_output_rate_throughput_measurer(args)
    elif args.engine == "openai":
        measurer = openai_perf.sample_output_rate_throughput_measurer(args)
    elif args.engine == "triton":
        measurer = triton_perf.sample_output_rate_throughput_measurer(args)
    else:
        print(f"Rate sampled output throughput test not implemented for {args.engine}")
        return

    async def wrapper():
        return await send_sampled_request_periodically(measurer, samples, args.qps, args.t, args.total_requests)
    asyncio.run(async_run_test_n_times(wrapper, args.iterations))
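

# Dataset format, as inferred from the json.load/list-indexing above. The
# authoritative per-sample schema is defined by the engine-specific measurers,
# so the "prompt" key below is an assumption for illustration only:
#
#   [
#       {"prompt": "..."},   # sample 0, passed unchanged to the measurer
#       {"prompt": "..."}    # sample 1, and so on
#   ]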


def add_engines_parser(base_parser, vllm_batch_size=False):
    """Attach the per-engine subparsers (vllm, openai, triton, tgi) to a test parser."""
    engine_parser = base_parser.add_subparsers(title="Engine", dest="engine", required=True)
    vllm_parser = engine_parser.add_parser("vllm", help="vLLM Engine")
    vllm_parser.add_argument("--model", type=str, default="", help="The model name or path.")
    vllm_parser.add_argument("--dtype", type=str, default="float16", help="The model dtype.")
    vllm_parser.add_argument("--gpu_memory_utilization", type=float, default=0.9, help="Fraction of GPU memory to use.")
    if vllm_batch_size:
        vllm_parser.add_argument("--batch_size", type=int, default=128, help="The batch size.")
    openai_parser = engine_parser.add_parser("openai", help="OpenAI Engine")
    openai_parser.add_argument("--api_key", type=str, default="API_KEY", help="The OpenAI API key.")
    openai_parser.add_argument("--api_base", type=str, default="http://localhost:8000/v1", help="The OpenAI server URL.")
    triton_parser = engine_parser.add_parser("triton", help="Triton Engine")
    triton_parser.add_argument("--model", type=str, default="ensemble", help="The model name.")
    triton_parser.add_argument("--http_server", type=str, default="http://localhost:8000", help="The Triton HTTP server URL.")
    triton_parser.add_argument("--grpc_server", type=str, default="localhost:8001", help="The Triton gRPC server URL.")
    tgi_parser = engine_parser.add_parser("tgi", help="Text-generation-inference Engine")
    tgi_parser.add_argument("--server", type=str, default="http://127.0.0.1:80/", help="The TGI server URL.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="LLMPerf tools to measure LLM performance")
    test_parser = parser.add_subparsers(title="Test", dest="test", required=True)

    ttft_parser = test_parser.add_parser("ttft", help="Measure Time To First Token (TTFT)")
    ttft_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.")
    ttft_parser.add_argument("--iterations", type=int, default=10, help="Number of test iterations.")
    add_engines_parser(ttft_parser)

    tpot_parser = test_parser.add_parser("tpot", help="Measure Time Per Output Token (TPOT)")
    tpot_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.")
    tpot_parser.add_argument("--iterations", type=int, default=10, help="Number of test iterations.")
    tpot_parser.add_argument("--output_tokens", type=int, default=128, help="Number of tokens to generate.")
    add_engines_parser(tpot_parser)

    stb_parser = test_parser.add_parser("static_batch_throughput", help="Measure throughput for a static batch")
    stb_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.")
    stb_parser.add_argument("--iterations", type=int, default=10, help="Number of test iterations.")
    stb_parser.add_argument("--output_tokens", type=int, default=128, help="Number of tokens to generate.")
    stb_parser.add_argument("--batch_size", type=int, default=128, help="Number of sequences to batch.")
    stb_engine_parser = stb_parser.add_subparsers(title="Engine", dest="engine", required=True)
    stb_vllm_parser = stb_engine_parser.add_parser("vllm", help="vLLM Engine")
    stb_vllm_parser.add_argument("--model", type=str, default="", help="The model name or path.")
    stb_vllm_parser.add_argument("--dtype", type=str, default="float16", help="The model dtype.")

    rth_parser = test_parser.add_parser("rate_throughput", help="Measure throughput while sending requests at a constant rate")
    rth_parser.add_argument("--prompt_file", type=str, help="Path to a file containing the prompt.")
    rth_parser.add_argument("--iterations", type=int, default=1, help="Number of test iterations.")
    rth_parser.add_argument("--output_tokens", type=int, default=128, help="Number of tokens to generate.")
    rth_parser.add_argument("--qps", type=int, default=4, help="Number of queries to send per time window t.")
    rth_parser.add_argument("--t", type=int, default=1, help="Length in seconds of the window in which to send the QPS requests.")
    rth_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total.")
    add_engines_parser(rth_parser, True)

    rst_parser = test_parser.add_parser("rate_sampled_throughput", help="Measure throughput while sending dataset-sampled requests at a constant rate")
    rst_parser.add_argument("--dataset", type=str, help="Path to a file containing the dataset.")
    rst_parser.add_argument("--iterations", type=int, default=1, help="Number of test iterations.")
    rst_parser.add_argument("--qps", type=int, default=4, help="Number of queries to send per time window t.")
    rst_parser.add_argument("--t", type=int, default=1, help="Length in seconds of the window in which to send the QPS requests.")
    rst_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total.")
    add_engines_parser(rst_parser, True)

    rsot_parser = test_parser.add_parser("rate_sampled_output_throughput", help="Measure throughput while sending dataset-sampled requests at a constant rate, with output sampling")
    rsot_parser.add_argument("--dataset", type=str, help="Path to a file containing the dataset.")
    rsot_parser.add_argument("--iterations", type=int, default=1, help="Number of test iterations.")
    rsot_parser.add_argument("--qps", type=int, default=4, help="Number of queries to send per time window t.")
    rsot_parser.add_argument("--t", type=int, default=1, help="Length in seconds of the window in which to send the QPS requests.")
    rsot_parser.add_argument("--total_requests", type=int, default=5000, help="Number of requests to send in total.")
    rsot_parser.add_argument("--temperature", type=float, default=1, help="Temperature for the sampling phase.")
    rsot_parser.add_argument("--top_k", type=int, default=15, help="Top-k for the sampling phase.")
    add_engines_parser(rsot_parser, True)

    args = parser.parse_args()
    if args.test == "ttft":
        run_ttft(args)
    elif args.test == "tpot":
        run_tpot(args)
    elif args.test == "static_batch_throughput":
        run_static_batch(args)
    elif args.test == "rate_throughput":
        run_rate_throughput(args)
    elif args.test == "rate_sampled_throughput":
        run_rate_sampled_throughput(args)
    elif args.test == "rate_sampled_output_throughput":
        run_rate_sampled_output_throughput(args)
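

# Example invocations (a sketch; prompt.txt, dataset.json and the model name
# are placeholders, not files or defaults shipped with this script). Note that
# test-level flags go before the engine subcommand, engine flags after it:
#
#   python llmperf.py ttft --prompt_file prompt.txt --iterations 5 \
#       vllm --model facebook/opt-125m
#   python llmperf.py rate_throughput --prompt_file prompt.txt --qps 2 --total_requests 100 \
#       openai --api_base http://localhost:8000/v1 --api_key API_KEY
#   python llmperf.py rate_sampled_throughput --dataset dataset.json \
#       tgi --server http://127.0.0.1:80/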