Skip to content

Commit de8dc64

Browse files
hiworldwzj and wangzaijun authored
add profile_demo.py and add synchronize in infer_loop (#1091)
Co-authored-by: wangzaijun <[email protected]>
1 parent f0ff154 commit de8dc64

File tree

2 files changed

+21
-0
lines changed
  • lightllm/server/router/model_infer/mode_backend/chunked_prefill
  • test/benchmark/static_inference

2 files changed

+21
-0
lines changed

lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,12 +70,18 @@ def infer_loop(self):
7070
run_way = self.control_state_machine.select_run_way(prefill_reqs=prefill_reqs, decode_reqs=decode_reqs)
7171

7272
if run_way.is_prefill():
73+
# 进行一次流同步,保证 _try_read_new_reqs 中的一些算子操作,必然已经完成。
74+
# 防止后续的推理流程读取到显存中可能存在错误的数据。
75+
g_infer_context.get_overlap_stream().wait_stream(torch.cuda.current_stream())
7376
self.prefill(
7477
event_pack=event_pack,
7578
prefill_reqs=prefill_reqs,
7679
)
7780
continue
7881
elif run_way.is_decode():
82+
# 进行一次流同步,保证 _try_read_new_reqs 中的一些算子操作,必然已经完成。
83+
# 防止后续的推理流程读取到显存中可能存在错误的数据。
84+
g_infer_context.get_overlap_stream().wait_stream(torch.cuda.current_stream())
7985
self.decode(
8086
event_pack=event_pack,
8187
decode_reqs=decode_reqs,
test/benchmark/static_inference/profile_demo.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import torch
2+
import numpy as np
3+
from torch.profiler import profile, record_function, ProfilerActivity
4+
5+
# Wait for all previously queued CUDA work to finish before the profiler starts.
torch.cuda.synchronize()

# Profiler settings, named up front so the `with` header stays short.
_activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]
_trace_handler = torch.profiler.tensorboard_trace_handler("./log/")

with profile(
    activities=_activities,
    on_trace_ready=_trace_handler,
    record_shapes=False,
    profile_memory=False,
) as prof:
    # Placeholder: put the CUDA code to be profiled here.
    pass

# Print a table of the top 20 entries, sorted by total CUDA time.
_summary = prof.key_averages().table(sort_by="cuda_time_total", row_limit=20)
print(_summary)

0 commit comments

Comments
 (0)