File tree Expand file tree Collapse file tree 2 files changed +21
-0
lines changed
lightllm/server/router/model_infer/mode_backend/chunked_prefill
test/benchmark/static_inference Expand file tree Collapse file tree 2 files changed +21
-0
lines changed Original file line number Diff line number Diff line change @@ -70,12 +70,18 @@ def infer_loop(self):
7070 run_way = self .control_state_machine .select_run_way (prefill_reqs = prefill_reqs , decode_reqs = decode_reqs )
7171
7272 if run_way .is_prefill ():
73+ # Synchronize streams once to guarantee that the operators issued in
74+ # _try_read_new_reqs have completed, so later inference steps do not read stale/incorrect GPU memory.
75+ g_infer_context .get_overlap_stream ().wait_stream (torch .cuda .current_stream ())
7376 self .prefill (
7477 event_pack = event_pack ,
7578 prefill_reqs = prefill_reqs ,
7679 )
7780 continue
7881 elif run_way .is_decode ():
82+ # Synchronize streams once to guarantee that the operators issued in
83+ # _try_read_new_reqs have completed, so later inference steps do not read stale/incorrect GPU memory.
84+ g_infer_context .get_overlap_stream ().wait_stream (torch .cuda .current_stream ())
7985 self .decode (
8086 event_pack = event_pack ,
8187 decode_reqs = decode_reqs ,
import torch
import numpy as np
from torch.profiler import profile, record_function, ProfilerActivity

"""Minimal scaffold for profiling a snippet of CUDA code.

Drop the code under test inside the `with profile(...)` block, run the
script, then inspect the printed operator table or load the TensorBoard
trace written to ./log/.
"""

# Drain all pending GPU work so the profiling window starts from a clean state.
torch.cuda.synchronize()

profiler_ctx = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=False,
    profile_memory=False,
    # Emits a Chrome/TensorBoard trace file when the profile context exits.
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/"),
)
with profiler_ctx as prof:
    # test cuda code
    pass

# Summarize per-operator timings, slowest CUDA ops first.
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
You can’t perform that action at this time.
0 commit comments