diff --git a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
index f2b0fab49..35e68136d 100644
--- a/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
+++ b/lightllm/server/router/model_infer/mode_backend/chunked_prefill/impl.py
@@ -70,12 +70,18 @@ def infer_loop(self):
             run_way = self.control_state_machine.select_run_way(prefill_reqs=prefill_reqs, decode_reqs=decode_reqs)
 
             if run_way.is_prefill():
+                # Synchronize the streams once to guarantee that the operator launches issued in _try_read_new_reqs have finished,
+                # preventing the subsequent inference steps from reading possibly incorrect data from GPU memory.
+                g_infer_context.get_overlap_stream().wait_stream(torch.cuda.current_stream())
                 self.prefill(
                     event_pack=event_pack,
                     prefill_reqs=prefill_reqs,
                 )
                 continue
             elif run_way.is_decode():
+                # Synchronize the streams once to guarantee that the operator launches issued in _try_read_new_reqs have finished,
+                # preventing the subsequent inference steps from reading possibly incorrect data from GPU memory.
+                g_infer_context.get_overlap_stream().wait_stream(torch.cuda.current_stream())
                 self.decode(
                     event_pack=event_pack,
                     decode_reqs=decode_reqs,
diff --git a/test/benchmark/static_inference/profile_demo.py b/test/benchmark/static_inference/profile_demo.py
new file mode 100644
index 000000000..cfe5d315f
--- /dev/null
+++ b/test/benchmark/static_inference/profile_demo.py
@@ -0,0 +1,15 @@
+import torch
+import numpy as np
+from torch.profiler import profile, record_function, ProfilerActivity
+
+torch.cuda.synchronize()
+with profile(
+    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+    record_shapes=False,
+    profile_memory=False,
+    on_trace_ready=torch.profiler.tensorboard_trace_handler("./log/"),
+) as prof:
+    # test cuda code
+    pass
+
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=20))
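
Note for reviewers: the following is a minimal, self-contained sketch (not part of the patch) of the wait_stream() ordering pattern the added lines in impl.py rely on; `side_stream` is an illustrative stand-in for g_infer_context.get_overlap_stream(). Work queued on the second stream only starts after the kernels already queued on the current stream have completed.

# Standalone sketch of the wait_stream() ordering pattern (illustrative names, not patch code).
import torch

if torch.cuda.is_available():
    side_stream = torch.cuda.Stream()  # stand-in for the overlap stream

    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x  # queued on the default (current) stream

    # Order the side stream after everything already queued on the current stream,
    # so kernels launched on it below cannot observe a half-written `y`.
    side_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side_stream):
        z = y.sum()

    # Rejoin before the default stream consumes the result produced on the side stream.
    torch.cuda.current_stream().wait_stream(side_stream)
    print(z.item())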