Skip to content

[ET-VK] Split up prepack command buffer #12442

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions backends/vulkan/runtime/VulkanBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
compute_graph->prepare();
compute_graph->prepare_pipelines();

compute_graph->encode_prepack();
compute_graph->prepack();
compute_graph->run_prepack();

// If dynamic shapes are not expected, then the command buffer only needs to
// be encoded once. Otherwise, wait until the first inference to encode the
Expand Down
52 changes: 51 additions & 1 deletion backends/vulkan/runtime/graph/ComputeGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,12 @@ ComputeGraph::ComputeGraph(GraphConfig config)
execute_descriptor_counts_.descriptor_combined_sampler_count = 0;
execute_descriptor_counts_.descriptor_storage_image_count = 0;

context_->set_cmd(/*reusable = */ true);
// If certain graph config variables are not specified, then set them
// automatically.
if (config_.prepack_threshold_nbytes == 0) {
config_.prepack_threshold_nbytes = 10 * MB;
config_.prepack_initial_threshold_nbytes = 10 * MB;
}
}

ComputeGraph::~ComputeGraph() {
Expand Down Expand Up @@ -431,6 +436,7 @@ ValueRef ComputeGraph::add_tensorref(
ValueRef idx(static_cast<int>(values_.size()));
check_no_active_value_ptrs();
values_.emplace_back(TensorRef(sizes, dtype, data));
total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes();
return idx;
}

Expand Down Expand Up @@ -750,6 +756,19 @@ void ComputeGraph::prepare_pipelines() {
vkapi::ComputePipelineCache::Hasher>();
}

// Hands the Context's current command buffer to the GPU and returns without
// waiting on it. VK_NULL_HANDLE is passed in the fence argument, so this
// function does not attach a fence to the submission. `final_use` is forwarded
// unchanged to the Context.
void ComputeGraph::submit_current_cmd(const bool final_use) {
  context_->submit_cmd_to_gpu(
      /*fence=*/VK_NULL_HANDLE,
      final_use);
}

void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) {
vkapi::VulkanFence fence = context_->fences().get_fence();
context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use);
fence.wait();
context_->fences().return_fence(fence);

context_->flush();
}

void ComputeGraph::encode_prepack() {
for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
node->encode(this);
Expand All @@ -766,6 +785,37 @@ void ComputeGraph::prepack() const {
context_->flush();
}

// Encodes every prepack node and submits the resulting work to the GPU,
// splitting it across several command buffer submissions once the amount of
// staging data recorded in the current command buffer exceeds a threshold.
//
// The first split uses config_.prepack_initial_threshold_nbytes; every later
// split uses config_.prepack_threshold_nbytes. The final command buffer is
// always submitted with final_use = true and waited on before returning.
void ComputeGraph::run_prepack() {
  // When the total size of constant tensors exceeds 500 MB, wait on each
  // intermediate submission instead of submitting and moving on.
  const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB;
  const size_t num_nodes = prepack_nodes_.size();

  bool submitted = false;
  // Unsigned index so it compares against the node count without a
  // signed/unsigned mismatch.
  size_t node_idx = 0;

  context_->set_cmd();
  for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
    // Do not trigger on the first or last prepack node.
    const bool not_terminal = node_idx != 0 && (node_idx + 1) != num_nodes;
    const size_t threshold = submitted
        ? config_.prepack_threshold_nbytes
        : config_.prepack_initial_threshold_nbytes;

    if (not_terminal && staging_nbytes_in_cmd_ > threshold) {
      // If reducing peak memory usage, wait for the current command buffer to
      // finish executing and flush to recycle the staging memory. This will
      // reduce peak memory usage, but will slightly increase load latency.
      // Otherwise, just submit the current command buffer for execution and
      // proceed. This results in lower load latency at the cost of higher peak
      // memory usage.
      if (reduce_peak_memory) {
        submit_current_cmd_and_wait();
      } else {
        submit_current_cmd();
      }
      staging_nbytes_in_cmd_ = 0;
      context_->set_cmd();
      submitted = true;
    }

    node->encode(this);
    ++node_idx;
  }

  // Submit whatever remains and block until it has completed.
  submit_current_cmd_and_wait(/*final_use=*/true);
  staging_nbytes_in_cmd_ = 0;
}

void ComputeGraph::encode_execute() {
context_->flush();
context_->set_cmd(/*reusable = */ true);
Expand Down
36 changes: 36 additions & 0 deletions backends/vulkan/runtime/graph/ComputeGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -190,10 +190,20 @@ class ComputeGraph final {
vkapi::ComputePipelineCache::Hasher>
pipeline_descriptors_;

// Utility constexpr to express byte quantities
constexpr static size_t MB = 1024 * 1024;

protected:
size_t values_in_use_ = 0;
size_t execute_count_ = 0;

// Total number of bytes needed to store model weights
size_t total_constant_nbytes_ = 0;

// Represents the amount of staging buffer data that will be copied if the
// current Context's command buffer is submitted now.
size_t staging_nbytes_in_cmd_ = 0;

public:
//
// Accessors
Expand Down Expand Up @@ -812,13 +822,39 @@ class ComputeGraph final {
copy_into_staging(const ValueRef idx, const void* data, const size_t numel);
void copy_from_staging(const ValueRef idx, void* data, const size_t numel);

protected:
// Command Buffer Management

/*
* Submits the current command buffer in the Context to the GPU for execution.
*/
void submit_current_cmd(const bool final_use = false);

/*
* Submits the current command buffer in the Context to the GPU for execution,
* and waits for it to complete before returning. This function will also flush
* the Context after execution.
*/
void submit_current_cmd_and_wait(const bool final_use = false);

public:
//
// Graph Prepacking
//

// Adds `staging_bytes` to the running count of staging buffer bytes
// associated with the Context's current command buffer.
inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) {
  staging_nbytes_in_cmd_ = staging_nbytes_in_cmd_ + staging_bytes;
}

void encode_prepack();
void prepack() const;

/*
* Executes prepacking operations to transfer model weight data from the CPU
* to GPU.
*/
void run_prepack();

//
// Graph Execution
//
Expand Down
14 changes: 14 additions & 0 deletions backends/vulkan/runtime/graph/GraphConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,20 @@ struct GraphConfig final {
// Whether or not the ComputeGraph should expect input shapes to be dynamic
bool expect_dynamic_shapes;

// Execution properties that determine specifics re: how command buffer
// submission is handled, etc. 0 means this field is not set.

// During prepacking, once this threshold is reached, submit the current
// command buffer for execution. This allows the work to be distributed over
// multiple command buffer submissions, which can improve model load
// performance and prevent crashes when loading large models.
size_t prepack_threshold_nbytes = 0;
// Threshold used for the first command buffer submission during prepacking.
// This can be set to be lower than prepack_threshold_nbytes to submit a
// command buffer for execution earlier, which can improve performance by
// taking more advantage of parallelism between the CPU and GPU.
size_t prepack_initial_threshold_nbytes = 0;

vkapi::Adapter* external_adapter;

// Generate a default graph config with pre-configured settings
Expand Down
4 changes: 4 additions & 0 deletions backends/vulkan/runtime/graph/containers/Constant.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ struct TensorRef final {
const std::vector<int64_t>& t_sizes,
vkapi::ScalarType t_dtype,
const void* const t_data);

// Size of the referenced tensor data in bytes: the product of all entries in
// `sizes`, multiplied by the element size of `dtype`.
inline size_t nbytes() const {
  const auto element_count = utils::multiply_integers(sizes);
  const auto bytes_per_element = vkapi::element_size(dtype);
  return element_count * bytes_per_element;
}
};

} // namespace vkcompute
1 change: 1 addition & 0 deletions backends/vulkan/runtime/graph/ops/PrepackNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
TensorRefPtr tref = graph->get_tref(tref_);
size_t numel = utils::multiply_integers(tref->sizes);
api::StagingBuffer staging(graph->context(), tref->dtype, numel);
graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t());
size_t nbytes = numel * vkapi::element_size(tref->dtype);
staging.copy_from(tref->data, nbytes);
return staging;
Expand Down
4 changes: 4 additions & 0 deletions backends/vulkan/runtime/vk_api/memory/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ class VulkanBuffer final {
return buffer_properties_.size;
}

// Returns mem_size() converted to size_t via utils::safe_downcast.
inline size_t mem_size_as_size_t() const {
  const auto reported_size = mem_size();
  return utils::safe_downcast<size_t>(reported_size);
}

// True when this buffer's allocation handle is anything other than
// VK_NULL_HANDLE.
inline bool has_memory() const {
  const bool allocation_present = memory_.allocation != VK_NULL_HANDLE;
  return allocation_present;
}
Expand Down
Loading