diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp index 28e7574537c..594c00854a2 100644 --- a/backends/vulkan/runtime/VulkanBackend.cpp +++ b/backends/vulkan/runtime/VulkanBackend.cpp @@ -507,8 +507,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface { compute_graph->prepare(); compute_graph->prepare_pipelines(); - compute_graph->encode_prepack(); - compute_graph->prepack(); + compute_graph->run_prepack(); // If dynamic shapes are not expected, then the command buffer only needs to // be encoded once. Otherwise, wait until the first inference to encode the diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index cb14a41e98a..cafe2f5e502 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -145,7 +145,12 @@ ComputeGraph::ComputeGraph(GraphConfig config) execute_descriptor_counts_.descriptor_combined_sampler_count = 0; execute_descriptor_counts_.descriptor_storage_image_count = 0; - context_->set_cmd(/*reusable = */ true); + // If certain graph config variables are not specified, then set them + // automatically. 
+ if (config_.prepack_threshold_nbytes == 0) { + config_.prepack_threshold_nbytes = 10 * MB; + config_.prepack_initial_threshold_nbytes = 10 * MB; + } } ComputeGraph::~ComputeGraph() { @@ -431,6 +436,7 @@ ValueRef ComputeGraph::add_tensorref( ValueRef idx(static_cast<int>(values_.size())); check_no_active_value_ptrs(); values_.emplace_back(TensorRef(sizes, dtype, data)); + total_constant_nbytes_ += values_.back().toConstTensorRef().nbytes(); return idx; } @@ -750,6 +756,19 @@ void ComputeGraph::prepare_pipelines() { vkapi::ComputePipelineCache::Hasher>(); } +void ComputeGraph::submit_current_cmd(const bool final_use) { + context_->submit_cmd_to_gpu(VK_NULL_HANDLE, final_use); +} + +void ComputeGraph::submit_current_cmd_and_wait(const bool final_use) { + vkapi::VulkanFence fence = context_->fences().get_fence(); + context_->submit_cmd_to_gpu(fence.get_submit_handle(), final_use); + fence.wait(); + context_->fences().return_fence(fence); + + context_->flush(); +} + void ComputeGraph::encode_prepack() { for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) { node->encode(this); @@ -766,6 +785,37 @@ void ComputeGraph::prepack() const { context_->flush(); } +void ComputeGraph::run_prepack() { + int i = 0; + bool submitted = false; + const bool reduce_peak_memory = total_constant_nbytes_ > 500 * MB; + // int count = 0; + context_->set_cmd(); + for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) { + // Do not trigger on the first or last prepack node. + const bool not_terminal = i != 0 && i != (prepack_nodes_.size() - 1); + size_t threshold = submitted ? config_.prepack_threshold_nbytes + : config_.prepack_initial_threshold_nbytes; + if (not_terminal && staging_nbytes_in_cmd_ > threshold) { + // If reducing peak memory usage, wait for the current command buffer to + // finish executing and flush to recycle the staging memory. This will + // reduce peak memory usage, but will slightly increase load latency. + // Otherwise, just submit the current command buffer for execution and + // proceed. 
This results in lower load latency at the cost of higher peak + // memory usage. + reduce_peak_memory ? submit_current_cmd_and_wait() : submit_current_cmd(); + staging_nbytes_in_cmd_ = 0; + context_->set_cmd(); + submitted = true; + } + + node->encode(this); + i++; + } + submit_current_cmd_and_wait(/*final_use=*/true); + staging_nbytes_in_cmd_ = 0; +} + void ComputeGraph::encode_execute() { context_->flush(); context_->set_cmd(/*reusable = */ true); diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h index 78135a434e5..a8405bb312d 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.h +++ b/backends/vulkan/runtime/graph/ComputeGraph.h @@ -190,10 +190,20 @@ class ComputeGraph final { vkapi::ComputePipelineCache::Hasher> pipeline_descriptors_; + // Utility constexpr to express byte quantities + constexpr static size_t MB = 1024 * 1024; + protected: size_t values_in_use_ = 0; size_t execute_count_ = 0; + // Total number of bytes needed to store model weights + size_t total_constant_nbytes_ = 0; + + // Represents the amount of staging buffer data that will be copied if the + // current Context's command buffer is submitted now. + size_t staging_nbytes_in_cmd_ = 0; + public: // // Accessors @@ -812,13 +822,39 @@ class ComputeGraph final { copy_into_staging(const ValueRef idx, const void* data, const size_t numel); void copy_from_staging(const ValueRef idx, void* data, const size_t numel); + protected: + // Command Buffer Management + + /* + * Submits the current command buffer in the Context to the GPU for execution. + */ + void submit_current_cmd(const bool final_use = false); + + /* + * Submits the current command buffer in the Context to the GPU for execution, + * and wait for it to complete before returning. This function will also flush + * the Context after execution. 
+ */ + void submit_current_cmd_and_wait(const bool final_use = false); + + public: // // Graph Prepacking // + inline void update_staging_nbytes_in_cmd(const size_t staging_bytes) { + staging_nbytes_in_cmd_ += staging_bytes; + } + void encode_prepack(); void prepack() const; + /* + * Executes prepacking operations to transfer model weight data from the CPU + * to GPU. + */ + void run_prepack(); + // // Graph Execution // diff --git a/backends/vulkan/runtime/graph/GraphConfig.h b/backends/vulkan/runtime/graph/GraphConfig.h index 753ce8362af..33c7ae73e62 100644 --- a/backends/vulkan/runtime/graph/GraphConfig.h +++ b/backends/vulkan/runtime/graph/GraphConfig.h @@ -36,6 +36,20 @@ struct GraphConfig final { // Whether or not the ComputeGraph should expect input shapes to be dynamic bool expect_dynamic_shapes; + // Execution properties that determine specifics re: how command buffer + // submission is handled, etc. 0 means this field is not set. + + // During prepacking, once this threshold is reached, submit the current + // command buffer for execution. This allows the work to be distributed over + // multiple command buffer submissions, which can improve model load + // performance and prevent crashes when loading large models. + size_t prepack_threshold_nbytes = 0; + // Threshold used for the first command buffer submission during prepacking. + // This can be set to be lower than prepack_threshold_nbytes to + // submit a command buffer for execution earlier which can improve performance + // by taking more advantage of parallelism between the CPU and GPU. 
+ size_t prepack_initial_threshold_nbytes = 0; + vkapi::Adapter* external_adapter; // Generate a default graph config with pre-configured settings diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h index 9aa3716e28d..aaa92360a9e 100644 --- a/backends/vulkan/runtime/graph/containers/Constant.h +++ b/backends/vulkan/runtime/graph/containers/Constant.h @@ -28,6 +28,10 @@ struct TensorRef final { const std::vector<int64_t>& t_sizes, vkapi::ScalarType t_dtype, const void* const t_data); + + inline size_t nbytes() const { + return utils::multiply_integers(sizes) * vkapi::element_size(dtype); + } }; } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp index bdbecc866ab..05729172420 100644 --- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp +++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp @@ -62,6 +62,7 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) { TensorRefPtr tref = graph->get_tref(tref_); size_t numel = utils::multiply_integers(tref->sizes); api::StagingBuffer staging(graph->context(), tref->dtype, numel); + graph->update_staging_nbytes_in_cmd(staging.buffer().mem_size_as_size_t()); size_t nbytes = numel * vkapi::element_size(tref->dtype); staging.copy_from(tref->data, nbytes); return staging; diff --git a/backends/vulkan/runtime/vk_api/memory/Buffer.h b/backends/vulkan/runtime/vk_api/memory/Buffer.h index 0ef9f7e95e4..e1b441397b4 100644 --- a/backends/vulkan/runtime/vk_api/memory/Buffer.h +++ b/backends/vulkan/runtime/vk_api/memory/Buffer.h @@ -138,6 +138,10 @@ class VulkanBuffer final { return buffer_properties_.size; } + inline size_t mem_size_as_size_t() const { + return utils::safe_downcast<size_t>(mem_size()); + } + inline bool has_memory() const { return (memory_.allocation != VK_NULL_HANDLE); }