[ET-VK] Consolidate shader compilation into one vkCreateComputePipelines call

jorgep31415 · jorgep31415 · commit d0d965b828a9 · 2025-06-04T10:01:06.000-07:00
Pull Request resolved: #11345 We target the QC Adreno driver implementation of Vulkan. The Vulkan API does not enforce how QC actually uses the pipeline cache. As the plural naming of `vkCreateComputePipelines` suggests, we observed that its `createInfoCount`, `pCreateInfos` and `pPipelines` arguments allow construction of multiple compute pipelines in one invocation. We refactor ET-VK to accumulate the metadata necessary for pipeline construction and invoke `vkCreateComputePipelines` only once. QC's implementation makes the best use of the cache when we create the same number of compute pipelines in fewer invocations of `vkCreateComputePipelines`. This decreases model load time for a sample model from 1.7 s to 200 ms. ghstack-source-id: 288170368 Differential Revision: [D75763660](https://our.internmc.facebook.com/intern/diff/D75763660/)
diff --git a/backends/vulkan/runtime/VulkanBackend.cpp b/backends/vulkan/runtime/VulkanBackend.cpp
@@ -495,6 +495,7 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     builder.build_graph();
 
     compute_graph->prepare();
+    compute_graph->prepare_pipelines();
 
     compute_graph->encode_prepack();
     compute_graph->prepack();
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -561,6 +561,42 @@ void ComputeGraph::update_descriptor_counts(
   }
 }
 
+// Records the compute pipeline required by one dispatch so that it can be
+// built later, in bulk, by prepare_pipelines().
+//
+// The pipeline key is assembled from:
+//   - the pipeline layout retrieved for the shader's descriptor set layout and
+//     the offset accumulated from each push constant's write() call
+//     (presumably the total push constant size in bytes -- TODO confirm),
+//   - the shader module retrieved for `shader_info`,
+//   - the three components of `local_workgroup_size` followed by `spec_vars`,
+//     used as specialization constants.
+//
+// Nothing is recorded when the context's pipeline cache already contains the
+// key, or when the key is already pending in pipeline_descriptors_.
+void ComputeGraph::register_pipeline_to_create(
+    const vkapi::ShaderInfo& shader_info,
+    const utils::WorkgroupSize& local_workgroup_size,
+    const vkapi::SpecVarList& spec_vars,
+    const std::vector<PushConstantDataInfo>& push_constants) {
+  VkDescriptorSetLayout descriptor_set_layout =
+      context()->shader_layout_cache().retrieve(shader_info.kernel_layout);
+
+  // The push constants are written into a scratch buffer only to learn the
+  // final offset; the written bytes themselves are not read in this function.
+  std::array<uint8_t, kMaxPushConstantSize> scratch;
+  uint32_t push_constants_end = 0;
+  for (const auto& push_constant : push_constants) {
+    push_constants_end += push_constant.write(
+        scratch.data(), push_constants_end, kMaxPushConstantSize);
+  }
+
+  // The local workgroup size always occupies the first three specialization
+  // constants; caller-provided ones follow.
+  vkapi::SpecVarList all_spec_vars = {
+      SV(local_workgroup_size[0u]),
+      SV(local_workgroup_size[1u]),
+      SV(local_workgroup_size[2u])};
+  all_spec_vars.append(spec_vars);
+
+  const vkapi::ComputePipelineCache::Key pipeline_key = {
+      context()->pipeline_layout_cache().retrieve(
+          descriptor_set_layout, push_constants_end),
+      context()->shader_cache().retrieve(shader_info),
+      all_spec_vars};
+
+  const bool already_built = context_->pipeline_cache().contains(pipeline_key);
+  if (already_built ||
+      pipeline_descriptors_.find(pipeline_key) !=
+          pipeline_descriptors_.cend()) {
+    return;
+  }
+  pipeline_descriptors_.insert(pipeline_key);
+}
+
 utils::uvec3 ComputeGraph::create_global_wg_size(const ValueRef idx) {
   if (is_buffer_storage(idx)) {
     return {uint32_t(numel_of(idx)), 1u, 1u};
@@ -670,6 +706,20 @@ void ComputeGraph::prepare() {
   }
 }
 
+// Lets every prepack node and then every execute node register the pipelines
+// it needs, hands the collected descriptors to the context's pipeline cache
+// for creation, and finally discards the pending descriptors.
+void ComputeGraph::prepare_pipelines() {
+  for (auto& prepack_node : prepack_nodes_) {
+    prepack_node->prepare_pipelines(this);
+  }
+  for (auto& execute_node : execute_nodes_) {
+    execute_node->prepare_pipelines(this);
+  }
+
+  context_->pipeline_cache().create_pipelines(pipeline_descriptors_);
+
+  // Replace the pending set with a freshly constructed, empty one.
+  pipeline_descriptors_ = decltype(pipeline_descriptors_)();
+}
+
 void ComputeGraph::encode_prepack() {
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     node->encode(this);
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -185,6 +185,11 @@ class ComputeGraph final {
   std::vector<IOValueRef> inputs_;
   std::vector<IOValueRef> outputs_;
 
+  std::unordered_set<
+      vkapi::ComputePipelineCache::Key,
+      vkapi::ComputePipelineCache::Hasher>
+      pipeline_descriptors_;
+
  protected:
   size_t values_in_use_ = 0;
   size_t execute_count_ = 0;
@@ -711,8 +716,16 @@ class ComputeGraph final {
       const vkapi::ShaderInfo& shader_info,
       bool execute);
 
+  void register_pipeline_to_create(
+      const vkapi::ShaderInfo& shader_info,
+      const utils::WorkgroupSize& local_workgroup_size,
+      const vkapi::SpecVarList& spec_vars,
+      const std::vector<PushConstantDataInfo>& push_constants);
+
   void prepare();
 
+  void prepare_pipelines();
+
   //
   // Dispatch Utilities
   //
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.cpp b/backends/vulkan/runtime/graph/ops/DispatchNode.cpp
@@ -35,6 +35,11 @@ DispatchNode::DispatchNode(
   graph.update_descriptor_counts(shader, /*execute = */ true);
 }
 
+// Registers with `graph` the compute pipeline built from this node's shader,
+// local workgroup size, specialization variables and push constants.
+void DispatchNode::prepare_pipelines(ComputeGraph* graph) {
+  // encode() returns immediately when no shader is set, so such a node never
+  // binds a pipeline; skip registration to stay consistent with it.
+  if (!shader_) {
+    return;
+  }
+  graph->register_pipeline_to_create(
+      shader_, local_workgroup_size_, spec_vars_, push_constants_);
+}
+
 void DispatchNode::encode(ComputeGraph* graph) {
   if (!shader_) {
     return;
diff --git a/backends/vulkan/runtime/graph/ops/DispatchNode.h b/backends/vulkan/runtime/graph/ops/DispatchNode.h
@@ -40,6 +40,8 @@ class DispatchNode : public ExecuteNode {
 
   ~DispatchNode() override = default;
 
+  void prepare_pipelines(ComputeGraph* graph) override;
+
   void encode(ComputeGraph* graph) override;
 
  protected:
diff --git a/backends/vulkan/runtime/graph/ops/ExecuteNode.h b/backends/vulkan/runtime/graph/ops/ExecuteNode.h
@@ -61,6 +61,10 @@ class ExecuteNode {
 
   virtual ~ExecuteNode() = default;
 
+  // Hook through which a node can register the pipelines it needs with the
+  // graph. The base implementation does nothing; DispatchNode overrides it.
+  virtual void prepare_pipelines(ComputeGraph* graph) {
+    static_cast<void>(graph);
+  }
+
   virtual void encode(ComputeGraph* graph) {
     (void)graph;
   }
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -67,6 +67,13 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
   return staging;
 }
 
+// Registers with `graph` the two pipelines tied to this node: one for
+// shader_ and one for noop_shader_.
+void PrepackNode::prepare_pipelines(ComputeGraph* graph) {
+  graph->register_pipeline_to_create(
+      shader_, local_workgroup_size_, spec_vars_, push_constants_);
+
+  // The no-op shader is registered with a 1x1x1 local workgroup size and with
+  // neither specialization variables nor push constants.
+  const utils::WorkgroupSize noop_local_wg_size(1, 1, 1);
+  graph->register_pipeline_to_create(noop_shader_, noop_local_wg_size, {}, {});
+}
+
 void PrepackNode::encode(ComputeGraph* graph) {
   api::Context* const context = graph->context();
 
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.h b/backends/vulkan/runtime/graph/ops/PrepackNode.h
@@ -40,6 +40,8 @@ class PrepackNode final {
 
   ~PrepackNode() = default;
 
+  void prepare_pipelines(ComputeGraph* graph);
+
   void encode(ComputeGraph* graph);
 
   inline void set_node_id(uint32_t node_id) {
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp
@@ -270,6 +270,9 @@ void swap(PipelineLayout& lhs, PipelineLayout& rhs) noexcept {
 // ComputePipeline
 //
 
+// Wraps an existing VkPipeline handle together with its device. Unlike the
+// descriptor-based constructor, this one only stores the two handles.
+ComputePipeline::ComputePipeline(VkDevice device, VkPipeline handle)
+    : device_(device), handle_(handle) {}
+
 ComputePipeline::ComputePipeline(
     VkDevice device,
     const ComputePipeline::Descriptor& descriptor,
@@ -444,19 +447,94 @@ ComputePipelineCache::~ComputePipelineCache() {
   pipeline_cache_ = VK_NULL_HANDLE;
 }
 
+// Returns true when an entry for `key` is already stored in the cache. The
+// lookup is performed while holding cache_mutex_.
+bool ComputePipelineCache::contains(const ComputePipelineCache::Key& key) {
+  std::lock_guard<std::mutex> guard(cache_mutex_);
+  return cache_.count(key) != 0;
+}
+
+// Creates all compute pipelines described by `descriptors` that are not yet in
+// the cache, using a single vkCreateComputePipelines call, and stores the
+// resulting handles in the cache.
+//
+// Keys that are already cached are filtered out up front. This keeps the
+// created handles aligned one-to-one with the keys they are inserted under and
+// avoids creating a pipeline that would never be stored. If nothing is left to
+// create, the function returns without calling into Vulkan, because
+// vkCreateComputePipelines requires createInfoCount to be greater than zero.
+//
+// The whole operation runs while holding cache_mutex_.
+void ComputePipelineCache::create_pipelines(
+    const std::unordered_set<Key, Hasher>& descriptors) {
+  std::lock_guard<std::mutex> lock(cache_mutex_);
+
+  // Pointers stay valid because `descriptors` is not modified here.
+  std::vector<const Key*> pending;
+  pending.reserve(descriptors.size());
+  for (const auto& key : descriptors) {
+    if (cache_.find(key) == cache_.cend()) {
+      pending.push_back(&key);
+    }
+  }
+
+  if (pending.empty()) {
+    return;
+  }
+
+  const auto num_pipelines = pending.size();
+  std::vector<VkPipeline> pipelines(num_pipelines);
+
+  // Every vector below is reserved to its final size before the loop so that
+  // the addresses taken with back() / data() remain valid until the Vulkan
+  // call; push_back never reallocates within the reserved capacity.
+  std::vector<std::vector<VkSpecializationMapEntry>> map_entries;
+  map_entries.reserve(num_pipelines);
+
+  std::vector<VkSpecializationInfo> specialization_infos;
+  specialization_infos.reserve(num_pipelines);
+
+  std::vector<VkPipelineShaderStageCreateInfo> shader_stage_create_infos;
+  shader_stage_create_infos.reserve(num_pipelines);
+
+  std::vector<VkComputePipelineCreateInfo> create_infos;
+  create_infos.reserve(num_pipelines);
+
+  for (const Key* key : pending) {
+    map_entries.push_back(key->specialization_constants.generate_map_entries());
+
+    specialization_infos.push_back(VkSpecializationInfo{
+        key->specialization_constants.size(), // mapEntryCount
+        map_entries.back().data(), // pMapEntries
+        key->specialization_constants.data_nbytes(), // dataSize
+        key->specialization_constants.data(), // pData
+    });
+
+    shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{
+        VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType
+        nullptr, // pNext
+        0u, // flags
+        VK_SHADER_STAGE_COMPUTE_BIT, // stage
+        key->shader_module, // module
+        "main", // pName
+        &specialization_infos.back(), // pSpecializationInfo
+    });
+
+    create_infos.push_back(VkComputePipelineCreateInfo{
+        VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, // sType
+        nullptr, // pNext
+        0u, // flags
+        shader_stage_create_infos.back(), // stage
+        key->pipeline_layout, // layout
+        VK_NULL_HANDLE, // basePipelineHandle
+        0u, // basePipelineIndex
+    });
+  }
+
+  VK_CHECK(vkCreateComputePipelines(
+      device_,
+      pipeline_cache_,
+      static_cast<uint32_t>(create_infos.size()),
+      create_infos.data(),
+      nullptr,
+      pipelines.data()));
+
+  // pipelines[i] was created from create_infos[i], which was built from
+  // pending[i], so the handle and the key always match.
+  for (size_t i = 0; i < num_pipelines; ++i) {
+    cache_.insert(
+        {*pending[i], ComputePipelineCache::Value(device_, pipelines[i])});
+  }
+}
+
 VkPipeline ComputePipelineCache::retrieve(
     const ComputePipelineCache::Key& key) {
   std::lock_guard<std::mutex> lock(cache_mutex_);
 
   auto it = cache_.find(key);
-  if (cache_.cend() == it) {
+  if (it == cache_.cend()) {
     it = cache_
              .insert(
                  {key,
                   ComputePipelineCache::Value(device_, key, pipeline_cache_)})
              .first;
   }
-
   return it->second.handle();
 }
 
diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h
@@ -19,6 +19,7 @@
 
 #include <mutex>
 #include <unordered_map>
+#include <unordered_set>
 
 #define SV(x) ::vkcompute::vkapi::SpecVar(x)
 
@@ -158,6 +159,8 @@ class ComputePipeline final {
     SpecVarList specialization_constants;
   };
 
+  explicit ComputePipeline(VkDevice device, VkPipeline handle);
+
   explicit ComputePipeline(
       VkDevice device,
       const Descriptor& descriptor,
@@ -185,6 +188,10 @@ class ComputePipeline final {
   // does not allow for move assignment. The swap function will
   // be used in the hash map.
   friend void swap(ComputePipeline& lhs, ComputePipeline& rhs) noexcept;
+
+  friend bool operator==(
+      const ComputePipeline::Descriptor& _1,
+      const ComputePipeline::Descriptor& _2);
 };
 
 class PipelineLayoutCache final {
@@ -293,6 +300,8 @@ class ComputePipelineCache final {
   const std::string cache_data_path_;
 
  public:
+  bool contains(const Key&);
+  void create_pipelines(const std::unordered_set<Key, Hasher>&);
   VkPipeline retrieve(const Key&);
   void purge();
 };

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,10 @@ class ExecuteNode {`
`61`	`61`
`62`	`62`	`virtual ~ExecuteNode() = default;`
`63`	`63`
	`64`	`+ virtual void prepare_pipelines(ComputeGraph* graph) {`
	`65`	`+ (void)graph;`
	`66`	`+ }`
	`67`	`+`
`64`	`68`	`virtual void encode(ComputeGraph* graph) {`
`65`	`69`	`(void)graph;`
`66`	`70`	`}`