Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vulkan] Added support for RPI 5 #8548

Draft
wants to merge 7 commits into
base: main
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 13 additions & 2 deletions src/Target.cpp
Original file line number Diff line number Diff line change
@@ -1304,17 +1304,28 @@ int Target::get_arm_v8_lower_bound() const {
}

bool Target::supports_type(const Type &t) const {
if (has_feature(Vulkan)) {
if (t.is_float() && t.bits() == 64) {
return has_feature(Target::VulkanFloat64);
} else if (t.is_float() && t.bits() == 16) {
return has_feature(Target::VulkanFloat16);
} else if (t.is_int_or_uint() && t.bits() == 64) {
return has_feature(Target::VulkanInt64);
} else if (t.is_int_or_uint() && t.bits() == 16) {
return has_feature(Target::VulkanInt16);
} else if (t.is_int_or_uint() && t.bits() == 8) {
return has_feature(Target::VulkanInt8);
}
}
if (t.bits() == 64) {
if (t.is_float()) {
return (!has_feature(Metal) &&
!has_feature(D3D12Compute) &&
(!has_feature(Target::OpenCL) || has_feature(Target::CLDoubles)) &&
(!has_feature(Vulkan) || has_feature(Target::VulkanFloat64)) &&
!has_feature(WebGPU));
} else {
return (!has_feature(Metal) &&
!has_feature(D3D12Compute) &&
(!has_feature(Vulkan) || has_feature(Target::VulkanInt64)) &&
!has_feature(WebGPU));
}
}
2 changes: 1 addition & 1 deletion src/runtime/internal/memory_resources.h
Original file line number Diff line number Diff line change
@@ -151,7 +151,7 @@ ALWAYS_INLINE size_t aligned_size(size_t offset, size_t size, size_t alignment)
ALWAYS_INLINE size_t conform_size(size_t offset, size_t size, size_t alignment, size_t nearest_multiple) {
size_t adjusted_size = aligned_size(offset, size, alignment);
adjusted_size = (alignment > adjusted_size) ? alignment : adjusted_size;
if (nearest_multiple > 0) {
if ((nearest_multiple > 0) && ((adjusted_size % nearest_multiple) != 0)) {
size_t rounded_size = (((adjusted_size + nearest_multiple - 1) / nearest_multiple) * nearest_multiple);
return rounded_size;
} else {
54 changes: 48 additions & 6 deletions src/runtime/internal/region_allocator.h
Original file line number Diff line number Diff line change
@@ -74,7 +74,7 @@ class RegionAllocator {
BlockRegion *coalesce_block_regions(void *user_context, BlockRegion *region);

// Returns true if the given region can be split to accommodate the given size
bool can_split(const BlockRegion *region, const MemoryRequest &request) const;
bool can_split(void *use_context, const BlockRegion *region, const MemoryRequest &request) const;

// Splits the given block region into a smaller region to accommodate the given size, followed by empty space for the remaining
BlockRegion *split_block_region(void *user_context, BlockRegion *region, const MemoryRequest &request);
@@ -195,7 +195,7 @@ MemoryRegion *RegionAllocator::reserve(void *user_context, const MemoryRequest &
return nullptr;
}

if (can_split(block_region, region_request)) {
if (can_split(user_context, block_region, region_request)) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Splitting region of size ( " << (int32_t)(block_region->memory.size) << ") "
<< "to accomodate requested size (" << (int32_t)(region_request.size) << " bytes)";
@@ -443,8 +443,29 @@ BlockRegion *RegionAllocator::coalesce_block_regions(void *user_context, BlockRe
return block_region;
}

bool RegionAllocator::can_split(const BlockRegion *block_region, const MemoryRequest &split_request) const {
return (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0));
bool RegionAllocator::can_split(void *user_context, const BlockRegion *block_region, const MemoryRequest &split_request) const {

// See if we can actually split the block region and create empty space big enough
if (block_region && (block_region->memory.size > split_request.size) && (block_region->usage_count == 0)) {

// We can only split if there's still room left after conforming the allocation request, since the
// conform method may actually grow the requested size to accommodate alignment constraints
MemoryRequest test_request = split_request;
test_request.size = block_region->memory.size - test_request.size;
test_request.offset = block_region->memory.offset + test_request.size;
int error_code = conform(user_context, &test_request);
if (error_code) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(nullptr) << "RegionAllocator: Failed to conform test request for splitting block region!\n";
#endif
return false;
}

if ((block_region->memory.size - test_request.size) > 0) {
return true;
}
}
return false;
}

BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion *block_region, const MemoryRequest &request) {
@@ -470,8 +491,9 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion

#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Splitting "
<< "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) "
<< "to create empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)";
<< "current region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes) into ...\n\t"
<< "existing region (offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size - split_request.size) << " bytes)\n\t"
<< "empty region (offset=" << (int32_t)split_request.offset << " size=" << (int32_t)(split_request.size) << " bytes)\n";
#endif
BlockRegion *next_region = block_region->next_ptr;
BlockRegion *empty_region = create_block_region(user_context, split_request);
@@ -484,6 +506,12 @@ BlockRegion *RegionAllocator::split_block_region(void *user_context, BlockRegion
empty_region->prev_ptr = block_region;
block_region->next_ptr = empty_region;
block_region->memory.size -= empty_region->memory.size;

#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << "RegionAllocator: Split block region into ...\n\t"
<< "existing region (ptr=" << (void *)block_region << " prev_ptr=" << block_region->prev_ptr << " next_ptr=" << block_region->next_ptr << " offset=" << (int32_t)block_region->memory.offset << " size=" << (int32_t)(block_region->memory.size) << " bytes)\n\t"
<< "empty region (ptr=" << (void *)empty_region << " prev_ptr=" << empty_region->prev_ptr << " next_ptr=" << empty_region->next_ptr << " offset=" << (int32_t)empty_region->memory.offset << " size=" << (int32_t)(empty_region->memory.size) << " bytes)\n";
#endif
return empty_region;
}

@@ -605,8 +633,22 @@ int RegionAllocator::alloc_block_region(void *user_context, BlockRegion *block_r
#endif
halide_abort_if_false(user_context, allocators.region.allocate != nullptr);
halide_abort_if_false(user_context, block_region->status == AllocationStatus::Available);

int error_code = 0;
MemoryRegion *memory_region = &(block_region->memory);
if (memory_region->size <= 0) {
#ifdef DEBUG_RUNTIME_INTERNAL
debug(user_context) << " skipping zero size region ("
<< "block_ptr=" << (void *)block_region->block_ptr << " "
<< "block_region=" << (void *)block_region << " "
<< "memory_offset=" << (uint32_t)(block_region->memory.offset) << " "
<< "memory_size=" << (uint32_t)(block_region->memory.size) << " "
<< "block_reserved=" << (uint32_t)block->reserved << " "
<< ")\n";
#endif
return error_code;
}

if (memory_region->handle == nullptr) {
error_code = allocators.region.allocate(user_context, memory_region);
memory_region->is_owner = true;
92 changes: 63 additions & 29 deletions src/runtime/vulkan.cpp
Original file line number Diff line number Diff line change
@@ -1193,13 +1193,6 @@ WEAK int halide_vulkan_run(void *user_context,
}
}
}

// 2b. Create the pipeline layout
error_code = vk_create_pipeline_layout(user_context, ctx.allocator, shader_module->shader_count, shader_module->descriptor_set_layouts, &(shader_module->pipeline_layout));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to create pipeline layout!\n";
return error_code;
}
}

VulkanDispatchData dispatch_data = {};
@@ -1213,16 +1206,8 @@ WEAK int halide_vulkan_run(void *user_context,

VulkanShaderBinding *entry_point_binding = (shader_module->shader_bindings + entry_point_index);

// 2c. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size)
error_code = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, shader_module->shader_module, shader_module->pipeline_layout, &(entry_point_binding->compute_pipeline));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to setup compute pipeline!\n";
return error_code;
}

// 2d. Create a descriptor set
if (entry_point_binding->descriptor_set == VK_NULL_HANDLE) {

// 2c. If Push Descriptor Set isn't supported, then allocate a descriptor set
if ((vkCmdPushDescriptorSetKHR == nullptr) && (entry_point_binding->descriptor_set == VK_NULL_HANDLE)) {
// Construct a descriptor pool
//
// NOTE: while this could be re-used across multiple pipelines, we only know the storage requirements of this kernel's
@@ -1244,7 +1229,7 @@ WEAK int halide_vulkan_run(void *user_context,
}
}

// 3a. Create a buffer for the scalar parameters
// 2d. Create a buffer for the scalar parameters
if ((entry_point_binding->args_region == nullptr) && entry_point_binding->uniform_buffer_count) {
size_t scalar_buffer_size = vk_estimate_scalar_uniform_buffer_size(user_context, arg_sizes, args, arg_is_buffer);
if (scalar_buffer_size > 0) {
@@ -1256,7 +1241,7 @@ WEAK int halide_vulkan_run(void *user_context,
}
}

// 3b. Update uniform buffer with scalar parameters
// 2e. Update uniform buffer with scalar parameters
VkBuffer *args_buffer = nullptr;
if ((entry_point_binding->args_region != nullptr) && entry_point_binding->uniform_buffer_count) {
error_code = vk_update_scalar_uniform_buffer(user_context, ctx.allocator, entry_point_binding->args_region, arg_sizes, args, arg_is_buffer);
@@ -1272,10 +1257,28 @@ WEAK int halide_vulkan_run(void *user_context,
}
}

// 3c. Update buffer bindings for descriptor set
error_code = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set);
// 2f. If Push Descriptor Set isn't supported, then update the buffer bindings for the allocated descriptor set
if (vkCmdPushDescriptorSetKHR == nullptr) {
error_code = vk_update_descriptor_set(user_context, ctx.allocator, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer, entry_point_binding->descriptor_set);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
return error_code;
}
}

// 2b. Create the pipeline layout
if (shader_module->pipeline_layout == VK_NULL_HANDLE) {
error_code = vk_create_pipeline_layout(user_context, ctx.allocator, shader_module->shader_count, shader_module->descriptor_set_layouts, &(shader_module->pipeline_layout));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to create pipeline layout!\n";
return error_code;
}
}

// 3. Setup the compute pipeline (eg override any specializations for shared mem or workgroup size)
error_code = vk_setup_compute_pipeline(user_context, ctx.allocator, entry_point_binding, &dispatch_data, shader_module->shader_module, shader_module->pipeline_layout, &(entry_point_binding->compute_pipeline));
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
error(user_context) << "Vulkan: Failed to setup compute pipeline!\n";
return error_code;
}

@@ -1287,18 +1290,49 @@ WEAK int halide_vulkan_run(void *user_context,
}

// 5. Fill the command buffer
error_code = vk_fill_command_buffer_with_dispatch_call(user_context,
ctx.device, cmds.command_buffer,
entry_point_binding->compute_pipeline,
shader_module->pipeline_layout,
entry_point_binding->descriptor_set,
entry_point_index,
blocksX, blocksY, blocksZ);
error_code = vk_begin_command_buffer(user_context, cmds.command_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to start command buffer for dispatch call!\n";
return error_code;
}
error_code = vk_bind_pipeline(user_context, cmds.command_buffer, entry_point_binding->compute_pipeline);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to bind compute pipeline to command buffer for dispatch call!\n";
return error_code;
}

if (vkCmdPushDescriptorSetKHR != nullptr) {
error_code = vk_push_descriptor_set(user_context, ctx.allocator, cmds.command_buffer, entry_point_binding->compute_pipeline, shader_module->pipeline_layout, entry_point_binding->descriptor_set, args_buffer, entry_point_binding->uniform_buffer_count, entry_point_binding->storage_buffer_count, arg_sizes, args, arg_is_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to update descriptor set!\n";
return error_code;
}
} else {
error_code = vk_bind_descriptor_sets(user_context, cmds.command_buffer, shader_module->pipeline_layout, entry_point_binding->descriptor_set, entry_point_index);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to bind descriptor set to command buffer for dispatch call!\n";
return error_code;
}
}

error_code = vk_dispatch_kernel(user_context,
ctx.device, cmds.command_buffer,
entry_point_binding->compute_pipeline,
shader_module->pipeline_layout,
entry_point_binding->descriptor_set,
entry_point_index,
blocksX, blocksY, blocksZ);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to fill command buffer with dispatch call!\n";
return error_code;
}

error_code = vk_end_command_buffer(user_context, cmds.command_buffer);
if (error_code != halide_error_code_success) {
error(user_context) << "Vulkan: Failed to end command buffer for dispatch call!\n";
return error_code;
}

// 6. Submit the command buffer to our command queue
error_code = vk_submit_command_buffer(user_context, ctx.queue, cmds.command_buffer);
if (error_code != halide_error_code_success) {
10 changes: 9 additions & 1 deletion src/runtime/vulkan_extensions.h
Original file line number Diff line number Diff line change
@@ -203,10 +203,18 @@ uint32_t vk_get_required_device_extensions(void *user_context, StringTable &ext_
uint32_t vk_get_optional_device_extensions(void *user_context, StringTable &ext_table) {
const char *optional_ext_table[] = {
"VK_KHR_portability_subset", //< necessary for running under Molten (aka Vulkan on Mac)
VK_KHR_MAINTENANCE_1_EXTENSION_NAME,
VK_KHR_MAINTENANCE_2_EXTENSION_NAME,
VK_KHR_MAINTENANCE_3_EXTENSION_NAME,
VK_KHR_MAINTENANCE_4_EXTENSION_NAME,
VK_KHR_MAINTENANCE_5_EXTENSION_NAME,
VK_KHR_MAINTENANCE_6_EXTENSION_NAME,
VK_KHR_MAINTENANCE_7_EXTENSION_NAME,
VK_KHR_16BIT_STORAGE_EXTENSION_NAME,
VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME,
VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME};
VK_KHR_SHADER_FLOAT_CONTROLS_EXTENSION_NAME,
VK_KHR_PUSH_DESCRIPTOR_EXTENSION_NAME};

const uint32_t optional_ext_count = sizeof(optional_ext_table) / sizeof(optional_ext_table[0]);
ext_table.fill(user_context, (const char **)optional_ext_table, optional_ext_count);
return optional_ext_count;
1 change: 1 addition & 0 deletions src/runtime/vulkan_functions.h
Original file line number Diff line number Diff line change
@@ -204,6 +204,7 @@ VULKAN_FN(vkCmdCopyBuffer2)
// VULKAN_FN(vkCmdCopyImageToBuffer2)
// VULKAN_FN(vkCmdEndRendering)
VULKAN_FN(vkCmdPipelineBarrier2)
VULKAN_FN(vkCmdPushDescriptorSetKHR)
VULKAN_FN(vkCmdResetEvent2)
// VULKAN_FN(vkCmdResolveImage2)
// VULKAN_FN(vkCmdSetCullMode)
58 changes: 50 additions & 8 deletions src/runtime/vulkan_internal.h
Original file line number Diff line number Diff line change
@@ -106,14 +106,19 @@ int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocator *allocat

struct ScopedVulkanCommandBufferAndPool;

int vk_fill_command_buffer_with_dispatch_call(void *user_context,
VkDevice device,
VkCommandBuffer command_buffer,
VkPipeline compute_pipeline,
VkPipelineLayout pipeline_layout,
VkDescriptorSet descriptor_set,
uint32_t descriptor_set_index,
int blocksX, int blocksY, int blocksZ);
int vk_begin_command_buffer(void *user_context, VkCommandBuffer command_buffer);
int vk_end_command_buffer(void *user_context, VkCommandBuffer command_buffer);
int vk_bind_pipeline(void *user_context, VkCommandBuffer command_buffer, VkPipeline compute_pipeline);
int vk_bind_descriptor_sets_to_command_buffer(void *user_context, VkCommandBuffer command_buffer, VkPipeline compute_pipeline);

int vk_dispatch_kernel(void *user_context,
VkDevice device,
VkCommandBuffer command_buffer,
VkPipeline compute_pipeline,
VkPipelineLayout pipeline_layout,
VkDescriptorSet descriptor_set,
uint32_t descriptor_set_index,
int blocksX, int blocksY, int blocksZ);

int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer command_buffer);

@@ -175,6 +180,24 @@ int vk_create_descriptor_set(void *user_context,
VkDescriptorPool descriptor_pool,
VkDescriptorSet *descriptor_set);

int vk_get_descriptor_buffer_info(void *user_context,
VulkanMemoryAllocator *allocator,
VkDescriptorSet descriptor_set,
VkBuffer *scalar_args_buffer,
size_t uniform_buffer_count,
size_t storage_buffer_count,
size_t arg_sizes[],
void *args[],
int8_t arg_is_buffer[],
BlockStorage *descriptor_buffer_info_result);

int vk_get_write_descriptor_set_info(void *user_context,
VulkanMemoryAllocator *allocator,
BlockStorage *descriptor_buffer_info,
VkDescriptorSet descriptor_set,
VkBuffer *scalar_args_buffer,
BlockStorage *write_descriptor_set_result);

int vk_update_descriptor_set(void *user_context,
VulkanMemoryAllocator *allocator,
VkBuffer *scalar_args_buffer,
@@ -185,6 +208,25 @@ int vk_update_descriptor_set(void *user_context,
int8_t arg_is_buffer[],
VkDescriptorSet descriptor_set);

int vk_bind_descriptor_sets(void *user_context,
VkCommandBuffer command_buffer,
VkPipelineLayout pipeline_layout,
VkDescriptorSet descriptor_set,
uint32_t descriptor_set_index);

int vk_push_descriptor_set(void *user_context,
VulkanMemoryAllocator *allocator,
VkCommandBuffer command_buffer,
VkPipeline compute_pipeline,
VkPipelineLayout pipeline_layout,
VkDescriptorSet descriptor_set,
VkBuffer *scalar_args_buffer,
size_t uniform_buffer_count,
size_t storage_buffer_count,
size_t arg_sizes[],
void *args[],
int8_t arg_is_buffer[]);

// -- Pipeline Layout
int vk_create_pipeline_layout(void *user_context,
VulkanMemoryAllocator *allocator,
22 changes: 15 additions & 7 deletions src/runtime/vulkan_memory.h
Original file line number Diff line number Diff line change
@@ -556,7 +556,7 @@ int VulkanMemoryAllocator::lookup_requirements(void *user_context, size_t size,
#if defined(HL_VK_DEBUG_MEM)
debug(nullptr) << "VulkanMemoryAllocator: Looking up requirements ("
<< "user_context=" << user_context << " "
<< "size=" << (uint32_t)block->size << ", "
<< "size=" << (uint32_t)size << ", "
<< "usage_flags=" << usage_flags << ") ... \n";
#endif
VkBufferCreateInfo create_info = {
@@ -998,7 +998,7 @@ int VulkanMemoryAllocator::conform(void *user_context, MemoryRequest *request) {

#if defined(HL_VK_DEBUG_MEM)
debug(nullptr) << "VulkanMemoryAllocator: Buffer requirements ("
<< "requested_size=" << (uint32_t)region->size << ", "
<< "requested_size=" << (uint32_t)request->size << ", "
<< "required_alignment=" << (uint32_t)memory_requirements.alignment << ", "
<< "required_size=" << (uint32_t)memory_requirements.size << ")\n";
#endif
@@ -1051,7 +1051,7 @@ int VulkanMemoryAllocator::conform_region_request(void *instance_ptr, MemoryRequ
#if defined(HL_VK_DEBUG_MEM)
debug(nullptr) << "VulkanMemoryAllocator: Conforming region request ("
<< "user_context=" << user_context << " "
<< "request=" << (void *)(region) << ") ... \n";
<< "request=" << (void *)(request) << ") ... \n";
#endif

if ((instance->device == nullptr) || (instance->physical_device == nullptr)) {
@@ -1125,6 +1125,9 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg

VkResult result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer);
if (result != VK_SUCCESS) {
debug(user_context) << "VulkanRegionAllocator: Failed to create buffer!\n\t"
<< "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";

// Allocation failed ... collect unused regions and try again ...
instance->collect(user_context);
result = vkCreateBuffer(instance->device, &create_info, instance->alloc_callbacks, buffer);
@@ -1165,12 +1168,9 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg
<< "vkCreateBuffer returned: " << vk_get_error_name(result) << "\n";
return halide_error_code_device_malloc_failed;
}
region->size = create_info.size;
}

#ifdef DEBUG_RUNTIME
debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n";
#endif

RegionAllocator *region_allocator = RegionAllocator::find_allocator(user_context, region);
if (region_allocator == nullptr) {
error(user_context) << "VulkanBlockAllocator: Unable to allocate region! Invalid region allocator!\n";
@@ -1189,6 +1189,10 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg
return halide_error_code_internal_error;
}

#ifdef DEBUG_RUNTIME
debug(nullptr) << "vkCreateBuffer: Created buffer for device region (" << (uint64_t)region->size << " bytes) ...\n";
#endif

// Finally, bind buffer to the device memory
result = vkBindBufferMemory(instance->device, *buffer, *device_memory, region->offset);
if (result != VK_SUCCESS) {
@@ -1197,6 +1201,10 @@ int VulkanMemoryAllocator::allocate_region(void *instance_ptr, MemoryRegion *reg
return halide_error_code_generic_error;
}

#ifdef DEBUG_RUNTIME
debug(nullptr) << "vkBindBufferMemory: Bound memory to device buffer for device region (" << (uint64_t)region->size << " bytes) ...\n";
#endif

region->handle = (void *)buffer;
region->is_owner = true;
instance->region_byte_count += region->size;
453 changes: 331 additions & 122 deletions src/runtime/vulkan_resources.h

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion test/autoschedulers/mullapudi2016/histogram.cpp
Original file line number Diff line number Diff line change
@@ -120,11 +120,17 @@ double run_test(bool auto_schedule) {
}

int main(int argc, char **argv) {
if (get_jit_target_from_environment().arch == Target::WebAssembly) {
Halide::Target target = get_jit_target_from_environment();
if (target.arch == Target::WebAssembly) {
printf("[SKIP] Autoschedulers do not support WebAssembly.\n");
return 0;
}

if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

if (argc != 2) {
fprintf(stderr, "Usage: %s <autoscheduler-lib>\n", argv[0]);
return 1;
5 changes: 5 additions & 0 deletions test/correctness/bool_predicate_cast.cpp
Original file line number Diff line number Diff line change
@@ -8,6 +8,11 @@ int main(int argc, char **argv) {
// Test explicit casting of a predicate to an integer as part of a reduction
// NOTE: triggers a convert_to_bool in Vulkan for a SelectOp
Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

Var x("x"), y("y");

Func input("input");
4 changes: 4 additions & 0 deletions test/correctness/boundary_conditions.cpp
Original file line number Diff line number Diff line change
@@ -392,6 +392,10 @@ int main(int argc, char **argv) {
// The wasm jit is very slow, so shorten this test here.
vector_width_max = 8;
}
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

std::vector<Task> tasks;
for (int vector_width = 1; vector_width <= vector_width_max; vector_width *= 2) {
9 changes: 7 additions & 2 deletions test/correctness/convolution.cpp
Original file line number Diff line number Diff line change
@@ -4,6 +4,13 @@
using namespace Halide;

int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan)) {
if (!target.has_feature(Target::VulkanInt16)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}
}

// int W = 64*3, H = 64*3;
const int W = 128, H = 48;
@@ -64,8 +71,6 @@ int main(int argc, char **argv) {
Func blur2("blur2");
blur2(x, y) = sum(tent(r.x, r.y) * input(x + r.x - 1, y + r.y - 1));

Target target = get_jit_target_from_environment();

if (target.has_gpu_feature()) {
Var xi("xi"), yi("yi");

5 changes: 5 additions & 0 deletions test/correctness/convolution_multiple_kernels.cpp
Original file line number Diff line number Diff line change
@@ -39,6 +39,11 @@ int main(int argc, char **argv) {
sum(cast<uint16_t>(box2(r.x, r.y)) * input(x + r.x, y + r.y));

Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt16))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}

if (target.has_gpu_feature()) {
Var xi("xi"), yi("yi");
blur.gpu_tile(x, y, xi, yi, 16, 16);
4 changes: 4 additions & 0 deletions test/correctness/dilate3x3.cpp
Original file line number Diff line number Diff line change
@@ -27,6 +27,10 @@ int main(int argc, char **argv) {

// Schedule.
Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}
if (target.has_gpu_feature()) {
Var xi("xi"), yi("yi");
dilate3x3.gpu_tile(x, y, xi, yi, 16, 16);
7 changes: 6 additions & 1 deletion test/correctness/gpu_arg_types.cpp
Original file line number Diff line number Diff line change
@@ -3,10 +3,15 @@
using namespace Halide;
int main(int argc, char *argv[]) {

if (!get_jit_target_from_environment().has_gpu_feature()) {
Halide::Target target = get_jit_target_from_environment();
if (!target.has_gpu_feature()) {
printf("[SKIP] No GPU target enabled.\n");
return 0;
}
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt16))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}

Func f, g;
Var x, y, tx, ty;
8 changes: 7 additions & 1 deletion test/correctness/gpu_dynamic_shared.cpp
Original file line number Diff line number Diff line change
@@ -11,14 +11,20 @@ int main(int argc, char **argv) {
}

if (t.has_feature(Target::Vulkan)) {
if (!t.has_feature(Target::VulkanV13)) {
printf("[SKIP] Skipping test for Vulkan ... missing 1.3 feature in target!\n");
return 0;
}

const auto *interface = get_device_interface_for_device_api(DeviceAPI::Vulkan);
assert(interface->compute_capability != nullptr);
int major, minor;
int err = interface->compute_capability(nullptr, &major, &minor);
if (err != 0 || (major == 1 && minor < 3)) {
printf("[SKIP] Vulkan %d.%d is less than required 1.3.\n", major, minor);
printf("[SKIP] Vulkan runtime support %d.%d is less than required 1.3.\n", major, minor);
return 0;
}

if ((t.os == Target::IOS) || (t.os == Target::OSX)) {
printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK doesn't support dynamic LocalSizeId yet)!\n");
return 0;
11 changes: 11 additions & 0 deletions test/correctness/gpu_reuse_shared_memory.cpp
Original file line number Diff line number Diff line change
@@ -172,6 +172,17 @@ int main(int argc, char **argv) {
return 0;
}

if (t.has_feature(Target::Vulkan)) {
if (!t.has_feature(Target::VulkanInt8)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}
if (!t.has_feature(Target::VulkanInt16)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}
}

for (auto memory_type : {MemoryType::GPUShared, MemoryType::Heap}) {
printf("Running multi thread type test\n");
if (multi_thread_type_test(memory_type) != 0) {
8 changes: 7 additions & 1 deletion test/correctness/gpu_transpose.cpp
Original file line number Diff line number Diff line change
@@ -4,11 +4,17 @@
using namespace Halide;

int main(int argc, char **argv) {
if (!get_jit_target_from_environment().has_gpu_feature()) {
Target t = get_jit_target_from_environment();
if (!t.has_gpu_feature()) {
printf("[SKIP] No GPU target enabled.\n");
return 0;
}

if (t.has_feature(Target::Vulkan) && (!t.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

ImageParam in(UInt(8), 2);

Var x, y;
13 changes: 7 additions & 6 deletions test/correctness/interleave_rgb.cpp
Original file line number Diff line number Diff line change
@@ -103,14 +103,15 @@ bool test_deinterleave(int x_stride) {
}

int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();
for (int x_stride : {3, 4}) {
if (!test_interleave<uint8_t>(x_stride)) return 1;
if (!test_interleave<uint16_t>(x_stride)) return 1;
if (!test_interleave<uint32_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint8_t>()) && !test_interleave<uint8_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint16_t>()) && !test_interleave<uint16_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint32_t>()) && !test_interleave<uint32_t>(x_stride)) return 1;

if (!test_deinterleave<uint8_t>(x_stride)) return 1;
if (!test_deinterleave<uint16_t>(x_stride)) return 1;
if (!test_deinterleave<uint32_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint8_t>()) && !test_deinterleave<uint8_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint16_t>()) && !test_deinterleave<uint16_t>(x_stride)) return 1;
if (target.supports_type(halide_type_of<uint32_t>()) && !test_deinterleave<uint32_t>(x_stride)) return 1;
}
printf("Success!\n");
return 0;
5 changes: 5 additions & 0 deletions test/correctness/interleave_x.cpp
Original file line number Diff line number Diff line change
@@ -11,6 +11,11 @@ int main(int argc, char **argv) {
interleaved(x, y) = select(x % 2 == 0, cast<uint16_t>(3), cast<uint16_t>(7));

Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt16))) {
printf("[SKIP] Skipping test for Vulkan ... missing support for Int16!\n");
return 0;
}

if (target.has_gpu_feature()) {
Var tx("tx"), ty("ty");
interleaved.gpu_tile(x, y, tx, ty, 16, 16);
12 changes: 6 additions & 6 deletions test/correctness/logical.cpp
Original file line number Diff line number Diff line change
@@ -13,6 +13,12 @@ Expr u16(Expr a) {

int main(int argc, char **argv) {

Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

Buffer<uint8_t> input(128, 64);

for (int y = 0; y < input.height(); y++) {
@@ -28,7 +34,6 @@ int main(int argc, char **argv) {
((input(x, y) > 40) && (!(input(x, y) > 50))),
u8(255), u8(0));

Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 16, 16);
f.vectorize(xi, 4);
@@ -62,7 +67,6 @@ int main(int argc, char **argv) {
((input(x, y) > 40) && (!common_cond)),
u8(255), u8(0));

Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 16, 16);
f.vectorize(xi, 4);
@@ -93,8 +97,6 @@ int main(int argc, char **argv) {
Func f("f");
f(x, y) = select(x < 10 || x > 20 || y < 10 || y > 20, 0, input(x, y));

Target target = get_jit_target_from_environment();

if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 16, 16);
f.vectorize(xi, 4);
@@ -124,7 +126,6 @@ int main(int argc, char **argv) {
Expr ten = 10;
f(x, y) = select(input(x, y) > ten, u8(255), u8(0));

Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
f.gpu_tile(x, y, xi, yi, 16, 16);
f.vectorize(xi, 4);
@@ -177,7 +178,6 @@ int main(int argc, char **argv) {
cpu.compute_root();
gpu.compute_root();

Target target = get_jit_target_from_environment();
if (target.has_feature(Target::OpenCL) && n == 16 && w == 32) {
// Workaround for https://github.com/halide/Halide/issues/2477
printf("Skipping uint%d -> uint%d for OpenCL\n", n, w);
8 changes: 7 additions & 1 deletion test/correctness/median3x3.cpp
Original file line number Diff line number Diff line change
@@ -13,6 +13,13 @@ Expr mid3(Expr a, Expr b, Expr c) {
}

int main(int arch, char **argv) {

Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && (!target.has_feature(Target::VulkanInt8))) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}

const int W = 256, H = 256;
Buffer<uint8_t> in(W, H);
// Set up the input.
@@ -43,7 +50,6 @@ int main(int arch, char **argv) {
median3x3(x, y) = mid3(min_max(x, y), max_min(x, y), mid_mid(x, y));

// Schedule.
Target target = get_jit_target_from_environment();
if (target.has_gpu_feature()) {
Var xi("xi"), yi("yi");
median3x3.gpu_tile(x, y, xi, yi, 16, 16);
15 changes: 15 additions & 0 deletions test/correctness/mul_div_mod.cpp
Original file line number Diff line number Diff line change
@@ -540,6 +540,21 @@ void add_test_div_mod(int vector_width, ScheduleVariant scheduling, Target targe
int main(int argc, char **argv) {
Target target = get_jit_target_from_environment();

if (target.has_feature(Target::Vulkan)) {
if (!target.has_feature(Target::VulkanInt8)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}
if (!target.has_feature(Target::VulkanInt16)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}
if (!target.has_feature(Target::VulkanInt64)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int64 support!\n");
return 0;
}
}

ScheduleVariant scheduling = CPU;
if (target.has_gpu_feature()) {
scheduling = TiledGPU;
5 changes: 3 additions & 2 deletions test/correctness/multiple_outputs.cpp
Original file line number Diff line number Diff line change
@@ -4,7 +4,8 @@
using namespace Halide;

int main(int argc, char **argv) {
const bool use_gpu = get_jit_target_from_environment().has_gpu_feature();
Target target = get_jit_target_from_environment();
const bool use_gpu = target.has_gpu_feature();

// An internal Func that produces multiple values.
{
@@ -93,7 +94,7 @@ int main(int argc, char **argv) {
}

// Now multiple output Funcs via inferred Realization
{
if (target.supports_type(halide_type_of<uint8_t>()) && target.supports_type(halide_type_of<int16_t>())) {
Func f, g;
Var x, xi;
f(x) = cast<float>(100 * x);
17 changes: 13 additions & 4 deletions test/correctness/widening_reduction.cpp
Original file line number Diff line number Diff line change
@@ -9,11 +9,20 @@ using namespace Halide::Internal;
int main(int arch, char **argv) {

Halide::Target target = get_jit_target_from_environment();
if (target.has_feature(Target::Vulkan) && ((target.os == Target::IOS) || target.os == Target::OSX)) {
printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK fails to convert max/min intrinsics correctly)!\n");
return 0;
if (target.has_feature(Target::Vulkan)) {
if (!target.has_feature(Target::VulkanInt8)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int8 support!\n");
return 0;
}
if (!target.has_feature(Target::VulkanInt16)) {
printf("[SKIP] Skipping test for Vulkan ... missing Int16 support!\n");
return 0;
}
if ((target.os == Target::IOS) || (target.os == Target::OSX)) {
printf("[SKIP] Skipping test for Vulkan on iOS/OSX (MoltenVK fails to convert max/min intrinsics correctly)!\n");
return 0;
}
}

const int W = 256, H = 256;

Buffer<uint8_t> in(W, H);