diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 705a6135a6584..f899f8d24414f 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1713,9 +1713,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
         m_mmqid_wg_denoms = { 128, 64, 1 };
         s_mmqid_wg_denoms = { 128, 64, 1 };
 
-        l_align = 128;
-        m_align = 64;
-        s_align = 32;
+        l_align = 8;
+        m_align = 8;
+        s_align = 8;
     } else {
         // Matrix cores require different warp group sizes
         const uint32_t tm_l = device->coopmat_support ? device->coopmat_m : 4;
@@ -1743,9 +1743,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
         l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 };
         m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 };
         s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 };
-        l_align = 128;
-        m_align = 64;
-        s_align = 32;
+
+        l_align = 8;
+        m_align = 8;
+        s_align = 8;
 
         for (uint32_t i = 0; i < GGML_TYPE_COUNT; ++i) {
             ggml_type t = (ggml_type)i;
@@ -1879,14 +1880,18 @@ static void ggml_vk_load_shaders(vk_device& device) {
     CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl)
 #undef CREATE_FA
 
+    auto const &mm_spec_constants = [&](const std::vector<uint32_t>& constants, uint32_t aligned) -> std::vector<uint32_t> {
+        return {constants[0], constants[1], constants[2], constants[3], aligned};
+    };
+
     // Create 6 variants, {s,m,l}x{unaligned,aligned}
 #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
-        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 0), 1); \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 0), 1); \
+        ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main",
PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 0), 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 1), l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 1), m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 1), s_align); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ @@ -1940,20 +1945,24 @@ static void ggml_vk_load_shaders(vk_device& device) { #endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) #if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->coopmat_support) { + auto const &mm_spec_constants = [&](const std::vector& constants, uint32_t load_vec_shift) -> std::vector { + return {constants[0], constants[1], constants[2], constants[3], constants[4], constants[5], constants[6], constants[7], constants[8], constants[9], constants[10], load_vec_shift, load_vec_shift}; + }; + // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 0), 1, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 0), 1, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 0), 1, false, true); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", 
NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 3), l_align, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 3), m_align, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 3), s_align, false, true); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ @@ -2065,20 +2074,24 @@ static void ggml_vk_load_shaders(vk_device& device) { } else #endif // defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) if (device->fp16) { + auto const &mm_spec_constants = [&](const std::vector& constants, uint32_t load_vec_shift) -> std::vector { + return {constants[0], constants[1], constants[2], constants[3], constants[4], constants[5], constants[6], constants[7], constants[8], constants[9], constants[10], load_vec_shift, load_vec_shift}; + }; + // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, 
device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 3), l_align); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 3), m_align); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 3), s_align); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ @@ -2157,20 +2170,24 @@ static void ggml_vk_load_shaders(vk_device& device) { #undef CREATE_MMQ #undef CREATE_MM } else { + auto const &mm_spec_constants = [&](const std::vector& constants, uint32_t load_vec_shift) -> std::vector { + return {constants[0], constants[1], constants[2], constants[3], constants[4], constants[5], constants[6], constants[7], constants[8], constants[9], constants[10], load_vec_shift, load_vec_shift}; + }; + // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## 
WG_DENOMS, m_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 0), 1); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, mm_spec_constants(l_ ## WARPTILE, 2), l_align); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, mm_spec_constants(m_ ## WARPTILE, 2), m_align); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, mm_spec_constants(s_ ## WARPTILE, 2), s_align); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ @@ -7675,6 +7692,8 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, vk_pipeline p; std::string shname; + const size_t kpad = ggml_vk_align_size(k, p->align); + if (shader_size == 0) { p = fp16acc ? 
pipelines[quant].f16acc->a_s : pipelines[quant].f32acc->a_s;
         shname = std::string(ggml_type_name(quant)) + "_ALIGNED_S";
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.hpp b/ggml/src/ggml-vulkan/ggml-vulkan.hpp
new file mode 100644
index 0000000000000..619fb65d54cfd
--- /dev/null
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.hpp
@@ -0,0 +1,582 @@
+#ifndef GGML_VULKAN_HPP
+#define GGML_VULKAN_HPP
+
+#include <vulkan/vulkan.hpp>
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+
+#include "ggml-vulkan-matmul.hpp"
+
+#define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1))
+#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
+
+#define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
+#define VK_VENDOR_ID_INTEL 0x8086
+#define VK_VENDOR_ID_NVIDIA 0x10de
+
+#define VK_DEVICE_DESCRIPTOR_POOL_SIZE 32
+
+#define GGML_VK_MAX_NODES 8192
+
+#define MAX_VK_BUFFERS 256
+
+#define VK_CHECK(err, msg) \
+    do { \
+        vk::Result err_ = (err); \
+        if (err_ != vk::Result::eSuccess) { \
+            fprintf(stderr, "ggml_vulkan: %s error %s at %s:%d\n", \
+                #err, to_string(err_).c_str(), __FILE__, __LINE__); \
+            exit(1); \
+        } \
+    } while (0)
+
+#ifdef GGML_VULKAN_DEBUG
+#define VK_LOG_DEBUG(msg) std::cerr << msg << std::endl
+#else
+#define VK_LOG_DEBUG(msg) ((void) 0)
+#endif // GGML_VULKAN_DEBUG
+
+struct ggml_backend_vk_context;
+
+struct vk_queue {
+    uint32_t queue_family_index;
+    vk::Queue queue;
+    vk::CommandPool pool;
+    uint32_t cmd_buffer_idx;
+    std::vector<vk::CommandBuffer> cmd_buffers;
+
+    vk::PipelineStageFlags stage_flags;
+
+    bool transfer_only;
+};
+
+struct vk_pipeline_struct {
+    std::string name;
+    vk::ShaderModule shader_module;
+    vk::DescriptorSetLayout dsl;
+    std::vector<vk::DescriptorPool> descriptor_pools;
+    std::vector<vk::DescriptorSet> descriptor_sets;
+    uint32_t descriptor_set_idx;
+    vk::PipelineLayout layout;
+    vk::Pipeline pipeline;
+    uint32_t push_constant_size;
+    uint32_t parameter_count;
+    std::array<uint32_t, 3> wg_denoms;
+    uint32_t align;
+    // set to true to request the pipeline is compiled after the dryrun
+    bool needed {};
+    // set to true when the shader has been compiled
+    bool compiled {};
+};
+
+typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline;
+typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref;
+
+struct vk_device_struct;
+typedef std::shared_ptr<vk_device_struct> vk_device;
+typedef std::weak_ptr<vk_device_struct> vk_device_ref;
+
+struct vk_buffer_struct;
+typedef std::shared_ptr<vk_buffer_struct> vk_buffer;
+typedef std::weak_ptr<vk_buffer_struct> vk_buffer_ref;
+
+struct ggml_backend_vk_buffer_type_context {
+    std::string name;
+    vk_device device;
+};
+
+enum vk_device_architecture {
+    OTHER,
+    AMD_GCN,
+    AMD_RDNA1,
+    AMD_RDNA2,
+    AMD_RDNA3,
+};
+
+static constexpr uint32_t mul_mat_vec_max_cols = 8;
+
+struct vk_device_struct {
+    std::mutex mutex;
+
+    vk::PhysicalDevice physical_device;
+    vk::PhysicalDeviceProperties properties;
+    std::string name;
+    uint64_t max_memory_allocation_size;
+    uint64_t suballocation_block_size;
+    bool fp16;
+    bool pipeline_robustness;
+    vk::Device device;
+    uint32_t vendor_id;
+    vk_device_architecture architecture;
+    vk_queue compute_queue;
+    vk_queue transfer_queue;
+    bool single_queue;
+    uint32_t subgroup_size;
+    uint32_t shader_core_count;
+    bool uma;
+    bool prefer_host_memory;
+    bool float_controls_rte_fp16;
+
+    bool subgroup_size_control;
+    uint32_t subgroup_min_size;
+    uint32_t subgroup_max_size;
+    bool subgroup_require_full_support;
+
+    bool coopmat_support;
+    bool coopmat_acc_f32_support;
+    bool coopmat_acc_f16_support;
+    uint32_t coopmat_m;
+    uint32_t coopmat_n;
+    uint32_t coopmat_k;
+    bool coopmat2;
+
+    size_t idx;
+
+    bool mul_mat_l[GGML_TYPE_COUNT];
+    bool
mul_mat_m[GGML_TYPE_COUNT]; + bool mul_mat_s[GGML_TYPE_COUNT]; + bool mul_mat_id_l[GGML_TYPE_COUNT]; + bool mul_mat_id_m[GGML_TYPE_COUNT]; + bool mul_mat_id_s[GGML_TYPE_COUNT]; + + // set to true to indicate that some shaders need to be compiled after the dryrun + bool need_compiles {}; + + vk_matmul_pipeline pipeline_matmul_f32 {}; + vk_matmul_pipeline pipeline_matmul_f32_f16 {}; + vk_matmul_pipeline2 pipeline_matmul_f16; + vk_matmul_pipeline2 pipeline_matmul_f16_f32; + vk_pipeline pipeline_matmul_split_k_reduce; + + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_f16[GGML_TYPE_COUNT]; + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat[GGML_TYPE_COUNT]; + + vk_matmul_pipeline pipeline_matmul_id_f32 {}; + vk_matmul_pipeline2 pipeline_matmul_id_f16; + vk_matmul_pipeline2 pipeline_matmul_id_f16_f32; + + vk_matmul_pipeline2 pipeline_dequant_mul_mat_mat_id[GGML_TYPE_COUNT]; + + vk_pipeline pipeline_dequant[GGML_TYPE_COUNT]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_COUNT][mul_mat_vec_max_cols]; + vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_COUNT]; + + vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; + vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; + vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; + vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_acc_f32; + vk_pipeline pipeline_add_f32, pipeline_add_f32_norepeat; + vk_pipeline pipeline_add_f16_f32_f16, pipeline_add_f16_f32_f16_norepeat; + vk_pipeline pipeline_sub_f32, pipeline_sub_f32_norepeat; + vk_pipeline pipeline_mul_f32, pipeline_mul_f32_norepeat; + vk_pipeline pipeline_div_f32, pipeline_div_f32_norepeat; + vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; + vk_pipeline pipeline_upscale_f32; + vk_pipeline pipeline_scale_f32; + vk_pipeline pipeline_sqr_f32; + vk_pipeline pipeline_sin_f32; + vk_pipeline pipeline_cos_f32; + vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_pad_f32; + vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; + vk_pipeline pipeline_cpy_f32_quant[GGML_TYPE_COUNT]; + vk_pipeline pipeline_cpy_quant_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_group_norm_f32; + vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_rms_norm_back_f32; + vk_pipeline pipeline_l2_norm_f32; + vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_gelu_quick_f32; + vk_pipeline pipeline_silu_f32; + vk_pipeline pipeline_silu_back_f32; + vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_leaky_relu_f32; + vk_pipeline pipeline_tanh_f32; + vk_pipeline pipeline_sigmoid_f32; + vk_pipeline pipeline_diag_mask_inf_f32; + vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; + vk_pipeline pipeline_soft_max_f32_wg512, pipeline_soft_max_f32_f16_wg512; + vk_pipeline pipeline_soft_max_back_f32; + vk_pipeline pipeline_rope_norm_f32, pipeline_rope_norm_f16; + vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; + vk_pipeline pipeline_rope_multi_f32, pipeline_rope_multi_f16; + vk_pipeline pipeline_rope_vision_f32, pipeline_rope_vision_f16; + vk_pipeline pipeline_argsort_f32; + vk_pipeline pipeline_sum_rows_f32; + vk_pipeline pipeline_argmax_f32; + vk_pipeline pipeline_count_equal_i32; + vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; + 
vk_pipeline pipeline_timestep_embedding_f32; + vk_pipeline pipeline_pool2d_f32; + vk_pipeline pipeline_rwkv_wkv6_f32; + vk_pipeline pipeline_rwkv_wkv7_f32; + vk_pipeline pipeline_opt_step_adamw_f32; + + // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} + vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D80[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D96[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + + std::unordered_map pipelines; + std::unordered_map pipeline_descriptor_set_requirements; + + std::vector> pinned_memory; + + vk::Fence fence; + vk_buffer sync_staging; + + ggml_backend_buffer_type buffer_type; + +#ifdef GGML_VULKAN_MEMORY_DEBUG + std::unique_ptr memory_logger; +#endif +#ifdef GGML_VULKAN_PERF + std::unique_ptr perf_logger; +#endif + + ~vk_device_struct(); +}; + +struct vk_buffer_struct { + vk::Buffer buffer = VK_NULL_HANDLE; + vk::DeviceMemory device_memory = VK_NULL_HANDLE; + vk::MemoryPropertyFlags memory_property_flags; + void * ptr; + size_t size = 0; + + vk_device device; + + ~vk_buffer_struct(); +}; + +struct vk_subbuffer { + vk_buffer buffer; + uint64_t offset; + uint64_t size; + + operator vk::DescriptorBufferInfo() const { + return { buffer->buffer, offset, size }; + } +}; + +struct vk_semaphore { + vk::Semaphore s; + uint64_t value; +}; + +struct vk_submission { + vk::CommandBuffer buffer; + std::vector wait_semaphores; + std::vector signal_semaphores; +}; + +typedef std::vector vk_sequence; + +struct vk_mat_mat_push_constants { + uint32_t M; uint32_t N; uint32_t K; + uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t k_split; + uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t padded_N; +}; +struct vk_mat_vec_push_constants { + uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; +}; + +struct vk_mat_mat_id_push_constants { + uint32_t M; uint32_t N; uint32_t K; + uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t nei0; uint32_t nei1; uint32_t nbi1; uint32_t ne11; + uint32_t padded_N; +}; +struct vk_mat_vec_id_push_constants { + uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t nei0; uint32_t ne11; +}; + +struct vk_flash_attn_push_constants { + uint32_t N; + uint32_t KV; + + uint32_t ne1; + uint32_t ne2; + uint32_t ne3; + + uint32_t neq2; + uint32_t neq3; + uint32_t nek2; + uint32_t nek3; + uint32_t nev2; + uint32_t nev3; + uint32_t nem1; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t nb31; + + float scale; + float max_bias; + float logit_softcap; + + uint32_t mask; + uint32_t n_head_log2; + float m0; + float m1; +}; + +struct vk_op_push_constants { + uint32_t KX; + uint32_t KY; + float param1; + float param2; +}; + +struct 
vk_op_unary_push_constants { + uint32_t ne; + uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t misalign_offsets; + float param1; float param2; + uint32_t ne0_012mp; uint32_t ne0_012L; + uint32_t ne0_01mp; uint32_t ne0_01L; + uint32_t ne0_0mp; uint32_t ne0_0L; + uint32_t ne1_012mp; uint32_t ne1_012L; + uint32_t ne1_01mp; uint32_t ne1_01L; + uint32_t ne1_0mp; uint32_t ne1_0L; +}; +static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); + +struct vk_op_binary_push_constants { + uint32_t ne; + uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; + uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; uint32_t nb20; uint32_t nb21; uint32_t nb22; uint32_t nb23; + uint32_t misalign_offsets; + float param1; float param2; int32_t param3; +}; + +struct vk_op_diag_mask_push_constants { + uint32_t ncols; + uint32_t rows_per_channel; + int32_t n_past; +}; + +struct vk_op_rope_push_constants { + uint32_t ncols; + uint32_t n_dims; + float freq_scale; + uint32_t p_delta_rows; + float freq_base; + float ext_factor; + float attn_factor; + float corr_dims[2]; + float theta_scale; + uint32_t has_ff; + uint32_t ne02; + uint32_t s1; + uint32_t s2; + int32_t sections[4]; + uint32_t is_back; +}; + +struct vk_op_soft_max_push_constants { + uint32_t KX; + uint32_t KY; + float scale; + float max_bias; + float m0; + float m1; + uint32_t n_head_log2; + uint32_t nrows_x; +}; + +struct vk_op_argsort_push_constants { + uint32_t ncols; + uint32_t ncols_pad; + int32_t order; +}; + +struct vk_op_im2col_push_constants { + uint32_t batch_offset; uint32_t offset_delta; + uint32_t IC; + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t KW; uint32_t KH; + uint32_t pelements; + uint32_t CHW; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; + int32_t d0; int32_t d1; +}; + +struct vk_op_timestep_embedding_push_constants { + uint32_t nb1; + uint32_t dim; + uint32_t max_period; +}; + +struct vk_op_pool2d_push_constants { + uint32_t IW; uint32_t IH; + uint32_t OW; uint32_t OH; + uint32_t OC; + uint32_t pelements; + uint32_t op; + int32_t k0; int32_t k1; + int32_t s0; int32_t s1; + int32_t p0; int32_t p1; +}; + +struct vk_op_rwkv_wkv6_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + +struct vk_op_rwkv_wkv7_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + +// Allow pre-recording command buffers +struct vk_staging_memcpy { + vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} + + void * dst; + const void * src; + size_t n; +}; + +struct vk_op_upscale_push_constants { + uint32_t ne; uint32_t a_offset; uint32_t d_offset; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; + uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; + float sf0; float sf1; float sf2; float sf3; +}; + +struct vk_context_struct { + vk_submission * s; + std::vector seqs; + + int exit_tensor_idx; + + std::vector in_memcpys; + std::vector out_memcpys; + + vk_queue * q; +}; +typedef std::shared_ptr vk_context; +typedef std::weak_ptr vk_context_ref; + +struct ggml_vk_garbage_collector { + 
std::vector tl_semaphores; + std::vector semaphores; + std::vector events; + std::vector temp_buffers; + std::vector contexts; +}; + +#if defined(GGML_VULKAN_MEMORY_DEBUG) || defined(GGML_VULKAN_DEBUG) +#define VK_LOG_MEMORY(msg) std::cerr << "ggml_vulkan memory: " << msg << std::endl + +static std::mutex log_mutex; + +class vk_memory_logger { +public: + vk_memory_logger(): total_device(0), total_host(0) {} + void log_allocation(vk_buffer_ref buf_ref, size_t size); + void log_deallocation(vk_buffer_ref buf_ref); + +private: + std::map allocations; // Track allocations + size_t total_device; + size_t total_host; +}; +#else +#define VK_LOG_MEMORY(msg) ((void) 0) +#endif // GGML_VULKAN_MEMORY_DEBUG + +#if defined(GGML_VULKAN_PERF) + +class vk_perf_logger { +public: + void print_timings(); + + void log_timing(const ggml_tensor * node, uint64_t time); +private: + std::map> timings; +}; +#endif // GGML_VULKAN_PERF + +struct ggml_backend_vk_context { + std::string name; + + vk_device device; + + size_t semaphore_idx, event_idx; + ggml_vk_garbage_collector gc; + size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k; + vk_buffer prealloc_x, prealloc_y, prealloc_split_k; + vk::Fence fence; + + vk_buffer buffer_pool[MAX_VK_BUFFERS]; + + vk_context_ref compute_ctx; + vk_context_ref transfer_ctx; + + std::vector tensor_ctxs; +}; + +static void * const vk_ptr_base = (void *)(uintptr_t) 0x1000; // NOLINT + +static uint64_t vk_tensor_offset(const ggml_tensor * tensor); + +struct ggml_backend_vk_buffer_context { + vk_device_ref device; + vk_buffer dev_buffer; + std::string name; + + ggml_backend_vk_buffer_context(vk_device_ref device, vk_buffer&& dev_buffer, std::string& name); + + ~ggml_backend_vk_buffer_context(); +}; + +struct vk_instance_t { + vk::Instance instance; + + std::vector device_indices; + vk_device devices[GGML_VK_MAX_DEVICES]; +}; + +#endif // GGML_VULKAN_HPP diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index 23ce8ceec332b..94a93da55c75c 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -22,16 +22,15 @@ #include "types.comp" -#ifndef LOAD_VEC_A -#define LOAD_VEC_A 1 -#endif -#ifndef LOAD_VEC_B -#define LOAD_VEC_B 1 -#endif - layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +#if defined(A_TYPE_VEC4) +layout (binding = 0) readonly buffer A_VEC4 {A_TYPE_VEC4 data_a_vec4[];}; +#endif +#if defined(FLOAT16) && defined(A_TYPE_VEC8) +layout (binding = 0) readonly buffer A_VEC8 {A_TYPE_VEC8 data_a_vec8[];}; +#endif #if defined(A_TYPE_PACKED16) layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; #endif @@ -40,7 +39,14 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32 #endif layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 1) readonly buffer B_VEC4 {B_TYPE_VEC4 data_b_vec4[];}; +#if defined(B_TYPE_VEC8) +layout (binding = 1) readonly buffer B_VEC8 {B_TYPE_VEC8 data_b_vec8[];}; +#endif + layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +layout (binding = 2) writeonly buffer D_VEC2 {D_TYPE_VEC2 data_d_vec2[];}; +layout (binding = 2) writeonly buffer D_VEC4 {D_TYPE_VEC4 data_d_vec4[];}; #ifdef MUL_MAT_ID layout (binding = 3) readonly buffer IDS {int data_ids[];}; @@ -84,15 +90,19 @@ layout (constant_id = 7) const uint TM = 4; layout (constant_id = 8) const uint TN = 2; layout 
(constant_id = 9) const uint TK = 1; // Only needed for coopmat layout (constant_id = 10) const uint WARP = 32; +#if !defined(LOAD_VEC_A_SHIFT) +layout (constant_id = 11) const uint LOAD_VEC_A_SHIFT = 0; +#endif +layout (constant_id = 12) const uint LOAD_VEC_B_SHIFT = 0; #ifdef COOPMAT #define SHMEM_STRIDE (BK + 8) #else -#define SHMEM_STRIDE (BK + 1) +#define SHMEM_STRIDE (BK / 2 + 1) #endif -shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE]; -shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; +shared FLOAT_TYPE_VEC2 buf_a[BM * SHMEM_STRIDE]; +shared FLOAT_TYPE_VEC2 buf_b[BN * SHMEM_STRIDE]; #ifdef MUL_MAT_ID shared u16vec2 row_ids[3072]; @@ -104,6 +114,8 @@ shared u16vec2 row_ids[3072]; shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; #endif +#include "mul_mm_funcs.comp" + void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); @@ -155,13 +167,21 @@ void main() { const uint warp_r = warp_i % (BM / WM); const uint warp_c = warp_i / (BM / WM); - const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A); - const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A); - const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B); - const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B); +#ifdef MUL_MAT_ID + const uint start_k = 0; + const uint end_k = p.K; +#else + const uint start_k = ik * p.k_split; + const uint end_k = min(p.K, (ik + 1) * p.k_split); +#endif + + const uint loadr_a = gl_LocalInvocationID.x % (BK >> LOAD_VEC_A_SHIFT); + const uint loadc_a = gl_LocalInvocationID.x / (BK >> LOAD_VEC_A_SHIFT); + const uint loadr_b = gl_LocalInvocationID.x % (BK >> LOAD_VEC_B_SHIFT); + const uint loadc_b = gl_LocalInvocationID.x / (BK >> LOAD_VEC_B_SHIFT); - const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A / BK; - const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK; + const uint loadstride_a = (gl_WorkGroupSize.x << LOAD_VEC_A_SHIFT) / BK; + const uint loadstride_b = (gl_WorkGroupSize.x << LOAD_VEC_B_SHIFT) / BK; #ifdef MUL_MAT_ID uint _ne1 = 0; @@ -180,25 +200,17 @@ void main() { if (ic * BN >= _ne1) return; #endif -#ifdef MUL_MAT_ID - const uint start_k = 0; - const uint end_k = p.K; -#else - const uint start_k = ik * p.k_split; - const uint end_k = min(p.K, (ik + 1) * p.k_split); -#endif - uint pos_a = ( #ifdef MUL_MAT_ID expert_idx * p.batch_stride_a + #else batch_idx_a * p.batch_stride_a + #endif - ir * BM * p.stride_a + start_k) / LOAD_VEC_A; + ir * BM * p.stride_a + start_k) >> LOAD_VEC_A_SHIFT; #ifdef MUL_MAT_ID uint pos_b = 0; #else - uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; + uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) >> LOAD_VEC_B_SHIFT; #endif #ifdef COOPMAT @@ -211,8 +223,8 @@ void main() { } #else ACC_TYPE sums[WMITER * TM * WNITER * TN]; - FLOAT_TYPE cache_a[WMITER * TM]; - FLOAT_TYPE cache_b[TN]; + FLOAT_TYPE_VEC2 cache_a[WMITER * TM]; + FLOAT_TYPE_VEC2 cache_b[TN]; [[unroll]] for (uint i = 0; i < WMITER*TM*WNITER*TN; i++) { sums[i] = ACC_TYPE(0.0f); @@ -221,505 +233,20 @@ void main() { for (uint block = start_k; block < end_k; block += BK) { [[unroll]] for (uint l = 0; l < BM; l += loadstride_a) { - -#if defined(DATA_A_F32) || defined(DATA_A_F16) -#if LOAD_VEC_A == 8 - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx][0].x); - buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx][0].y); - buf_a[buf_idx + 2] = 
FLOAT_TYPE(data_a[idx][0].z); - buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx][0].w); - buf_a[buf_idx + 4] = FLOAT_TYPE(data_a[idx][1].x); - buf_a[buf_idx + 5] = FLOAT_TYPE(data_a[idx][1].y); - buf_a[buf_idx + 6] = FLOAT_TYPE(data_a[idx][1].z); - buf_a[buf_idx + 7] = FLOAT_TYPE(data_a[idx][1].w); -#elif LOAD_VEC_A == 4 - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - buf_a[buf_idx ] = FLOAT_TYPE(data_a[idx].x); - buf_a[buf_idx + 1] = FLOAT_TYPE(data_a[idx].y); - buf_a[buf_idx + 2] = FLOAT_TYPE(data_a[idx].z); - buf_a[buf_idx + 3] = FLOAT_TYPE(data_a[idx].w); -#else - if (ir * BM + loadc_a + l < p.M && block + loadr_a < end_k) { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(data_a[pos_a + (loadc_a + l) * p.stride_a + loadr_a]); - } else { - buf_a[(loadc_a + l) * SHMEM_STRIDE + loadr_a] = FLOAT_TYPE(0.0f); - } -#endif -#elif defined(DATA_A_Q4_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a; - - const uint ib = idx / 4; - const uint iqs = idx & 0x03; - - const float d = float(data_a_packed16[ib].d); - const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); - const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d; - const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v0.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); - buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); - buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); - buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); - buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); - buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); - buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); -#elif defined(DATA_A_Q4_1) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a; - - const uint ib = idx / 4; - const uint iqs = idx & 0x03; - - const float d = float(data_a_packed16[ib].d); - const float m = float(data_a_packed16[ib].m); - const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); - const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m; - const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m; - - buf_a[buf_idx ] = FLOAT_TYPE(v0.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y); - buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z); - buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w); - buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x); - buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y); - buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z); - buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w); -#elif defined(DATA_A_Q5_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]); - const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); - const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); - - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); - 
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q5_1) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const float m = float(data_a_packed16[ib].m); - const uint uint_qh = data_a_packed16[ib].qh; - const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); - const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); - - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z); - buf_a[buf_idx + 16] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 17] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q8_0) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const float d = float(data_a_packed16[ib].d); - const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147 - const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy; - const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d; - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); - buf_a[buf_idx + 2] = FLOAT_TYPE(v.z); - buf_a[buf_idx + 3] = FLOAT_TYPE(v.w); -#elif defined(DATA_A_Q2_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30 - const uint scalesi = iqs / 8; // 0..15 - const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 - - const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]); - const uint scales = data_a[ib].scales[scalesi]; - const vec2 d = vec2(data_a[ib].d); - - const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_Q3_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 64; // 0,1 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 - const uint hmi = (iqs % 16) * 2; // 0,2,4..30 - const uint j = (iqs % 64) / 4; // 0..3 - const uint is = iqs / 8; // 0..15 - const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3 - const uint qsshift = halfsplit * 2; // 0,2,4,6 - const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 - - const int8_t us = int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF) - | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4)); - const float dl = float(data_a[ib].d) * float(us - 32); - - buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 
0 : 4))); - buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); -#elif defined(DATA_A_Q4_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 32; // 0,1,2,3 - const uint b = (iqs % 32) / 16; // 0,1 - const uint is = 2 * n + b; // 0..7 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 - - const vec2 loadd = vec2(data_a[ib].d); - - const uint scidx0 = (is < 4) ? is : (is + 4); - const uint scidx1 = (is < 4) ? is : (is - 4); - const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint scidxshift1 = (is < 4) ? 0 : 2; - const uint mbidx0 = is + 4; - const uint mbidx1 = (is < 4) ? is + 4 : is; - const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; - const uint mbidxshift0 = (is < 4) ? 0 : 4; - const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint mbidxshift1 = (is < 4) ? 0 : 2; - - const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); - const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); - - const float d = loadd.x * sc; - const float m = -loadd.y * mbyte; - - buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m)); - buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); -#elif defined(DATA_A_Q5_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 32; // 0,1,2,3 - const uint b = (iqs % 32) / 16; // 0,1 - const uint is = 2 * n + b; // 0..7 - const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 - const uint qhi = (iqs % 16) * 2; // 0,2,4..30 - - const uint8_t hm = uint8_t(1 << (iqs / 16)); - - const vec2 loadd = vec2(data_a[ib].d); - - const uint scidx0 = (is < 4) ? is : (is + 4); - const uint scidx1 = (is < 4) ? is : (is - 4); - const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint scidxshift1 = (is < 4) ? 0 : 2; - const uint mbidx0 = is + 4; - const uint mbidx1 = (is < 4) ? is + 4 : is; - const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; - const uint mbidxshift0 = (is < 4) ? 0 : 4; - const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; - const uint mbidxshift1 = (is < 4) ? 0 : 2; - - const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); - const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); - - const float d = loadd.x * sc; - const float m = -loadd.y * mbyte; - - buf_a[buf_idx ] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m)); - buf_a[buf_idx + 1] = FLOAT_TYPE(fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 
16 : 0), m)); -#elif defined(DATA_A_Q6_K) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = idx % 128; // 0..127 - - const uint n = iqs / 64; // 0,1 - const uint b = (iqs % 64) / 32; // 0,1 - const uint is_b = (iqs % 16) / 8; // 0,1 - const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 - const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 - const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 - - const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); - - buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); - buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); -#elif defined(DATA_A_IQ1_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx % 128) / 4; - const int i8 = 2 * int(idx % 4); - - const float d = float(data_a[ib].d); - const uint qh = data_a[ib].qh[ib32]; - const uint qs = data_a[ib].qs[ib8]; - const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1); - const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; - const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); - - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ1_M) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; - const uint ib16 = ib8 / 2; - const int i8 = 2 * int(idx % 4); - - const uint16_t[4] scales = data_a[ib].scales; - const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; - const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x); - const uint sc = scales[ib8 / 8]; - const uint qs = data_a[ib].qs[ib8]; - const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1)); - const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); - const float delta = ((qh & 8) != 0) ? 
-IQ1M_DELTA : IQ1M_DELTA; - const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); - const ivec2 gvec = ivec2( - bitfieldExtract(grid, 2 * (i8), 2), - bitfieldExtract(grid, 2 * (i8 + 1), 2) - ); - const vec2 v = dl * (vec2(gvec) + delta); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ2_XXS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[8 * ib32 + ib8]; - const uint signs = pack32(u8vec4( - data_a[ib].qs[8*ib32 + 4], - data_a[ib].qs[8*ib32 + 5], - data_a[ib].qs[8*ib32 + 6], - data_a[ib].qs[8*ib32 + 7] - )); - const float db = d * 0.25 * (0.5 + (signs >> 28)); - const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ2_XS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint ib8 = (idx / 4) % 4; // 0..3 - - const float d = float(data_a[ib].d); - const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const float db = d * 0.25 * (0.5 + scale); - const uint qs = data_a[ib].qs[4 * ib32 + ib8]; - const uint sign7 = qs >> 9; - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ2_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib8 = (idx % 128) / 4; // 0..31 - const uint ib32 = ib8 / 4; // 0..7 - - const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; - const uint qs = data_a[ib].qs[ib8]; - const uint qh = data_a[ib].qh[ib32]; - const uint qhshift = 2 * (ib8 % 4); - const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4)); - - const float d = float(data_a[ib].d); - const float db = d * 0.25 * (0.5 + scale); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1]; - const vec2 v = db * vec2(sign01) * vec2(unpack8(uint32_t(grid)).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ3_XXS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * 
SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 - const uint is = QUANT_K / 4 + 4 * (iqs / 8); // 8 values - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[iqs]; - const uint signs = pack32(u8vec4( - data_a[ib].qs[is+0], - data_a[ib].qs[is+1], - data_a[ib].qs[is+2], - data_a[ib].qs[is+3] - )); - const float db = d * 0.5 * (0.5 + (signs >> 28)); - const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); - const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); - const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ3_S) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint iqs = (idx % 128) / 2; // 0..63 - const uint iqh = iqs / 8; - - const float d = float(data_a[ib].d); - const uint qs = data_a[ib].qs[iqs]; - const uint qh = data_a[ib].qh[iqh]; - const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4))); - const uint scale = data_a[ib].scales[iqs / 16]; - const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); - const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); - const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2)); - const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ4_XS) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A; - - const uint ib = idx / 128; // 2 values per idx - const uint ib32 = (idx % 128) / 16; // 0..7 - const uint iq = 16 * ib32 + 2 * (idx % 8); - - const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF; - const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3; - const uint qshift = (idx & 8) >> 1; - u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]); - qs = (qs >> qshift) & uint8_t(0xF); - - const float d = float(data_a[ib].d); - const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]); - - buf_a[buf_idx ] = FLOAT_TYPE(v.x); - buf_a[buf_idx + 1] = FLOAT_TYPE(v.y); -#elif defined(DATA_A_IQ4_NL) - const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; - const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a; - - const uint ib = idx / 8; - const uint iqs = idx & 0x07; - - const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d); - const uint vui = uint(data_a_packed16[ib].qs[iqs]); - - buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d; - buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d; - buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d; - buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d; -#endif + load_a_to_shmem(pos_a, loadr_a, loadc_a + l, ir * BM + loadc_a + l, block + loadr_a, end_k); } [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { -#if LOAD_VEC_B == 8 -#ifdef MUL_MAT_ID - const u16vec2 
row_idx = row_ids[ic * BN + loadc_b + l]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; +#if !defined(MUL_MAT_ID) + load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic * BN + loadc_b + l, block + loadr_b, end_k); #else - const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; -#endif - const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; - buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx][0].x); - buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx][0].y); - buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx][0].z); - buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx][0].w); - buf_b[buf_idx + 4] = FLOAT_TYPE(data_b[idx][1].x); - buf_b[buf_idx + 5] = FLOAT_TYPE(data_b[idx][1].y); - buf_b[buf_idx + 6] = FLOAT_TYPE(data_b[idx][1].z); - buf_b[buf_idx + 7] = FLOAT_TYPE(data_b[idx][1].w); -#elif LOAD_VEC_B == 4 -#ifdef MUL_MAT_ID - const u16vec2 row_idx = row_ids[ic * BN + loadc_b + l]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + loadr_b; -#else - const uint idx = pos_b + (loadc_b + l) * p.stride_b / LOAD_VEC_B + loadr_b; -#endif - const uint buf_idx = (loadc_b + l) * SHMEM_STRIDE + loadr_b * LOAD_VEC_B; - buf_b[buf_idx + 0] = FLOAT_TYPE(data_b[idx].x); - buf_b[buf_idx + 1] = FLOAT_TYPE(data_b[idx].y); - buf_b[buf_idx + 2] = FLOAT_TYPE(data_b[idx].z); - buf_b[buf_idx + 3] = FLOAT_TYPE(data_b[idx].w); -#elif !MUL_MAT_ID - if (ic * BN + loadc_b + l < p.N && block + loadr_b < end_k) { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + (loadc_b + l) * p.stride_b + loadr_b]); - } else { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); - } -#else - const uint row_i = ic * BN + loadc_b + l; - if (row_i < _ne1) { - const u16vec2 row_idx = row_ids[row_i]; - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + loadr_b]); - } else { - buf_b[(loadc_b + l) * SHMEM_STRIDE + loadr_b] = FLOAT_TYPE(0.0f); - } + load_b_to_shmem(pos_b, loadr_b, loadc_b + l, ic, _ne1); #endif } barrier(); - pos_a += BK / LOAD_VEC_A; - pos_b += BK / LOAD_VEC_B; + pos_a += BK >> LOAD_VEC_A_SHIFT; + pos_b += BK >> LOAD_VEC_B_SHIFT; #ifdef COOPMAT [[unroll]] for (uint i = 0; i < BK; i += TK) { @@ -735,7 +262,7 @@ void main() { } } #else - [[unroll]] for (uint i = 0; i < BK; i++) { + [[unroll]] for (uint i = 0; i < BK / 2; i++) { // Load from shared into cache [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { [[unroll]] for (uint j = 0; j < TM; j++) { @@ -751,7 +278,7 @@ void main() { [[unroll]] for (uint cc = 0; cc < TN; cc++) { [[unroll]] for (uint cr = 0; cr < TM; cr++) { const uint sums_idx = (wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr; - sums[sums_idx] = fma(ACC_TYPE(cache_a[wsir * TM + cr]), ACC_TYPE(cache_b[cc]), sums[sums_idx]); + sums[sums_idx] += dot(ACC_TYPE_VEC2(cache_a[wsir * TM + cr]), ACC_TYPE_VEC2(cache_b[cc])); } } } @@ -816,6 +343,48 @@ void main() { } } #endif // MUL_MAT_ID +#else +#if !defined(MUL_MAT_ID) + // No boundary checks needed in entire subgroup + if (dr + (warp_r + 1) * WM <= p.M && dc + (warp_c + 1) * WN <= p.N) { + [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { + [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { + + const uint dr_warp = dr + wsir * WSUBM + tiwr * TM; + const uint dc_warp = dc + wsic * WSUBN + tiwc * TN; + [[unroll]] for (uint cc = 0; cc < TN; cc++) { +#if TM == 4 + 
data_d[(offsets + (dc_warp + cc) * p.stride_d + dr_warp) / 4] = D_TYPE_VEC4(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM ], + sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + 1], + sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + 2], + sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + 3]); +#elif TM == 2 + data_d[(offsets + (dc_warp + cc) * p.stride_d + dr_warp) / 2] = D_TYPE_VEC2(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM ], + sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + 1]); +#else + [[unroll]] for (uint cr = 0; cr < TM; cr++) { + data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); + } +#endif + } + } + } + } else { + [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { + [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { + + const uint dr_warp = dr + wsir * WSUBM + tiwr * TM; + const uint dc_warp = dc + wsic * WSUBN + tiwc * TN; + [[unroll]] for (uint cc = 0; cc < TN; cc++) { + [[unroll]] for (uint cr = 0; cr < TM; cr++) { + if (dr_warp + cr < p.M && dc_warp + cc < p.N) { + data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); + } + } + } + } + } + } #else [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { @@ -823,23 +392,16 @@ void main() { const uint dr_warp = dr + wsir * WSUBM + tiwr * TM; const uint dc_warp = dc + wsic * WSUBN + tiwc * TN; [[unroll]] for (uint cc = 0; cc < TN; cc++) { -#ifdef MUL_MAT_ID const uint row_i = dc_warp + cc; if (row_i >= _ne1) break; const u16vec2 row_idx = row_ids[row_i]; -#endif // MUL_MAT_ID [[unroll]] for (uint cr = 0; cr < TM; cr++) { -#ifdef MUL_MAT_ID data_d[row_idx.y * p.batch_stride_d + row_idx.x * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); -#else - if (dr_warp + cr < p.M && dc_warp + cc < p.N) { - data_d[offsets + (dc_warp + cc) * p.stride_d + dr_warp + cr] = D_TYPE(sums[(wsic * TN + cc) * (WMITER * TM) + wsir * TM + cr]); - } -#endif // MUL_MAT_ID } } } } +#endif // MUL_MAT_ID #endif // COOPMAT } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 7649febb07193..928be1fd6a33f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -27,6 +27,8 @@ layout (constant_id = 4) const bool enable_smaller_matrices = false; const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN; const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN; +layout (constant_id = 5) const bool aligned = false; + layout (push_constant) uniform parameter { uint M; @@ -186,12 +188,12 @@ void main() { // Hint to the compiler that values are aligned (want 16B alignment). // Quants are always block-aligned, no alignment needed. 
-#if ALIGNED + if (aligned) { #if QUANT_K == 1 - stride_a &= ~7; -#endif - stride_b &= ~7; + stride_a &= ~7; #endif + stride_b &= ~7; + } // Create layouts for both clamped and unclamped accesses tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp new file mode 100644 index 0000000000000..52c0b4b7683fa --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.comp @@ -0,0 +1,494 @@ + +void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint idx_k, const uint end_k) { +#if defined(DATA_A_F32) || defined(DATA_A_F16) +#if defined(FLOAT16) && defined(A_TYPE_VEC8) + if (LOAD_VEC_A_SHIFT == 3) { + const uint idx = pos_a + col * ((p.stride_a >> LOAD_VEC_A_SHIFT)) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + const FLOAT_TYPE_VEC8 vals = FLOAT_TYPE_VEC8(data_a_vec8[idx]); + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(vals[0].xy); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(vals[0].zw); + buf_a[buf_idx + 2] = FLOAT_TYPE_VEC2(vals[1].xy); + buf_a[buf_idx + 3] = FLOAT_TYPE_VEC2(vals[1].zw); + } else +#endif + if (LOAD_VEC_A_SHIFT == 2) { + const uint idx = pos_a + col * ((p.stride_a >> LOAD_VEC_A_SHIFT)) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + const FLOAT_TYPE_VEC4 vals = FLOAT_TYPE_VEC4(data_a_vec4[idx]); + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(vals.xy); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(vals.zw); + } else if (idx_m < p.M && idx_k + 1 < end_k) { + buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_a[pos_a + col * p.stride_a + row ], + data_a[pos_a + col * p.stride_a + row + 1]); + } else if (idx_m < p.M && idx_k < end_k) { + buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_a[pos_a + col * p.stride_a + row], 0.0f); + } else { + buf_a[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(0.0f); + } +#elif defined(DATA_A_Q4_0) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (4 * row) / 2; + + const uint ib = idx / 4; + const uint iqs = idx & 0x03; + + const float d = float(data_a_packed16[ib].d); + const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); + const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d; + const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d; + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v0.xy); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v0.zw); + buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v1.xy); + buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.zw); +#elif defined(DATA_A_Q4_1) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (4 * row) / 2; + + const uint ib = idx / 4; + const uint iqs = idx & 0x03; + + const float d = float(data_a_packed16[ib].d); + const float m = float(data_a_packed16[ib].m); + const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16); + const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m; + const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m; + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v0.xy); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v0.zw); + buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v1.xy); + buf_a[buf_idx + 9] = FLOAT_TYPE_VEC2(v1.zw); +#elif defined(DATA_A_Q5_0) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (2 * 
row) / 2; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]); + const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); + + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d; + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xz); + buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v.yw); +#elif defined(DATA_A_Q5_1) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (2 * row) / 2; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const float m = float(data_a_packed16[ib].m); + const uint uint_qh = data_a_packed16[ib].qh; + const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10); + const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10); + + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m; + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xz); + buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(v.yw); +#elif defined(DATA_A_Q8_0) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const float d = float(data_a_packed16[ib].d); + const i8vec2 v0 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs])).xy; // vec4 used due to #12147 + const i8vec2 v1 = unpack8(int32_t(data_a_packed16[ib].qs[2*iqs + 1])).xy; + const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d; + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy); + buf_a[buf_idx + 1] = FLOAT_TYPE_VEC2(v.zw); +#elif defined(DATA_A_Q2_K) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30 + const uint scalesi = iqs / 8; // 0..15 + const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + + const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]); + const uint scales = data_a[ib].scales[scalesi]; + const vec2 d = vec2(data_a[ib].d); + + const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4); + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_Q3_K) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint hmi = (iqs % 16) * 2; // 0,2,4..30 + const uint j = (iqs % 64) / 4; // 0..3 + const uint is = iqs / 8; // 0..15 + const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + const int8_t us = 
int8_t(((data_a[ib].scales[is % 8] >> (4 * int(is / 8))) & 0xF) + | (((data_a[ib].scales[8 + (is % 4)] >> (2 * int(is / 4))) & 3) << 4)); + const float dl = float(data_a[ib].d) * float(us - 32); + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4)), + dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4))); +#elif defined(DATA_A_Q4_K) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + + const vec2 loadd = vec2(data_a[ib].d); + + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0 | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float d = loadd.x * sc; + const float m = -loadd.y * mbyte; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF), m), + fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF), m)); +#elif defined(DATA_A_Q5_K) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + const uint qhi = (iqs % 16) * 2; // 0,2,4..30 + + const uint8_t hm = uint8_t(1 << (iqs / 16)); + + const vec2 loadd = vec2(data_a[ib].d); + + const uint scidx0 = (is < 4) ? is : (is + 4); + const uint scidx1 = (is < 4) ? is : (is - 4); + const uint scidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint scidxshift1 = (is < 4) ? 0 : 2; + const uint mbidx0 = is + 4; + const uint mbidx1 = (is < 4) ? is + 4 : is; + const uint mbidxmask0 = (is < 4) ? 0xF : 0xF0; + const uint mbidxshift0 = (is < 4) ? 0 : 4; + const uint mbidxmask1 = (is < 4) ? 0x30 : 0xC0; + const uint mbidxshift1 = (is < 4) ? 0 : 2; + + const uint8_t sc = uint8_t((data_a[ib].scales[scidx0] & 0xF) | ((data_a[ib].scales[scidx1] & scidxmask1) >> scidxshift1)); + const uint8_t mbyte = uint8_t(((data_a[ib].scales[mbidx0] & mbidxmask0) >> mbidxshift0) | ((data_a[ib].scales[mbidx1] & mbidxmask1) >> mbidxshift1)); + + const float d = loadd.x * sc; + const float m = -loadd.y * mbyte; + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(fma(d, float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0), m), + fma(d, float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 
16 : 0), m)); +#elif defined(DATA_A_Q6_K) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is_b = (iqs % 16) / 8; // 0,1 + const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 + const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + + const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32), + dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32)); +#elif defined(DATA_A_IQ1_S) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib32 = (idx % 128) / 16; // 0..7 + const uint ib8 = (idx % 128) / 4; + const int i8 = 2 * int(idx % 4); + + const float d = float(data_a[ib].d); + const uint qh = data_a[ib].qh[ib32]; + const uint qs = data_a[ib].qs[ib8]; + const float dl = d * (2 * bitfieldExtract(qh, 12, 3) + 1); + const float delta = ((qh & 0x8000) != 0) ? -IQ1S_DELTA : IQ1S_DELTA; + const int16_t grid = int16_t(iq1s_grid[qs | (bitfieldExtract(qh, 3 * int(ib8 & 3), 3) << 8)]); + + const ivec2 gvec = ivec2( + bitfieldExtract(grid, 2 * (i8), 2), + bitfieldExtract(grid, 2 * (i8 + 1), 2) + ); + const vec2 v = dl * (vec2(gvec) + delta); + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ1_M) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib8 = (idx % 128) / 4; + const uint ib16 = ib8 / 2; + const int i8 = 2 * int(idx % 4); + + const uint16_t[4] scales = data_a[ib].scales; + const u16vec4 s = u16vec4(scales[0], scales[1], scales[2], scales[3]) >> 12; + const float d = float(unpackHalf2x16(s.x | (s.y << 4) | (s.z << 8) | (s.w << 12)).x); + const uint sc = scales[ib8 / 8]; + const uint qs = data_a[ib].qs[ib8]; + const uint qh = data_a[ib].qh[ib16] >> (4 * (ib8 & 1)); + const float dl = d * (2 * bitfieldExtract(sc, 3 * int(ib16 & 3), 3) + 1); + const float delta = ((qh & 8) != 0) ? 
-IQ1M_DELTA : IQ1M_DELTA; + const int16_t grid = int16_t(iq1s_grid[qs | ((qh & 7) << 8)]); + const ivec2 gvec = ivec2( + bitfieldExtract(grid, 2 * (i8), 2), + bitfieldExtract(grid, 2 * (i8 + 1), 2) + ); + const vec2 v = dl * (vec2(gvec) + delta); + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ2_XXS) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib32 = (idx % 128) / 16; // 0..7 + const uint ib8 = (idx / 4) % 4; + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[8 * ib32 + ib8]; + const uint signs = pack32(u8vec4( + data_a[ib].qs[8*ib32 + 4], + data_a[ib].qs[8*ib32 + 5], + data_a[ib].qs[8*ib32 + 6], + data_a[ib].qs[8*ib32 + 7] + )); + const float db = d * 0.25 * (0.5 + (signs >> 28)); + const uint32_t sign7 = bitfieldExtract(signs, 7 * int(ib8), 7); + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint grid = iq2xxs_grid[qs][(idx % 4) / 2] >> (16 * (idx & 1)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ2_XS) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib32 = (idx % 128) / 16; // 0..7 + const uint ib8 = (idx / 4) % 4; // 0..3 + + const float d = float(data_a[ib].d); + const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; + const float db = d * 0.25 * (0.5 + scale); + const uint qs = data_a[ib].qs[4 * ib32 + ib8]; + const uint sign7 = qs >> 9; + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint grid = iq2xs_grid[qs & 511][(idx % 4) / 2] >> (16 * (idx & 1)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ2_S) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib8 = (idx % 128) / 4; // 0..31 + const uint ib32 = ib8 / 4; // 0..7 + + const uint scale = (data_a[ib].scales[ib32] >> (2 * (ib8 & 2))) & 0xf; + const uint qs = data_a[ib].qs[ib8]; + const uint qh = data_a[ib].qh[ib32]; + const uint qhshift = 2 * (ib8 % 4); + const uint sign = data_a[ib].qs[QUANT_K / 8 + ib8] >> (2 * (idx % 4)); + + const float d = float(data_a[ib].d); + const float db = d * 0.25 * (0.5 + scale); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint16_t grid = unpack16(iq2s_grid[qs | ((qh << (8 - qhshift)) & 0x300)][(idx & 2) >> 1])[idx & 1]; + const vec2 v = db * vec2(sign01) * vec2(unpack8(uint32_t(grid)).xy); // vec4 used due to #12147 + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ3_XXS) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = (idx % 128) / 2; // 0..63 + const uint is = QUANT_K / 4 + 4 * (iqs 
/ 8); // 8 values + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[iqs]; + const uint signs = pack32(u8vec4( + data_a[ib].qs[is+0], + data_a[ib].qs[is+1], + data_a[ib].qs[is+2], + data_a[ib].qs[is+3] + )); + const float db = d * 0.5 * (0.5 + (signs >> 28)); + const uint32_t sign7 = bitfieldExtract(signs, 7 * (int(iqs / 2) % 4), 7); + const uint sign = (sign7 | (bitCount(sign7) << 7)) >> (2 * (idx % 4)); + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(int8_t(sign << 1), int8_t(sign)))); + const uint grid = iq3xxs_grid[qs] >> (16 * (idx & 1)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ3_S) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = (idx % 128) / 2; // 0..63 + const uint iqh = iqs / 8; + + const float d = float(data_a[ib].d); + const uint qs = data_a[ib].qs[iqs]; + const uint qh = data_a[ib].qh[iqh]; + const int8_t sign = int8_t(data_a[ib].signs[iqs / 2] >> (2 * (idx % 4))); + const uint scale = data_a[ib].scales[iqs / 16]; + const i8vec2 sign01 = i8vec2(1 - (2 & i8vec2(sign << 1, sign))); + const float db = d * (1 + 2 * ((scale >> (4 * (iqh & 1))) & 0xf)); + const uint32_t grid = iq3s_grid[qs | ((qh << (8 - (iqs % 8))) & 256)] >> (16 * (idx % 2)); + const vec2 v = db * vec2(sign01) * vec2(unpack8(grid).xy); // vec4 used due to #12147 + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ4_XS) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_A_SHIFT) / 2; + + const uint ib = idx / 128; // 2 values per idx + const uint ib32 = (idx % 128) / 16; // 0..7 + const uint iq = 16 * ib32 + 2 * (idx % 8); + + const uint sl = (data_a[ib].scales_l[ib32/2] >> (4 * (ib32 & 1))) & 0xF; + const uint sh = ((data_a[ib].scales_h) >> (2 * ib32)) & 3; + const uint qshift = (idx & 8) >> 1; + u8vec2 qs = u8vec2(data_a[ib].qs[iq], data_a[ib].qs[iq + 1]); + qs = (qs >> qshift) & uint8_t(0xF); + + const float d = float(data_a[ib].d); + const vec2 v = d * float(int(sl | (sh << 4)) - 32) * vec2(kvalues_iq4nl[qs.x], kvalues_iq4nl[qs.y]); + + buf_a[buf_idx] = FLOAT_TYPE_VEC2(v.xy); +#elif defined(DATA_A_IQ4_NL) + const uint idx = pos_a + col * (p.stride_a >> LOAD_VEC_A_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (2 * row) / 2; + + const uint ib = idx / 8; + const uint iqs = idx & 0x07; + + const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d); + const uint vui = uint(data_a_packed16[ib].qs[iqs]); + + buf_a[buf_idx ] = FLOAT_TYPE_VEC2(kvalues_iq4nl[vui & 0xF] * d, + kvalues_iq4nl[bitfieldExtract(vui, 8, 4)] * d); + buf_a[buf_idx + 8] = FLOAT_TYPE_VEC2(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)] * d, + kvalues_iq4nl[vui >> 12] * d); +#endif +} + +#if !defined(MUL_MAT_ID) +void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint idx_k, const uint end_k) { +#if defined(B_TYPE_VEC8) + if (LOAD_VEC_B_SHIFT == 3) { + const uint idx = pos_b + col * (p.stride_b >> LOAD_VEC_B_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_B_SHIFT) / 2; + const FLOAT_TYPE_VEC8 vals = FLOAT_TYPE_VEC8(data_b_vec8[idx]); + buf_b[buf_idx + 0] = FLOAT_TYPE_VEC2(vals[0].xy); + buf_b[buf_idx + 1] = FLOAT_TYPE_VEC2(vals[0].zw); + buf_b[buf_idx + 2] = 
FLOAT_TYPE_VEC2(vals[1].xy); + buf_b[buf_idx + 3] = FLOAT_TYPE_VEC2(vals[1].zw); + } else +#endif + if (LOAD_VEC_B_SHIFT == 2) { + const uint idx = pos_b + col * (p.stride_b >> LOAD_VEC_B_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_B_SHIFT) / 2; + const FLOAT_TYPE_VEC4 vals = FLOAT_TYPE_VEC4(data_b_vec4[idx]); + buf_b[buf_idx + 0] = FLOAT_TYPE_VEC2(vals.xy); + buf_b[buf_idx + 1] = FLOAT_TYPE_VEC2(vals.zw); + } else if (idx_n < p.N && idx_k + 1 < end_k) { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_b[pos_b + col * p.stride_b + row ], + data_b[pos_b + col * p.stride_b + row + 1]); + } else if (idx_n < p.N && idx_k < end_k) { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_b[pos_b + col * p.stride_b + row], 0.0f); + } else { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(0.0f); + } +} +#else +void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1) { +#if defined(B_TYPE_VEC8) + if (LOAD_VEC_B_SHIFT == 3) { + const u16vec2 row_idx = row_ids[ic * BN + col]; + const uint idx = pos_b + row_idx.y * (p.batch_stride_b >> LOAD_VEC_B_SHIFT) + (row_idx.x % p.ne11) * (p.stride_b >> LOAD_VEC_B_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_B_SHIFT) / 2; + const FLOAT_TYPE_VEC8 vals = FLOAT_TYPE_VEC8(data_b_vec8[idx]); + buf_b[buf_idx + 0] = FLOAT_TYPE_VEC2(vals[0].xy); + buf_b[buf_idx + 1] = FLOAT_TYPE_VEC2(vals[0].zw); + buf_b[buf_idx + 2] = FLOAT_TYPE_VEC2(vals[1].xy); + buf_b[buf_idx + 3] = FLOAT_TYPE_VEC2(vals[1].zw); + } else +#endif + if (LOAD_VEC_B_SHIFT == 2) { + const u16vec2 row_idx = row_ids[ic * BN + col]; + const uint idx = pos_b + row_idx.y * (p.batch_stride_b >> LOAD_VEC_B_SHIFT) + (row_idx.x % p.ne11) * (p.stride_b >> LOAD_VEC_B_SHIFT) + row; + const uint buf_idx = col * SHMEM_STRIDE + (row << LOAD_VEC_B_SHIFT) / 2; + const FLOAT_TYPE_VEC4 vals = FLOAT_TYPE_VEC4(data_b_vec4[idx]); + buf_b[buf_idx + 0] = FLOAT_TYPE_VEC2(vals.xy); + buf_b[buf_idx + 1] = FLOAT_TYPE_VEC2(vals.zw); + } else { + const uint row_i_1 = ic * BN + col; + const uint row_i_2 = ic * BN + col + 1; + if (row_i_1 < _ne1 && row_i_2 < _ne1) { + const u16vec2 row_idx_1 = row_ids[row_i_1]; + const u16vec2 row_idx_2 = row_ids[row_i_2]; + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_b[pos_b + row_idx_1.y * p.batch_stride_b + (row_idx_1.x % p.ne11) * p.stride_b + row], + data_b[pos_b + row_idx_2.y * p.batch_stride_b + (row_idx_2.x % p.ne11) * p.stride_b + row]); + } else if (row_i_1 < _ne1) { + const u16vec2 row_idx = row_ids[row_i_1]; + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(data_b[pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row], 0.0f); + } else { + buf_b[col * SHMEM_STRIDE + row] = FLOAT_TYPE_VEC2(0.0f); + } + } +} +#endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp index f5b29bfb13a66..da98af406d11d 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/types.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/types.comp @@ -11,25 +11,19 @@ #define QUANT_K 1 #define QUANT_R 1 -#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float -#elif LOAD_VEC_A == 4 -#define A_TYPE vec4 -#elif LOAD_VEC_A == 8 -#define A_TYPE mat2x4 -#endif +#define A_TYPE_VEC4 vec4 +#define A_TYPE_VEC8 mat2x4 #endif #if defined(DATA_A_F16) #define QUANT_K 1 #define QUANT_R 1 -#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1 #define A_TYPE float16_t -#elif LOAD_VEC_A
== 8 -#define A_TYPE f16mat2x4 +#define A_TYPE_VEC4 f16vec4 +#if defined(FLOAT16) +#define A_TYPE_VEC8 f16mat2x4 #endif #endif @@ -53,6 +47,7 @@ struct block_q4_0_packed16 #define QUANT_AUXF 1 #define A_TYPE block_q4_0 #define A_TYPE_PACKED16 block_q4_0_packed16 +#define LOAD_VEC_A_SHIFT 3 #endif #define QUANT_K_Q4_1 32 @@ -85,6 +80,7 @@ struct block_q4_1_packed32 #define A_TYPE block_q4_1 #define A_TYPE_PACKED16 block_q4_1_packed16 #define A_TYPE_PACKED32 block_q4_1_packed32 +#define LOAD_VEC_A_SHIFT 3 #endif #define QUANT_K_Q5_0 32 @@ -110,6 +106,7 @@ struct block_q5_0_packed16 #define QUANT_AUXF 1 #define A_TYPE block_q5_0 #define A_TYPE_PACKED16 block_q5_0_packed16 +#define LOAD_VEC_A_SHIFT 2 #endif #define QUANT_K_Q5_1 32 @@ -145,6 +142,7 @@ struct block_q5_1_packed32 #define A_TYPE block_q5_1 #define A_TYPE_PACKED16 block_q5_1_packed16 #define A_TYPE_PACKED32 block_q5_1_packed32 +#define LOAD_VEC_A_SHIFT 2 #endif #define QUANT_K_Q8_0 32 @@ -173,6 +171,7 @@ struct block_q8_0_packed32 #define A_TYPE block_q8_0 #define A_TYPE_PACKED16 block_q8_0_packed16 #define A_TYPE_PACKED32 block_q8_0_packed32 +#define LOAD_VEC_A_SHIFT 2 #endif #define QUANT_K_Q8_1 32 @@ -223,6 +222,7 @@ struct block_q2_K_packed32 #define A_TYPE block_q2_K #define A_TYPE_PACKED16 block_q2_K_packed16 #define A_TYPE_PACKED32 block_q2_K_packed32 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_Q3_K 256 @@ -247,6 +247,7 @@ struct block_q3_K_packed16 #define QUANT_K QUANT_K_Q3_K #define A_TYPE block_q3_K #define A_TYPE_PACKED16 block_q3_K_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_Q4_K 256 @@ -282,6 +283,7 @@ struct block_q4_K_packed128 #define A_TYPE block_q4_K #define A_TYPE_PACKED16 block_q4_K_packed16 #define A_TYPE_PACKED32 block_q4_K_packed32 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_Q5_K 256 @@ -311,6 +313,7 @@ struct block_q5_K_packed128 #define QUANT_K QUANT_K_Q5_K #define A_TYPE block_q5_K #define A_TYPE_PACKED16 block_q5_K_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_Q6_K 256 @@ -335,6 +338,7 @@ struct block_q6_K_packed16 #define QUANT_K QUANT_K_Q6_K #define A_TYPE block_q6_K #define A_TYPE_PACKED16 block_q6_K_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif // IQuants @@ -367,12 +371,14 @@ struct block_iq1_m_packed64 { #define QUANT_K QUANT_K_IQ1_S #define QUANT_R QUANT_R_IQ1_S #define A_TYPE block_iq1_s +#define LOAD_VEC_A_SHIFT 1 #endif #if defined(DATA_A_IQ1_M) #define QUANT_K QUANT_K_IQ1_M #define QUANT_R QUANT_R_IQ1_M #define A_TYPE block_iq1_m +#define LOAD_VEC_A_SHIFT 1 #endif #if defined(DATA_A_IQ1_S) || defined(DATA_A_IQ1_M) @@ -631,6 +637,7 @@ void init_iq_shmem(uvec3 wgsize) #define QUANT_R QUANT_R_IQ2_XXS #define A_TYPE block_iq2_xxs #define A_TYPE_PACKED16 block_iq2_xxs_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_IQ2_XS 256 @@ -801,6 +808,7 @@ void init_iq_shmem(uvec3 wgsize) #define QUANT_R QUANT_R_IQ2_XS #define A_TYPE block_iq2_xs #define A_TYPE_PACKED16 block_iq2_xs_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_IQ2_S 256 @@ -1101,6 +1109,7 @@ void init_iq_shmem(uvec3 wgsize) #define QUANT_R QUANT_R_IQ2_S #define A_TYPE block_iq2_s #define A_TYPE_PACKED16 block_iq2_s_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_IQ3_XXS 256 @@ -1173,6 +1182,7 @@ void init_iq_shmem(uvec3 wgsize) #define QUANT_R QUANT_R_IQ3_XXS #define A_TYPE block_iq3_xxs #define A_TYPE_PACKED16 block_iq3_xxs_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_IQ3_S 256 @@ -1283,6 +1293,7 @@ void init_iq_shmem(uvec3 wgsize) #define QUANT_R 
QUANT_R_IQ3_S #define A_TYPE block_iq3_s #define A_TYPE_PACKED16 block_iq3_s_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #define QUANT_K_IQ4_XS 256 @@ -1322,6 +1333,7 @@ struct block_iq4_nl_packed16 #define QUANT_R QUANT_R_IQ4_NL #define A_TYPE block_iq4_nl #define A_TYPE_PACKED16 block_iq4_nl_packed16 +#define LOAD_VEC_A_SHIFT 1 #endif #if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index cf74625cc56d5..2c424ba9ed38a 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -292,13 +292,11 @@ void string_to_spv(const std::string& _name, const std::string& in_fname, const void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool f16acc) { std::string load_vec = coopmat2 ? "1" : fp16 ? "8" : "4"; - std::string aligned_b_type_f32 = coopmat2 ? "float" : fp16 ? "mat2x4" : "vec4"; - std::string aligned_b_type_f16 = coopmat2 ? "float16_t" : fp16 ? "f16mat2x4" : "f16vec4"; - std::map base_dict = { - {"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float"}, - {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2"}, - }; + std::map base_dict = {{"FLOAT_TYPE", (coopmat2 || fp16) ? "float16_t" : "float" }, + {"FLOAT_TYPE_VEC2", (coopmat2 || fp16) ? "f16vec2" : "vec2" }, + {"FLOAT_TYPE_VEC4", (coopmat2 || fp16) ? "f16vec4" : "vec4" }, + {"FLOAT_TYPE_VEC8", (coopmat2 || fp16) ? "f16mat2x4" : "mat2x4"}}; std::string shader_name = "matmul"; if (matmul_id) { @@ -310,7 +308,8 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool base_dict["FLOAT16"] = "1"; } - base_dict["ACC_TYPE"] = f16acc ? "float16_t" : "float"; + base_dict["ACC_TYPE" ] = f16acc ? "float16_t" : "float"; + base_dict["ACC_TYPE_VEC2"] = f16acc ? "f16vec2" : "vec2"; if (coopmat) { base_dict["COOPMAT"] = "1"; @@ -319,34 +318,34 @@ void matmul_shaders(bool fp16, bool matmul_id, bool coopmat, bool coopmat2, bool const std::string source_name = coopmat2 ? 
"mul_mm_cm2.comp" : "mul_mm.comp"; // Shaders with f16 B_TYPE - string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}, }), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f32_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - - string_to_spv(shader_name + "_f16_aligned", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); + if (fp16) { + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"B_TYPE_VEC8", "f16mat2x4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"B_TYPE_VEC8", "f16mat2x4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + } else { + string_to_spv(shader_name + "_f32_f16", source_name, merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_f16", source_name, merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + } for (const auto& tname : type_names) { - std::string load_vec_quant = "2"; + std::string load_vec_quant_shift = "1"; if ((tname == "q4_0") || (tname == "q4_1")) - load_vec_quant = "8"; + load_vec_quant_shift = "3"; else if ((tname == "q5_0") || (tname == "q5_1") || (tname == "q8_0") || (tname == "iq4_nl")) - load_vec_quant = "4"; + load_vec_quant_shift = "2"; std::string data_a_key = "DATA_A_" + to_uppercase(tname); - // For unaligned, load one at a time for f32/f16, or two at a time for quants - std::string load_vec_a_unaligned = (coopmat2 || tname == "f32" || tname == "f16") ? "1" : load_vec_quant; - // For aligned matmul loads - std::string load_vec_a = (coopmat2 || tname == "f32" || tname == "f16") ? 
load_vec : load_vec_quant; // don't generate f32 variants for coopmat2 if (!coopmat2) { - string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f32_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + string_to_spv(shader_name + "_" + tname + "_f32", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"B_TYPE_VEC4", "vec4"}, {"B_TYPE_VEC8", "mat2x4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); } if (tname != "f16" && tname != "f32") { - string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16, coopmat, coopmat2, f16acc); - string_to_spv(shader_name + "_" + tname + "_f16_aligned", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}, {"ALIGNED", "1"}}), fp16, coopmat, coopmat2, f16acc); + if (fp16) { + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"B_TYPE_VEC8", "f16mat2x4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + } else { + string_to_spv(shader_name + "_" + tname + "_f16", source_name, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"B_TYPE_VEC4", "f16vec4"}, {"D_TYPE", "float"}, {"D_TYPE_VEC2", "vec2"}, {"D_TYPE_VEC4", "vec4"}}), fp16, coopmat, coopmat2, f16acc); + } } #if defined(GGML_VULKAN_INTEGER_DOT_GLSLC_SUPPORT) @@ -388,6 +387,7 @@ void process_shaders() { // flash attention for (const auto& f16acc : {false, true}) { std::string acctype = f16acc ? "float16_t" : "float"; + std::string acctype_vec2 = f16acc ? "f16vec2" : "vec2"; for (const auto& tname : type_names) { if (tname == "f32") {