diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp index 35ab6b16726ae7..ad79e5178f21a8 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/paged_attention.hpp @@ -40,7 +40,7 @@ struct paged_attention : public primitive_base { ob << heads_num; ob << kv_heads_num; ob << has_alibi; - ob << has_rotation_coefficients; + ob << has_rotated_blocks; } void load(BinaryInputBuffer& ib) override { @@ -49,7 +49,7 @@ struct paged_attention : public primitive_base { ib >> heads_num; ib >> kv_heads_num; ib >> has_alibi; - ib >> has_rotation_coefficients; + ib >> has_rotated_blocks; } optional_value scale_val{}; @@ -57,6 +57,6 @@ struct paged_attention : public primitive_base { size_t heads_num = 0; size_t kv_heads_num = 0; bool has_alibi = false; - bool has_rotation_coefficients = false; + bool has_rotated_blocks = false; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp index 48092d690455a0..ed4d2f97fcf378 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp @@ -203,9 +203,10 @@ struct paged_attention_impl : multi_stage_primitive { // dependency args.inputs.push_back(instance.subsequence_begins_memory_ptr()); } - if (desc->has_rotation_coefficients) { - args.inputs.push_back(instance.rotation_coefficients_memory_ptr()); + if (desc->has_rotated_blocks) { args.inputs.push_back(instance.rotated_block_indices_memory_ptr()); + args.inputs.push_back(instance.rotation_deltas_memory_ptr()); + args.inputs.push_back(instance.rotation_trig_lut_memory_ptr()); } } else if (kernel_idx == 4) { // Output scores calculation kernel diff --git a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h 
b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h index df7074134fb560..675d77296aa06b 100644 --- a/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/paged_attention_inst.h @@ -62,13 +62,6 @@ class typed_primitive_inst : public typed_primitive_inst_base

prefill_network; protected: diff --git a/src/plugins/intel_gpu/src/graph/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/paged_attention.cpp index 4622420a05ce53..48ae46d83de34a 100644 --- a/src/plugins/intel_gpu/src/graph/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/paged_attention.cpp @@ -98,7 +98,7 @@ std::string paged_attention_inst::to_string(const paged_attention_node& node) { paged_attention_info.add("kv_heads_num", desc->kv_heads_num); paged_attention_info.add("scale", desc->scale_val.value_or(1.0f)); paged_attention_info.add("has_alibi", desc->has_alibi); - paged_attention_info.add("has_rotation_coefficients", desc->has_rotation_coefficients); + paged_attention_info.add("has_rotated_blocks", desc->has_rotated_blocks); node_info->add("paged_attention primitive info", paged_attention_info); node_info->dump(primitive_description); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl index 4a8d6e8b19a796..2d6598e0a654cc 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/pa_sdpa_opt.cl @@ -43,9 +43,10 @@ KERNEL(pa_sdpa_opt)( #if HAS_ALIBI const __global ALIBI_INPUT_TYPE* alibi_slopes, #endif -#if HAS_ROTATION_COEFFICIENTS - const __global INPUT8_TYPE* rotation_coefficients, - const __global INPUT9_TYPE* rotated_block_indices, +#if HAS_ROTATED_BLOCKS + const __global INPUT8_TYPE* rotated_block_indices, + const __global INPUT9_TYPE* rotation_deltas, + const __global INPUT10_TYPE* rotation_trig_lut, #endif __global OUTPUT_TYPE* output, #if PAGED_ATTENTION_SCORES_OUTPUT @@ -67,8 +68,9 @@ KERNEL(pa_sdpa_opt)( // subsequence_begins: [sequences_num + 1] // block_indices: [used_blocks_num] // block_indices_begins: [sequences_num + 1] - // rotation_coefficients: [num_rotated_blocks * PAGED_ATTENTION_BLOCK_SIZE] // rotated_block_indices: [num_rotated_blocks ] + // 
rotation_deltas [num_rotated_blocks, 1 || PAGED_ATTENTION_BLOCK_SIZE ] + // rotation_trig_lut [MAX_CONTEXT_LEN, HEAD_SIZE] // // Output shapes: // output: [sequences_num, HEADS_NUM * HEAD_SIZE] @@ -154,7 +156,7 @@ KERNEL(pa_sdpa_opt)( } #endif -#ifdef HAS_ROTATION_COEFFICIENTS +#ifdef HAS_ROTATED_BLOCKS // TODO (vshampor): add cache block rotation at this spot #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp index a8722da033668b..bac6ebd11fbe9b 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/pa_sdpa_kernel_opt.cpp @@ -237,8 +237,8 @@ JitConstants PagedAttentionSDPAKernelOpt::GetJitConstants(const pa_sdpa_params& jit.AddConstant(MakeJitConstant("PAGED_ATTENTION_SCORES_OUTPUT", 1)); } - if (params.conf.has_rotation_coefficients_input) - jit.AddConstant(MakeJitConstant("HAS_ROTATION_COEFFICIENTS", 1)); + if (params.conf.has_rotated_blocks) + jit.AddConstant(MakeJitConstant("HAS_ROTATED_BLOCKS", 1)); if (kernel_idx == KernelsTypes::MULTI_TOKENS || kernel_idx == KernelsTypes::FINALIZATION_MULTI_TOKENS) jit.AddConstant(MakeJitConstant("MULTI_TOKENS_PROCESSING", 1)); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index eb7ec907d047f5..7b9519395d88ca 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -100,7 +100,7 @@ struct sdpa_configuration { int64_t paged_attention_max_len = 0; bool has_const_scale_val = false; float scale_val = 0.f; - bool has_rotation_coefficients_input = false; + bool has_rotated_blocks = false; }; 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp index 31e41db0252170..b56807d720b870 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/paged_attention.cpp @@ -48,7 +48,7 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared const size_t scale_idx = 9; const size_t alibi_idx = 11; - const size_t rotation_coefficients_idx = 13; + const size_t rotated_block_indices_idx = 13; std::shared_ptr scale_const = std::dynamic_pointer_cast(op->get_input_node_shared_ptr(scale_idx)); if (scale_const) { @@ -64,10 +64,10 @@ static void CreatePagedAttentionExtensionOp(ProgramBuilder& p, const std::shared prim.num_outputs = 1; - std::shared_ptr rotation_coefficients_const = - std::dynamic_pointer_cast(op->get_input_node_shared_ptr(rotation_coefficients_idx)); - OPENVINO_ASSERT(rotation_coefficients_const != nullptr); - prim.has_rotation_coefficients = ov::shape_size(rotation_coefficients_const->get_output_shape(0)) > 0; + std::shared_ptr rotated_block_indices_const = + std::dynamic_pointer_cast(op->get_input_node_shared_ptr(rotated_block_indices_idx)); + OPENVINO_ASSERT(rotated_block_indices_const != nullptr); + prim.has_rotated_blocks = ov::shape_size(rotated_block_indices_const->get_output_shape(0)) > 0; if (op->get_output_size() > 1) { const auto scores_output_idx = 1;