From 7a57f7abb4f7e600e3c168135335ca2ee2df657b Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Thu, 29 Aug 2024 12:01:14 +0800
Subject: [PATCH 01/13] add linux-perf

---
 src/plugins/intel_cpu/src/graph.cpp           |   14 +-
 .../intel_cpu/src/nodes/linux_perf.hpp        | 1242 +++++++++++++++++
 2 files changed, 1252 insertions(+), 4 deletions(-)
 create mode 100644 src/plugins/intel_cpu/src/nodes/linux_perf.hpp
diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index aab78a4d5f15bd..92b541f9b2543a 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -45,6 +45,7 @@
 #include "utils/node_dumper.h"
 #include "utils/precision_support.h"
 #include "utils/verbose.h"
+#include "nodes/linux_perf.hpp"
 
 #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
 #    include <tbb/task.h>
@@ -108,6 +109,7 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
     OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model");
 
     this->_name = model->get_friendly_name();
+    LinuxPerf::Init();
 
     // Map data object onto producer node
     std::map<std::shared_ptr<ov::Node>, NodePtr> op2node;
@@ -1162,6 +1164,7 @@ VecMemoryDescs Graph::getOutputMemoryDescriptors() const {
 
 void Graph::InferStatic(SyncInferRequest* request, int numaId) {
     for (const auto& node : m_executableGraphNodes) {
+        auto perf1 = LinuxPerf::Profile(node->getTypeStr());
         ExecuteNodeWithCatch(node, request, numaId);
     }
 }
@@ -1437,11 +1440,15 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r
 template <typename UpdateStrategy>
 void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) {
     size_t inferCounter = 0;
+    auto perf = LinuxPerf::Profile(std::string("Graph::InferDynamic_#") + std::to_string(infer_count));
     for (auto stopIndx : m_executableSyncNodesInds) {
-        update(stopIndx);
-
+        {
+            auto perf1 = LinuxPerf::Profile("update");
+            update(stopIndx);
+        }
         for (; inferCounter < stopIndx; ++inferCounter) {
             auto& node = m_executableGraphNodes[inferCounter];
+            auto perf1 = LinuxPerf::Profile(node->getTypeStr()); // + "_" + node->getName());
 
             ExecuteNodeWithCatch(node, request, numaId);
         }
@@ -1487,8 +1494,7 @@ void Graph::Infer(SyncInferRequest* request) {
                         static_cast<int>(status));
     }
 
-    if (infer_count != -1)
-        infer_count++;
+    infer_count++;
 }
 
 void Graph::SortTopologically() {
diff --git a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
new file mode 100644
index 00000000000000..fa9498fab70e81
--- /dev/null
+++ b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
@@ -0,0 +1,1242 @@
+
+#include <linux/perf_event.h>
+#include <time.h>
+//#include <linux/time.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+
+#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
+#include <sys/syscall.h>
+#define gettid() syscall(SYS_gettid)
+#endif
+
+inline int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
+	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
+}
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include <atomic>
+#include <x86intrin.h>
+#include <sys/mman.h>
+#include <thread>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <deque>
+#include <mutex>
+#include <set>
+#include <iomanip>
+#include <functional>
+#include <limits>
+
+namespace LinuxPerf {
+
+#define _LINE_STRINGIZE(x) _LINE_STRINGIZE2(x)
+#define _LINE_STRINGIZE2(x) #x
+#define LINE_STRING _LINE_STRINGIZE(__LINE__)
+
+#define LINUX_PERF_ "\e[33m[LINUX_PERF:" LINE_STRING "]\e[0m "
+
+inline uint64_t get_time_ns() {
+    struct timespec tp0;
+    if (clock_gettime(CLOCK_MONOTONIC_RAW, &tp0) != 0) {
+        perror(LINUX_PERF_"clock_gettime(CLOCK_MONOTONIC_RAW,...) failed!");
+        abort();
+    }
+    return (tp0.tv_sec * 1000000000) + tp0.tv_nsec;    
+}
+
+struct TscCounter {
+    uint64_t tsc_ticks_per_second;
+    uint64_t tsc_ticks_base;
+    double tsc_to_usec(uint64_t tsc_ticks) const {
+        if (tsc_ticks < tsc_ticks_base)
+            return 0;
+        return (tsc_ticks - tsc_ticks_base) * 1000000.0 / tsc_ticks_per_second;
+    }
+    double tsc_to_usec(uint64_t tsc_ticks0, uint64_t tsc_ticks1) const {
+        if (tsc_ticks1 < tsc_ticks0)
+            return 0;
+        return (tsc_ticks1 - tsc_ticks0) * 1000000.0 / tsc_ticks_per_second;
+    }
+    TscCounter() {
+        uint64_t start_ticks = __rdtsc();
+        std::this_thread::sleep_for(std::chrono::seconds(1));
+        tsc_ticks_per_second = (__rdtsc() - start_ticks);
+        std::cout << LINUX_PERF_"tsc_ticks_per_second = " << tsc_ticks_per_second << std::endl;
+        tsc_ticks_base = __rdtsc();
+
+        // use CLOCK_MONOTONIC_RAW instead of TSC
+        tsc_ticks_per_second = 1000000000; // ns
+        tsc_ticks_base = get_time_ns();
+    }
+};
+
+class IPerfEventDumper {
+public:
+    virtual void dump_json(std::ofstream& fw, TscCounter& tsc) = 0;
+};
+
+struct PerfEventJsonDumper {
+    std::mutex g_mutex;
+    std::set<IPerfEventDumper*> all_dumpers;
+    const char* dump_file_name = "perf_dump.json";
+    bool dump_file_over = false;
+    bool not_finalized = true;
+    std::ofstream fw;
+    std::atomic_int totalProfilerManagers{0};
+    TscCounter tsc;
+
+    ~PerfEventJsonDumper() {
+        if (not_finalized)
+            finalize();
+    }
+
+    void finalize() {
+        if (!not_finalized)
+            return;
+        std::lock_guard<std::mutex> guard(g_mutex);
+        if (dump_file_over || all_dumpers.empty())
+            return;
+
+        // start dump
+        fw.open(dump_file_name, std::ios::out);
+        fw << "{\n";
+        fw << "\"schemaVersion\": 1,\n";
+        fw << "\"traceEvents\": [\n";
+        fw.flush();
+
+        for (auto& pthis : all_dumpers) {
+            pthis->dump_json(fw, tsc);
+        }
+        all_dumpers.clear();
+
+        fw << R"({
+            "name": "Profiler End",
+            "ph": "i",
+            "s": "g",
+            "pid": "Traces",
+            "tid": "Trace OV Profiler",
+            "ts":)"
+           << tsc.tsc_to_usec(get_time_ns()) << "}",
+            fw << "]\n";
+        fw << "}\n";
+        auto total_size = fw.tellp();
+        fw.close();
+        dump_file_over = true;
+        not_finalized = false;
+
+        std::cout << LINUX_PERF_"Dumpped ";
+        
+        if (total_size < 1024) std::cout << total_size << " bytes ";
+        else if (total_size < 1024*1024) std::cout << total_size/1024 << " KB ";
+        else std::cout << total_size/(1024 * 1024) << " MB ";
+        std::cout << " to " << dump_file_name << std::endl;
+    }
+
+    int register_manager(IPerfEventDumper* pthis) {
+        std::lock_guard<std::mutex> guard(g_mutex);
+        std::stringstream ss;
+        auto serial_id = totalProfilerManagers.fetch_add(1);
+        ss << LINUX_PERF_"#" << serial_id << "(" << pthis << ") : is registed." << std::endl;
+        std::cout << ss.str();
+        all_dumpers.emplace(pthis);
+        return serial_id;
+    }
+
+    static PerfEventJsonDumper& get() {
+        static PerfEventJsonDumper inst;
+        return inst;
+    }
+};
+
+inline std::vector<std::string> str_split(const std::string& s, std::string delimiter) {
+    std::vector<std::string> ret;
+    size_t last = 0;
+    size_t next = 0;
+    while ((next = s.find(delimiter, last)) != std::string::npos) {
+        //std::cout << last << "," << next << "=" << s.substr(last, next-last) << "\n";
+        ret.push_back(s.substr(last, next-last));
+        last = next + 1;
+    }
+    ret.push_back(s.substr(last));
+    return ret;
+}
+
+template<typename T>
+T& read_ring_buffer(perf_event_mmap_page& meta, uint64_t& offset) {
+    auto offset0 = offset;
+    offset += sizeof(T);
+    return *reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(&meta) + meta.data_offset + (offset0)%meta.data_size);
+}
+
+struct PerfRawConfig {
+    PerfRawConfig() {
+        // env var defined raw events
+        const char* str_raw_config = std::getenv("LINUX_PERF");
+        if (str_raw_config) {
+            CPU_ZERO(&cpu_mask);
+            // options are separated by ":" as PATH
+            auto options = str_split(str_raw_config, ":");
+            for(auto& opt : options) {
+                auto items = str_split(opt, "=");
+                if (items.size() == 2) {
+                    if (items[0] == "dump") {
+                        // limit the number of dumps per thread
+                        dump = strtoll(&items[1][0], nullptr, 0);
+                    } else if (items[0] == "cpus") {
+                        // thread's affinity (cpu-binding) can be changed by threading-libs(TBB/OpenMP) anytime
+                        // sched_getaffinity() can only get correct binding at start-up time, another way is to specify it 
+                        // also too many events may generate if per-thread event is used, cpus can limit
+                        // cpus=56
+                        // cpus=56.57.59
+                        auto cpus = str_split(items[1], ",");
+                        CPU_ZERO(&cpu_mask);
+                        for(auto& cpu : cpus) {
+                            CPU_SET(std::atoi(cpu.c_str()), &cpu_mask);
+                        }
+                    } else {
+                        auto config = strtoul(&items[1][0], nullptr, 0);
+                        if (config > 0)
+                            raw_configs.emplace_back(items[0], config);
+                    }
+                }
+                if (items.size() == 1) {
+                    if (items[0] == "switch-cpu") {
+                        // get cpu_mask as early as possible
+                        switch_cpu = true;
+                        CPU_ZERO(&cpu_mask);
+                        if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &cpu_mask)) {
+                            perror(LINUX_PERF_"sched_getaffinity failed:");
+                            abort();
+                        }
+                    }
+                    if (items[0] == "dump")
+                        dump = std::numeric_limits<int64_t>::max(); // no limit to number of dumps
+                }
+            }
+
+            for(auto& cfg : raw_configs) {
+                printf(LINUX_PERF_" config: %s=0x%lx\n", cfg.first.c_str(), cfg.second);
+            }
+            if (switch_cpu) {
+                printf(LINUX_PERF_" config: switch_cpu\n");
+            }
+            if (dump)
+                printf(LINUX_PERF_" config: dump=%ld\n", dump);
+            if (CPU_COUNT(&cpu_mask)) {
+                printf(LINUX_PERF_" config: cpus=");
+                for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++)
+                    if(CPU_ISSET(cpu, &cpu_mask)) printf("%d,", cpu);
+                printf("\n");
+            }
+        } else {
+            printf(LINUX_PERF_" LINUX_PERF is unset, example: LINUX_PERF=dump,switch-cpu,L2_MISS=0x10d1\n");
+        }
+    }
+
+    bool dump_on_cpu(int cpu) {
+        if (dump == 0)
+            return false;
+        if (CPU_COUNT(&cpu_mask))
+            return CPU_ISSET(cpu, &cpu_mask);
+        return true;
+    }
+
+    int64_t dump = 0;
+    cpu_set_t cpu_mask;
+    bool switch_cpu = false;
+    std::vector<int> dump_cpus;
+    std::vector<std::pair<std::string, uint64_t>> raw_configs;
+
+    static PerfRawConfig& get() {
+        static PerfRawConfig inst;
+        return inst;
+    }
+};
+
+
+// context switch events
+// this will visualize 
+struct PerfEventCtxSwitch : public IPerfEventDumper {
+    bool is_enabled;
+
+    struct event {
+        int fd;
+        perf_event_mmap_page * meta;
+        int cpu;
+        uint64_t ctx_switch_in_time;
+        uint64_t ctx_switch_in_tid;
+        uint64_t ctx_last_time;
+
+        event(int fd, perf_event_mmap_page * meta): fd(fd), meta(meta) {}
+    };
+    std::vector<event> events;
+
+    PerfEventCtxSwitch() {
+        is_enabled = PerfRawConfig::get().switch_cpu;
+        if (is_enabled) {
+            // make sure TSC in PerfEventJsonDumper is the very first thing to initialize
+            PerfEventJsonDumper::get().register_manager(this);
+
+            // open fd for each CPU
+            cpu_set_t mask = PerfRawConfig::get().cpu_mask;
+
+            long number_of_processors = sysconf(_SC_NPROCESSORS_ONLN);
+            printf(LINUX_PERF_"sizeof(cpu_set_t):%lu: _SC_NPROCESSORS_ONLN=%ld CPU_COUNT=%d\n", sizeof(cpu_set_t), number_of_processors, CPU_COUNT(&mask));
+            if (CPU_COUNT(&mask) >= number_of_processors) {
+                printf(LINUX_PERF_" no affinity is set, will not enable PerfEventCtxSwitch\n");
+                is_enabled = false;
+                return;
+            }
+
+            for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) {
+                auto is_set = CPU_ISSET(cpu, &mask);
+                if (!is_set) continue;
+
+                perf_event_attr pea;
+                memset(&pea, 0, sizeof(struct perf_event_attr));
+                pea.type = PERF_TYPE_HARDWARE;
+                pea.size = sizeof(struct perf_event_attr);
+                pea.config = PERF_COUNT_HW_REF_CPU_CYCLES;  // not the point, can be any
+                pea.disabled = 0;
+                pea.exclude_kernel = 1;
+                pea.exclude_hv = 1;
+                pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
+                // pinned: It applies only to hardware counters and only to group leaders
+                pea.pinned = 1;
+                pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+                // for group master, generate PERF_RECORD_SWITCH into ring-buffer
+                // is helpful to visualize context switch
+                pea.context_switch = 1;
+                // then TID, TIME, ID, STREAM_ID, and CPU can additionally be included in non-PERF_RECORD_SAMPLEs
+                // if the  corresponding sample_type is selected
+                pea.sample_id_all = 1;
+                pea.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_TID | PERF_SAMPLE_CPU;
+                auto mmap_length = sysconf(_SC_PAGESIZE) * (1024 + 1);
+                pea.use_clockid = 1;
+                pea.clockid = CLOCK_MONOTONIC_RAW;
+
+                // calling thread on any processor
+                pid_t pid = -1;
+                // measures all processes/threads on the specified CPU
+                int ctx_switch_fd = perf_event_open(&pea, pid, cpu, -1, 0);
+                if (ctx_switch_fd < 0) {
+                    perror(LINUX_PERF_"PerfEventCtxSwitch perf_event_open failed (check /proc/sys/kernel/perf_event_paranoid please)");
+                    abort();
+                }
+
+                auto* ctx_switch_pmeta = reinterpret_cast<perf_event_mmap_page*>(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ctx_switch_fd, 0));
+                if (ctx_switch_pmeta == MAP_FAILED) {
+                    perror(LINUX_PERF_"mmap perf_event_mmap_page failed:");
+                    close(ctx_switch_fd);
+                    abort();
+                }
+                printf(LINUX_PERF_"perf_event_open CPU_WIDE context_switch on cpu %d, ctx_switch_fd=%d\n", cpu, ctx_switch_fd);
+                events.emplace_back(ctx_switch_fd, ctx_switch_pmeta);
+                events.back().ctx_switch_in_time = get_time_ns();
+                events.back().ctx_last_time = get_time_ns();
+                events.back().cpu = cpu;
+            }
+            my_pid = getpid();
+            my_tid = gettid();
+        }
+    }
+
+    ~PerfEventCtxSwitch() {
+        if (is_enabled) {
+            PerfEventJsonDumper::get().finalize();
+        }
+        for(auto& ev : events) {
+            close(ev.fd);
+        }
+    }
+
+    struct ProfileData {
+        uint64_t tsc_start;
+        uint64_t tsc_end;
+        uint32_t tid;
+        uint32_t cpu;
+        bool preempt;   // preempt means current TID preempts previous thread
+    };
+
+    std::deque<ProfileData> all_dump_data;
+
+    void dump_json(std::ofstream& fw, TscCounter& tsc) override {
+        static std::atomic_uint64_t async_evid{0};
+        if (!is_enabled) return;
+
+        updateRingBuffer();
+
+        auto data_size = all_dump_data.size();
+        if (!data_size) return;
+
+        for (auto& ev : events) {
+            if (ev.ctx_switch_in_time == 0) continue;
+            all_dump_data.emplace_back();
+            auto* pd = &all_dump_data.back();
+            pd->tid = ev.ctx_switch_in_tid;
+            pd->cpu = ev.cpu;
+            pd->tsc_start = ev.ctx_switch_in_time;
+            pd->tsc_end = get_time_ns();
+            ev.ctx_switch_in_time = 0;
+        }
+
+        auto pid = 9999;    // fake pid for CPU
+        auto cat = "TID";
+        
+        // TID is used for CPU id instead
+        for (auto& d : all_dump_data) {
+            auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end);
+            auto start = tsc.tsc_to_usec(d.tsc_start);
+            //auto end = tsc.tsc_to_usec(d.tsc_end);
+            auto cpu_id = d.cpu;
+
+            fw << "{\"ph\": \"X\", \"name\": \"" << d.tid << "\", \"cat\":\"" << cat << "\","
+                << "\"pid\": " << pid << ", \"tid\": \"CPU" << cpu_id <<  "\","
+                << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << "},\n";
+        }
+    }
+
+    bool ring_buffer_verbose = false;
+    uint32_t my_pid = 0;
+    uint32_t my_tid = 0;
+    std::atomic<int> atom_gard{0};
+
+    void updateRingBuffer() {
+        // only one thread can enter
+        const int lock_value = atom_gard.exchange(1);
+        if (lock_value == 1) {
+            // has been locked, return;
+            return;
+        }
+
+        // only update when any ring-buffer is half loaded
+        bool need_update = false;
+        for(auto& ev : events) {
+            auto& mmap_meta = *ev.meta;
+            auto used_size = (mmap_meta.data_tail - mmap_meta.data_head) % mmap_meta.data_size;
+            if (used_size > (mmap_meta.data_size >> 1)) {
+                need_update = true;
+                break;
+            }
+        }
+
+        if (!need_update) {
+            // unlock
+            atom_gard.exchange(0);
+            return;
+        }
+
+        for(auto& ev : events) {
+            auto& mmap_meta = *ev.meta;
+            uint64_t head0 = mmap_meta.data_tail;
+            uint64_t head1 = mmap_meta.data_head;
+            //printf("ring-buffer@end: %lu~%lu %llu %llu %llu\n", head0, head1, group_meta.data_tail, group_meta.data_offset, group_meta.data_size);
+
+            if (head0 != head1) {
+                if (ring_buffer_verbose) {
+                    printf("PERF_RECORD_SWITCH = %d\n", PERF_RECORD_SWITCH);
+                    printf("PERF_RECORD_SWITCH_CPU_WIDE = %d\n", PERF_RECORD_SWITCH_CPU_WIDE);
+                    printf("PERF_RECORD_MISC_SWITCH_OUT = %d\n", PERF_RECORD_MISC_SWITCH_OUT);
+                    printf("PERF_RECORD_MISC_SWITCH_OUT_PREEMPT  = %d\n", PERF_RECORD_MISC_SWITCH_OUT_PREEMPT);
+                }
+
+                while(head0 < head1) {
+                    auto h0 = head0;
+                    auto type = read_ring_buffer<__u32>(mmap_meta, head0);
+                    auto misc = read_ring_buffer<__u16>(mmap_meta, head0);
+                    auto size = read_ring_buffer<__u16>(mmap_meta, head0);
+                    uint32_t next_prev_pid = 0, next_prev_tid = 0;
+                    if (type == PERF_RECORD_SWITCH_CPU_WIDE) {
+                        // previous PID/TID if switching-in
+                        // next PID/TID if switching-out
+                        next_prev_pid = read_ring_buffer<__u32>(mmap_meta, head0);
+                        next_prev_tid = read_ring_buffer<__u32>(mmap_meta, head0);
+                    }
+                    auto pid = read_ring_buffer<__u32>(mmap_meta, head0);
+                    auto tid = read_ring_buffer<__u32>(mmap_meta, head0);
+                    auto time = read_ring_buffer<uint64_t>(mmap_meta, head0);
+                    auto cpu = read_ring_buffer<__u32>(mmap_meta, head0);
+                    auto reserved0 = read_ring_buffer<__u32>(mmap_meta, head0);
+                    (void)reserved0;
+                    (void)next_prev_pid;
+                    (void)pid;
+
+                    // skip idle process (with TID 0)
+                    if (tid > 0 && ring_buffer_verbose) {
+                        printf("event: %lu/%lu\ttype,misc,size=(%u,%u,%u) cpu%u,next_prev_tid=%u,tid=%u  time:(%lu), (+%lu)\n",
+                            h0, head1,
+                            type, misc, size,
+                            cpu, next_prev_tid, tid,
+                            time,
+                            time - ev.ctx_last_time);
+                    }
+
+                    if (type == PERF_RECORD_SWITCH_CPU_WIDE && tid > 0) {
+                        if (misc & PERF_RECORD_MISC_SWITCH_OUT || misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) {
+                            // switch out
+                            // generate a log
+                            all_dump_data.emplace_back();
+                            auto* pd = &all_dump_data.back();
+                            pd->tid = tid;
+                            pd->cpu = cpu;
+                            pd->preempt = (misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT);
+                            //printf("ctx_switch_in_time=%lu\n", ctx_switch_in_time);
+                            pd->tsc_start = ev.ctx_switch_in_time;
+                            pd->tsc_end = time;
+
+                            if (ring_buffer_verbose) printf("\t  cpu: %u tid: %u  %lu (+%lu)\n", cpu, tid, ev.ctx_switch_in_time, time-ev.ctx_switch_in_time);
+
+                            ev.ctx_switch_in_time = 0;
+                        } else {
+                            // switch in
+                            ev.ctx_switch_in_time = time;
+                            ev.ctx_switch_in_tid = tid;
+                        }
+                    }
+
+                    ev.ctx_last_time = time;
+                    head0 += size - (head0 - h0);
+                }
+
+                if (head0 != head1) {
+                    printf("head0(%lu) != head1(%lu)\n", head0, head1);
+                    abort();
+                }
+
+                // update tail so kernel can keep generate event records
+                mmap_meta.data_tail = head0;
+                std::atomic_thread_fence(std::memory_order_seq_cst);
+            }
+        }
+        atom_gard.exchange(0);
+    }
+
+    static PerfEventCtxSwitch& get() {
+        static PerfEventCtxSwitch inst;
+        return inst;
+    }
+};
+
+/*
+RAW HARDWARE EVENT DESCRIPTOR
+       Even when an event is not available in a symbolic form within perf right now, it can be encoded in a per processor specific way.
+
+       For instance For x86 CPUs NNN represents the raw register encoding with the layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3B: System Programming Guide] Figure 30-1
+       Layout of IA32_PERFEVTSELx MSRs) or AMD’s PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344, Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
+
+       Note: Only the following bit fields can be set in x86 counter registers: event, umask, edge, inv, cmask. Esp. guest/host only and OS/user mode flags must be setup using EVENT MODIFIERS.
+
+ event 7:0
+ umask 15:8
+ edge  18
+ inv   23
+ cmask 31:24
+*/
+#define X86_RAW_EVENT(EventSel, UMask, CMask) ((CMask << 24) | (UMask << 8) | (EventSel))
+
+struct PerfEventGroup : public IPerfEventDumper {
+    int group_fd = -1;
+    uint64_t read_format;
+
+    struct event {
+        int fd = -1;
+        uint64_t id = 0;
+        uint64_t pmc_index = 0;
+        perf_event_mmap_page* pmeta = nullptr;
+        std::string name = "?";
+        char format[32];
+    };
+    std::vector<event> events;
+
+    uint64_t read_buf[512]; // 4KB
+    uint64_t time_enabled;
+    uint64_t time_running;
+    uint64_t pmc_width;
+    uint64_t pmc_mask;
+    uint64_t values[32];
+    uint32_t tsc_time_shift;
+    uint32_t tsc_time_mult;
+
+    // ref_cpu_cycles even id
+    // this event is fixed function counter provided by most x86 CPU
+    // and it provides TSC clock which is:
+    //    - very high-resolution (<1ns or >1GHz)
+    //    - independent of CPU-frequency throttling
+    int ref_cpu_cycles_evid = -1;
+    int sw_task_clock_evid = -1;
+    int hw_cpu_cycles_evid = -1;
+    int hw_instructions_evid = -1;
+
+    struct ProfileData {
+        uint64_t tsc_start;
+        uint64_t tsc_end;
+        std::string title;
+        const char * cat;
+        int32_t id;
+        static const int data_size = 16; // 4(fixed) + 8(PMU) + 4(software)
+        uint64_t data[data_size] = {0};
+        // f/i/u/p
+        char extra_data_type[data_size] = {0};
+        union {
+            double f;
+            int64_t i;
+            void * p;
+        } extra_data[data_size];
+
+        template<typename T>
+        char get_extra_type(T t) {
+            if (std::is_pointer<T>::value) return 'p';
+            if (std::is_floating_point<T>::value) return 'f';
+            if (std::is_integral<T>::value) return 'i';
+            return '\0';
+        }
+        template<typename T>
+        void set_extra_data(int i, T* t) { extra_data[i].p = t; }
+        void set_extra_data(int i, float t) { extra_data[i].f = t; }
+        void set_extra_data(int i, double t) { extra_data[i].f = t; }
+        template<typename T>
+        void set_extra_data(int i, T t) {
+            static_assert(std::is_integral<T>::value);
+            extra_data[i].i = t;
+        }
+
+        template <typename ... Values>
+        void set_extra_data(Values... vals) {
+            static_assert(data_size >= sizeof...(vals));
+            int j = 0;
+            int unused1[] = { 0, (set_extra_data(j++, vals), 0)... };
+            (void)unused1;
+            j = 0;
+            int unused2[] = { 0, (extra_data_type[j++] = get_extra_type(vals), 0)... };
+            (void)unused2;
+            extra_data_type[j] = '\0';
+        }
+
+        ProfileData(const std::string& title) : title(title) {
+            start();
+        }
+        void start() {
+            tsc_start = get_time_ns();
+        }
+        void stop() {
+            tsc_end = get_time_ns();
+        }
+    };
+
+    bool enable_dump_json = false;
+    int64_t dump_limit = 0;
+    std::deque<ProfileData> all_dump_data;
+    int serial;
+
+    using CallBackEventArgsSerializer = std::function<void(std::ostream& fw, double usec, uint64_t* counters)>;
+    CallBackEventArgsSerializer fn_evt_args_serializer;
+
+    void dump_json(std::ofstream& fw, TscCounter& tsc) override {
+        static std::atomic_uint64_t async_evid{0};
+        if (!enable_dump_json)
+            return;
+        auto data_size = all_dump_data.size();
+        if (!data_size)
+            return;
+
+        for (auto& d : all_dump_data) {
+            auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end);
+            auto title = std::string(d.title) + "_" + std::to_string(d.id);
+            auto cat = d.cat;
+            //auto pid = serial;
+            auto start = tsc.tsc_to_usec(d.tsc_start);
+            //auto end = tsc.tsc_to_usec(d.tsc_end);
+
+            if (d.id < 0) {
+                // async events
+                // {"cat": "foo", "name": "async_read2", "pid": 4092243, "id": 4092246, "ph": "b", "ts": 23819.718},
+                fw << "{\"ph\": \"b\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\","
+                    << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << ","
+                    << "\"ts\": " << std::setprecision (15) << start << "},";
+
+                fw << "{\"ph\": \"e\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\","
+                    << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << ","
+                    << "\"ts\": " << std::setprecision (15) << tsc.tsc_to_usec(d.tsc_end) << ",";
+            } else {
+                fw << "{\"ph\": \"X\", \"name\": \"" << title << "\", \"cat\":\"" << cat << "\","
+                    << "\"pid\": " << my_pid << ", \"tid\": " << my_tid << ","
+                    << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << ",";
+            }
+
+            fw << "\"args\":{";
+            {
+                std::stringstream ss;
+                if (fn_evt_args_serializer)
+                    fn_evt_args_serializer(ss, duration, d.data);
+                if (sw_task_clock_evid >= 0) {
+                    // PERF_COUNT_SW_TASK_CLOCK in nano-seconds
+                    ss << "\"CPU Usage\":" << (d.data[sw_task_clock_evid] * 1e-3)/duration << ",";
+                }
+                if (hw_cpu_cycles_evid >= 0) {
+                    if (sw_task_clock_evid >= 0 && d.data[sw_task_clock_evid] > 0) {
+                        ss << "\"CPU Freq(GHz)\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])/d.data[sw_task_clock_evid] << ",";
+                    } else {
+                        ss << "\"CPU Freq(GHz)\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])*1e-3/duration << ",";
+                    }
+                    if (hw_instructions_evid >= 0 && d.data[hw_instructions_evid] > 0) {
+                        ss << "\"CPI\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])/d.data[hw_instructions_evid] << ",";
+                    }
+                }
+                auto prev_locale = ss.imbue(std::locale(""));
+                const char * sep = "";
+                for(size_t i = 0; i < events.size() && i < d.data_size; i++) {
+                    ss << sep << "\"" << events[i].name << "\":\"" << d.data[i] << "\"";
+                    sep = ",";
+                }
+                ss.imbue(prev_locale);
+                if (d.extra_data_type[0] != 0) {
+                    sep = "";
+                    ss << ",\"Extra Data\":[";
+                    for(size_t i = 0; i < d.data_size && (d.extra_data_type[i] != 0); i++) {
+                        if (d.extra_data_type[i] == 'f') ss << sep << d.extra_data[i].f;
+                        else if (d.extra_data_type[i] == 'i') ss << sep << d.extra_data[i].i;
+                        else if (d.extra_data_type[i] == 'p') ss << sep << "\"" << d.extra_data[i].p << "\"";
+                        else ss << sep << "\"?\"";
+                        sep = ",";
+                    }
+                    ss << "]";
+                }
+                fw << ss.str();
+            }
+            fw << "}},\n";
+        }
+        all_dump_data.clear();
+        std::cout << LINUX_PERF_"#" << serial << "(" << this << ") finalize: dumpped " << data_size << std::endl;
+    }
+
+    uint64_t operator[](size_t i) {
+        if (i < events.size()) {
+            return values[i];
+        } else {
+            printf(LINUX_PERF_"PerfEventGroup: operator[] with index %lu oveflow (>%lu)\n", i, events.size());
+            abort();
+        }
+        return 0;
+    }
+    
+    PerfEventGroup() = default;
+
+    struct Config {
+        uint32_t type;
+        uint64_t config;
+        const char * name;
+        Config(uint32_t type, uint64_t config, const char * name = "?") : type(type), config(config), name(name) {}
+    };
+
+    uint32_t my_pid = 0;
+    uint32_t my_tid = 0;
+
+    PerfEventGroup(const std::vector<Config> type_configs, CallBackEventArgsSerializer fn = {}) : fn_evt_args_serializer(fn) {
+        for(auto& tc : type_configs) {
+            if (tc.type == PERF_TYPE_SOFTWARE) {
+                add_sw(tc.config);
+            }
+            if (tc.type == PERF_TYPE_HARDWARE) {
+                add_hw(tc.config);
+            }
+            if (tc.type == PERF_TYPE_RAW) {
+                add_raw(tc.config);
+            }
+            events.back().name = tc.name;
+            snprintf(events.back().format, sizeof(events.back().format), "%%%lulu, ", strlen(tc.name));
+        }
+
+        // env var defined raw events
+        for (auto raw_cfg : PerfRawConfig::get().raw_configs) {
+            add_raw(raw_cfg.second);
+            events.back().name = raw_cfg.first;
+        }
+
+        dump_limit = PerfRawConfig::get().dump;
+        enable_dump_json = PerfRawConfig::get().dump_on_cpu(sched_getcpu());
+        serial = 0;
+        if (enable_dump_json) {
+            serial = PerfEventJsonDumper::get().register_manager(this);
+        }
+        my_pid = getpid();
+        my_tid = gettid();
+
+        enable();
+    }
+
+    ~PerfEventGroup() {
+        if (enable_dump_json)
+            PerfEventJsonDumper::get().finalize();
+        disable();
+        for(auto & ev : events) {
+            close(ev.fd);
+        }
+    }
+
+    void show_header() {
+        std::stringstream ss;
+        ss << "\e[33m";
+        ss << "#" << serial << ":";
+        for(auto& ev : events) {
+            ss << ev.name << ", ";
+        }
+        ss << "\e[0m\n";
+        std::cout << ss.str();
+    }
+
+    void add_raw(uint64_t config, bool pinned=false) {
+        perf_event_attr pea;
+        memset(&pea, 0, sizeof(struct perf_event_attr));
+        pea.type = PERF_TYPE_RAW;
+        pea.size = sizeof(struct perf_event_attr);
+        pea.config = config;
+        pea.disabled = 1;
+        pea.exclude_kernel = 1;
+        pea.exclude_hv = 1;
+        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
+        if (pinned && group_fd == -1) {
+            // pinned: It applies only to hardware counters and only to group leaders
+            pea.pinned = 1;
+        }
+        if (group_fd == -1) {
+            pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+        }
+        add(&pea);
+    }
+
+    void add_hw(uint64_t config, bool pinned=false) {
+        perf_event_attr pea;
+        memset(&pea, 0, sizeof(struct perf_event_attr));
+        pea.type = PERF_TYPE_HARDWARE;
+        pea.size = sizeof(struct perf_event_attr);
+        pea.config = config;
+        pea.disabled = 1;
+        pea.exclude_kernel = 1;
+        pea.exclude_hv = 1;
+        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
+        if (pinned && group_fd == -1) {
+            // pinned: It applies only to hardware counters and only to group leaders
+            pea.pinned = 1;
+        }
+        if (group_fd == -1) {
+            pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
+        }
+        add(&pea);
+    }
+
+    void add_sw(uint64_t config) {
+        perf_event_attr pea;
+        memset(&pea, 0, sizeof(struct perf_event_attr));
+        pea.type = PERF_TYPE_SOFTWARE;
+        pea.size = sizeof(struct perf_event_attr);
+        pea.config = config;
+        pea.disabled = 1;
+        pea.exclude_kernel = 0; // some SW events are counted as kernel
+        pea.exclude_hv = 1;
+        //pea.pinned = 1;   //sw event cannot set pinned!!!
+        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID ;
+        add(&pea);
+    }
+
+    void add(perf_event_attr* pev_attr, pid_t pid = 0, int cpu = -1) {
+        event ev;
+
+        size_t mmap_length = sysconf(_SC_PAGESIZE) * 1;
+        // clockid must consistent within group
+        pev_attr->use_clockid = 1;
+        // can be synched with clock_gettime(CLOCK_MONOTONIC_RAW)
+        pev_attr->clockid = CLOCK_MONOTONIC_RAW;
+
+        RETRY:
+        ev.fd = perf_event_open(pev_attr, pid, cpu, group_fd, 0);
+        if (ev.fd < 0) {
+            if (!pev_attr->exclude_kernel) {
+                printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) with exclude_kernel=0 failed (due to /proc/sys/kernel/perf_event_paranoid is 2),  set exclude_kernel=1 and retry...\n",
+                       pev_attr->type, pev_attr->config);
+                pev_attr->exclude_kernel = 1;
+                goto RETRY;
+            } else {
+                printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) failed", pev_attr->type, pev_attr->config);
+                perror("");
+                abort();
+            }
+        }
+        ioctl(ev.fd, PERF_EVENT_IOC_ID, &ev.id);
+
+        ev.pmeta = reinterpret_cast<perf_event_mmap_page*>(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ev.fd, 0));
+        if (ev.pmeta == MAP_FAILED) {
+            perror(LINUX_PERF_"mmap perf_event_mmap_page failed:");
+            close(ev.fd);
+            abort();
+        }
+
+        if (group_fd == -1) {
+            group_fd = ev.fd;
+            read_format = pev_attr->read_format;
+        }
+        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_REF_CPU_CYCLES) {
+            ref_cpu_cycles_evid = events.size();
+        }
+        if (pev_attr->type == PERF_TYPE_SOFTWARE && pev_attr->config == PERF_COUNT_SW_TASK_CLOCK) {
+            sw_task_clock_evid = events.size();
+        }
+        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+            hw_cpu_cycles_evid = events.size();
+        }
+        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_INSTRUCTIONS) {
+            hw_instructions_evid = events.size();
+        }
+        //printf("perf_event_open : fd=%d, id=%lu\n", ev.fd, ev.id);
+
+        events.push_back(ev);
+    }
+
+    bool event_group_enabled = false;
+    uint32_t num_events_no_pmc;
+
+    void enable() {
+        if (event_group_enabled)
+            return;
+        ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
+        ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
+        // PMC index is only valid when being enabled
+        num_events_no_pmc = 0;
+        for(auto& ev : events) {
+            if (ev.pmc_index == 0 && ev.pmeta->cap_user_rdpmc) {
+                uint32_t seqlock;
+                do {
+                    seqlock = ev.pmeta->lock;
+                    std::atomic_thread_fence(std::memory_order_seq_cst);
+                    ev.pmc_index = ev.pmeta->index;
+                    pmc_width = ev.pmeta->pmc_width;
+                    pmc_mask = 1;
+                    pmc_mask = (pmc_mask << pmc_width) - 1;
+                    if (ev.pmeta->cap_user_time) {
+                        tsc_time_shift = ev.pmeta->time_shift;
+                        tsc_time_mult = ev.pmeta->time_mult;
+                        //printf("time: %u,%u\n", tsc_time_shift, tsc_time_mult);
+                    }
+                    std::atomic_thread_fence(std::memory_order_seq_cst);
+                } while (ev.pmeta->lock != seqlock || (seqlock & 1));
+            }
+            // some events like PERF_TYPE_SOFTWARE cannot read using rdpmc()
+            if (ev.pmc_index == 0)
+                num_events_no_pmc ++;
+        }
+        event_group_enabled = true;
+    }
+
+    uint64_t tsc2nano(uint64_t cyc) {
+        uint64_t quot, rem;
+        quot  = cyc >> tsc_time_shift;
+        rem   = cyc & (((uint64_t)1 << tsc_time_shift) - 1);
+        return quot * tsc_time_mult + ((rem * tsc_time_mult) >> tsc_time_shift);
+    }
+
+    void disable() {
+        if (!event_group_enabled)
+            return;
+
+        ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+        for(auto& ev : events) {
+            ev.pmc_index = 0;
+        }
+        event_group_enabled = false;
+    }
+
+    uint64_t rdpmc(int i, uint64_t base = 0) {
+        return (_rdpmc(events[i].pmc_index - 1) - base) & pmc_mask;
+    }
+
+    template<class FN>
+    std::vector<uint64_t> rdpmc(FN fn, std::string name = {}, int64_t loop_cnt = 0, std::function<void(uint64_t, uint64_t*, char*&)> addinfo = {}) {
+        int cnt = events.size();
+        std::vector<uint64_t> pmc(cnt, 0);
+
+        bool use_pmc = (num_events_no_pmc == 0);
+        if (use_pmc) {
+            for(int i = 0; i < cnt; i++) {
+                if (events[i].pmc_index)
+                    pmc[i] = _rdpmc(events[i].pmc_index - 1);
+                else
+                    pmc[i] = 0;
+            }
+        } else {
+            read();
+            for(int i = 0; i < cnt; i++) {
+                pmc[i] = values[i];
+            }
+        }
+
+        auto tsc0 = __rdtsc();
+        fn();
+        auto tsc1 = __rdtsc();
+
+        if (use_pmc) {
+            for(int i = 0; i < cnt; i++) {
+                if (events[i].pmc_index)
+                    pmc[i] = (_rdpmc(events[i].pmc_index - 1) - pmc[i]) & pmc_mask;
+                else
+                    pmc[i] = 0;
+            }
+        } else {
+            read();
+            for(int i = 0; i < cnt; i++) {
+                pmc[i] -= values[i];
+            }
+        }
+
+        if (!name.empty()) {
+            char log_buff[1024];
+            char * log = log_buff;
+            log += sprintf(log, "\e[33m");
+            for(int i = 0; i < cnt; i++) {
+                log += sprintf(log, events[i].format, pmc[i]);
+            }
+            auto duration_ns = tsc2nano(tsc1 - tsc0);
+            
+            log += sprintf(log, "\e[0m [%16s] %.3f us", name.c_str(), duration_ns/1e3);
+            if (hw_cpu_cycles_evid >= 0) {
+                log += sprintf(log, " CPU:%.2f(GHz)", 1.0 * pmc[hw_cpu_cycles_evid] / duration_ns);
+                if (hw_instructions_evid >= 0) {
+                    log += sprintf(log, " CPI:%.2f", 1.0 * pmc[hw_cpu_cycles_evid] / pmc[hw_instructions_evid]);
+                }
+                if (loop_cnt > 0) {
+                    // cycles per kernel (or per-iteration)
+                    log += sprintf(log, " CPK:%.1fx%d", 1.0 * pmc[hw_cpu_cycles_evid] / loop_cnt, loop_cnt);
+                }
+            }
+            if (addinfo) {
+                addinfo(duration_ns, &pmc[0], log);
+            }
+            log += sprintf(log, "\n");
+            printf(log_buff);
+        }
+        return pmc;
+    }
+
+    void read(bool verbose = false) {
+        for(size_t i = 0; i < events.size(); i++) values[i] = 0;
+
+        if (::read(group_fd, read_buf, sizeof(read_buf)) == -1) {
+            perror(LINUX_PERF_"read perf event failed:");
+            abort();
+        }
+
+        uint64_t * readv = read_buf;
+        auto nr = *readv++;
+        if (verbose) printf("number of counters:\t%lu\n", nr);
+        time_enabled = 0;
+        time_running = 0;
+        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+            time_enabled = *readv++;
+            if (verbose) printf("time_enabled:\t%lu\n", time_enabled);
+        }
+        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+            time_running = *readv++;
+            if (verbose) printf("time_running:\t%lu\n", time_running);
+        }
+
+        for (size_t i = 0; i < nr; i++) {
+            auto value = *readv++;
+            auto id = *readv++;
+            for (size_t k = 0; k < events.size(); k++) {
+                if (id == events[k].id) {
+                    values[k] = value;
+                }
+            }
+        }
+
+        if (verbose) {
+            for (size_t k = 0; k < events.size(); k++) {
+                printf("\t[%lu]: %lu\n", k, values[k]);
+            }
+        }
+    }
+
+    //================================================================================
+    // profiler API with json_dump capability
+    struct ProfileScope {
+        PerfEventGroup* pevg = nullptr;
+        ProfileData* pd = nullptr;
+        bool do_unlock = false;
+        ProfileScope() = default;
+        ProfileScope(PerfEventGroup* pevg, ProfileData* pd, bool do_unlock = false) : pevg(pevg), pd(pd), do_unlock(do_unlock) {}
+
+        // Move only
+        ProfileScope(const ProfileScope&) = delete;
+        ProfileScope& operator=(const ProfileScope&) = delete;
+
+        ProfileScope(ProfileScope&& other) {
+            pevg = other.pevg;
+            pd = other.pd;
+            other.pevg = nullptr;
+            other.pd = nullptr;
+        }
+
+        ProfileScope& operator=(ProfileScope&& other) {
+            if (&other != this) {
+                pevg = other.pevg;
+                pd = other.pd;
+                other.pevg = nullptr;
+                other.pd = nullptr;
+            }
+
+            return *this;
+        }
+
+        uint64_t* finish() {
+            if (do_unlock) {
+                PerfEventGroup::get_sampling_lock() --;
+            }
+            if (!pevg || !pd)
+                return nullptr;
+
+            pd->stop();
+            bool use_pmc = (pevg->num_events_no_pmc == 0);
+            if (use_pmc) {
+                for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++)
+                    if (pevg->events[i].pmc_index)
+                        pd->data[i] = (_rdpmc(pevg->events[i].pmc_index - 1) - pd->data[i]) & pevg->pmc_mask;
+                    else
+                        pd->data[i] = 0;
+            } else {
+                pevg->read();
+                for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++)
+                    pd->data[i] = pevg->values[i] - pd->data[i];
+            }
+            pevg = nullptr;
+            return pd->data;
+        }
+
+        ~ProfileScope() {
+            finish();
+        }
+    };
+
+    ProfileData* _profile(const std::string& title, int id = 0) {
+        if (get_sampling_lock().load() != 0)
+            return nullptr;
+        if (dump_limit == 0)
+            return nullptr;
+        dump_limit --;
+
+        PerfEventCtxSwitch::get().updateRingBuffer();
+
+        all_dump_data.emplace_back(title);
+        auto* pd = &all_dump_data.back();
+        pd->cat = "enable";
+        pd->id = id;
+
+        // use rdpmc if possible
+        bool use_pmc = (num_events_no_pmc == 0);
+        if (use_pmc) {
+            for (size_t i =0; i < events.size() && i < pd->data_size; i++)
+                if (events[i].pmc_index)
+                    pd->data[i] = _rdpmc(events[i].pmc_index - 1);
+        } else {
+            read();
+            for (size_t i =0; i < events.size() && i < pd->data_size; i++)
+                pd->data[i] = values[i];
+        }
+
+        return pd;
+    }
+
+    static PerfEventGroup& get() {
+        thread_local PerfEventGroup pevg({
+            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "HW_CPU_CYCLES"},
+            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "HW_INSTRUCTIONS"},
+            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, "HW_CACHE_MISSES"},
+            //{PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, "HW_REF_CPU_CYCLES"},
+            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, "SW_CONTEXT_SWITCHES"},
+            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, "SW_TASK_CLOCK"},
+            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, "SW_PAGE_FAULTS"},
+
+            // XSNP_NONE                : ... were hits in L3 without snoops required                (data is not owned by any other core's local cache)
+            // XSNP_FWD   /XSNP_HITM    : ... were HitM responses from shared L3                     (data was exclusivly/dirty owned by another core's local cache)
+            // XSNP_NO_FWD/XSNP_HIT     : ... were L3 and cross-core snoop hits in on-pkg core cache (data was shared/clean in another core's local cache)
+
+            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x01, 0x00), "XSNP_MISS"},
+            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x02, 0x00), "XSNP_NO_FWD"},
+            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x04, 0x00), "XSNP_FWD"},
+            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x08, 0x00), "XSNP_NONE"},              
+        });
+        return pevg;
+    }
+
+    // this lock is global, affect all threads
+    static std::atomic_int& get_sampling_lock() {
+        static std::atomic_int sampling_lock{0};
+        return sampling_lock;
+    }
+};
+
+using ProfileScope = PerfEventGroup::ProfileScope;
+
+#if 1
+// pwe-thread event group with default events pre-selected
+template <typename ... Args>
+ProfileScope Profile(const std::string& title, int id = 0, Args&&... args) {
+    auto& pevg = PerfEventGroup::get();
+    auto* pd = pevg._profile(title, id);
+    if (pd) {
+        pd->set_extra_data(std::forward<Args>(args)...);
+    }
+    return {&pevg, pd};
+}
+
+// overload accept sampling_probability, which can be used to disable profile in scope 
+template <typename ... Args>
+ProfileScope Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) {
+    auto& pevg = PerfEventGroup::get();
+    auto* pd = pevg._profile(title, id);
+    if (pd) {
+        pd->set_extra_data(std::forward<Args>(args)...);
+    }
+
+    bool disable_profile = ((std::rand() % 1000)*0.001f >= sampling_probability);
+    if (disable_profile) {
+        PerfEventGroup::get_sampling_lock() ++;
+    }
+    return {&pevg, pd, disable_profile};
+}
+
+inline int Init() {
+    // this is for capture all context switching events
+    PerfEventCtxSwitch::get();
+
+    // this is for making main threads the first process
+    auto dummy = Profile("start");
+    return 0;
+}
+
+#else
+
+template <typename ... Args>
+int Profile(const std::string& title, int id = 0, Args&&... args) {
+    return 0;
+}
+
+// overload accept sampling_probability, which can be used to disable profile in scope 
+template <typename ... Args>
+int Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) {
+    return 0;
+}
+
+inline int Init() {
+    return 0;
+}
+
+#endif
+
+} // namespace LinuxPerf

From f29f7d0f005551208686d47f6dc49f4c413909ab Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Thu, 19 Dec 2024 09:27:01 +0100
Subject: [PATCH 02/13] move StatefulSDPAFusion before CommonOptimizations

---
 .../transformations/transformation_pipeline.cpp    | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 4d7df9a335e98a..dee31df767daa0 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -431,6 +431,8 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
         ov::pass::KeepConstAndDecompression);
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion);
+    CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
+    CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
     CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true);
     CPU_SET_CALLBACK_X64(
@@ -654,16 +656,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE);
-    CPU_SET_CALLBACK_COMMON(
-        manager,
-        [this](const_node_ptr& node) -> bool {
-            std::string errorMsg;
-            // Current SDPA impl is optimized only for LLM models, so we decompose it for others to avoid perf
-            // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes.
-            return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) &&
-                   model->get_variables().size() > 0;
-        },
-        ov::pass::ScaledDotProductAttentionDecomposition);
 
     // List of enabled/disabled transformations
 
@@ -946,8 +938,6 @@ void Transformations::PostLpt() {
 #endif  // OPENVINO_ARCH_X86_64
 
     CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward);
-    CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion);
-    CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape);
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false);
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm);
     CPU_SET_CALLBACK_X64(

From 9208d96c3a813c75103ae7e5b4b01d591eb25f49 Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Fri, 20 Dec 2024 02:12:40 +0100
Subject: [PATCH 03/13] add env for test

---
 .../intel_cpu/src/nodes/linux_perf.hpp        |  2 +-
 .../transformation_pipeline.cpp               | 40 ++++++++++++++++++-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
index fa9498fab70e81..f3c3e4304ec3da 100644
--- a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
+++ b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
@@ -1183,7 +1183,7 @@ struct PerfEventGroup : public IPerfEventDumper {
 
 using ProfileScope = PerfEventGroup::ProfileScope;
 
-#if 1
+#if 0
 // pwe-thread event group with default events pre-selected
 template <typename ... Args>
 ProfileScope Profile(const std::string& title, int id = 0, Args&&... args) {
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index dee31df767daa0..164568c3b51188 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -431,8 +431,12 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
         ov::pass::KeepConstAndDecompression);
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion);
-    CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
-    CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
+    auto p = std::getenv("USE_OLD");
+    bool use_old = p && p[0] == '1';
+    if (!use_old) {
+        CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
+        CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
+    }
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
     CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true);
     CPU_SET_CALLBACK_X64(
@@ -656,6 +660,18 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE);
+    if (use_old) {
+        CPU_SET_CALLBACK_COMMON(
+            manager,
+            [this](const_node_ptr& node) -> bool {
+                std::string errorMsg;
+                // Current SDPA impl is optimized only for LLM models, so we decompose it for others to avoid perf
+                // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes.
+                return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) &&
+                    model->get_variables().size() > 0;
+            },
+            ov::pass::ScaledDotProductAttentionDecomposition);
+    }
 
     // List of enabled/disabled transformations
 
@@ -938,6 +954,12 @@ void Transformations::PostLpt() {
 #endif  // OPENVINO_ARCH_X86_64
 
     CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward);
+    auto p = std::getenv("USE_OLD");
+    bool use_old = p && p[0] == '1';
+    if (use_old) {
+        CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion);
+        CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape);
+    }
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false);
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm);
     CPU_SET_CALLBACK_X64(
@@ -960,6 +982,20 @@ void Transformations::PostLpt() {
     symbolic_pipeline->get_manager()->register_pass<NgramFusion>();
 
     postLPTPassManager.run_passes(model);
+    p = std::getenv("CHECK_SDPA");
+    bool check_sdpa = p && p[0] == '1';
+    if (check_sdpa) {
+        size_t count = 0;
+        for (auto&& node : model->get_ordered_ops()) {
+            if (node->get_type_name() == std::string("ScaledDotProductAttentionWithKVCache")) {
+                count++;
+            }
+        }
+        // char buf[128] = {0};
+        // sprintf(buf, "KVCACHE=%ld", count);
+        // std::cout << buf << std::endl;
+        setenv("KVCACHE", std::to_string(count).c_str(), true);
+    }
 }
 
 void Transformations::MainSnippets(void) {

From 5323836f13d36d4c37f83e323ce1ac223ad468db Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Mon, 23 Dec 2024 08:25:14 +0100
Subject: [PATCH 04/13] add dependent transformations

---
 .../transformations/utils/gen_pattern.hpp     | 97 ++++++++++++++++---
 .../common/pass/stateful_sdpa_fusion.cpp      | 39 ++++++--
 .../common/pass/stateful_sdpa_fusion.hpp      |  7 ++
 .../transformation_pipeline.cpp               |  5 +-
 4 files changed, 123 insertions(+), 25 deletions(-)

diff --git a/src/common/transformations/include/transformations/utils/gen_pattern.hpp b/src/common/transformations/include/transformations/utils/gen_pattern.hpp
index 976561b4844a17..215825d2cd13eb 100644
--- a/src/common/transformations/include/transformations/utils/gen_pattern.hpp
+++ b/src/common/transformations/include/transformations/utils/gen_pattern.hpp
@@ -40,6 +40,14 @@ namespace gen_pattern {
 
 #ifdef CPU_DEBUG_CAPS
 
+#    ifdef __GNUC__
+#        define CURRENT_LINE_NO __builtin_LINE()
+#        define CURRENT_FILE    __builtin_FILE()
+#    else
+#        define CURRENT_LINE_NO -1
+#        define CURRENT_FILE    ""
+#    endif
+
 template <typename... Args>
 static inline void _verbose_log(Args&&... args) {
     std::stringstream ss;
@@ -58,6 +66,10 @@ static bool matcher_verbose_enabled() {
         if (matcher_verbose_enabled()) \
         _verbose_log(__VA_ARGS__)
 #else
+
+#    define CURRENT_LINE_NO -1
+#    define CURRENT_FILE    ""
+
 static bool matcher_verbose_enabled() {
     return false;
 }
@@ -181,6 +193,8 @@ class Symbol {
         double literal_const_value;
         std::shared_ptr<Entity> lhs;
         std::shared_ptr<Entity> rhs;
+        const char* filename = "";
+        int line_no = -1;
         // _,+,-,*,/
         // l : literal const
         // n : named symbol
@@ -220,10 +234,12 @@ class Symbol {
         entity->op = 'n';
         entity->name = name;
     }
-    Symbol(const int value) {
+    Symbol(const int value, int line_no = CURRENT_LINE_NO, const char* file = CURRENT_FILE) {
         entity = std::make_shared<Entity>();
         entity->op = 'l';
         entity->literal_const_value = value;
+        entity->line_no = line_no;
+        entity->filename = file;
     }
     Symbol(char op, const Symbol& lhs, const Symbol& rhs) {
         entity = std::make_shared<Entity>();
@@ -246,8 +262,12 @@ class Symbol {
     void* get_id() const {
         return entity.get();
     }
-    const char* get_name() const {
-        return entity->name;
+    std::string get_name() const {
+        if (entity->line_no == -1 || is_independent_var())
+            return entity->name;
+        auto filename = strrchr(entity->filename, '/') ? strrchr(entity->filename, '/') + 1 : entity->filename;
+        std::string name(filename);  // use filename:lineno instead
+        return name + ":" + std::to_string(entity->line_no);
     }
     bool operator<(const Symbol& rhs) const {
         return get_id() < rhs.get_id();
@@ -739,7 +759,9 @@ class GenericPattern : public ov::pass::pattern::op::Pattern {
     explicit GenericPattern(const DiscreteTypeInfo& type_info,
                             const OutputVector& args,
                             const detail::AttrMap& attrs,
-                            const char* vt)
+                            const char* vt,
+                            const int line_no = -1,
+                            const char* file = "")
         : ov::pass::pattern::op::Pattern(args),
           m_type_info(type_info),
           m_attrs(attrs),
@@ -758,6 +780,12 @@ class GenericPattern : public ov::pass::pattern::op::Pattern {
                 sep = ",";
             }
             ss << ")";
+            if (line_no != -1) {
+                // add the code line no to the log:
+                //   O P752<opset1::Multiply>(P736,P745)@fuse_rotary_positional_embeddings.cpp:551 vs ...
+                auto filename = strrchr(file, '/') ? strrchr(file, '/') + 1 : file;
+                ss << "@" << filename << ":" << line_no;
+            }
             m_signature = ss.str();
             set_friendly_name(std::string("P") + std::to_string(id));
         }
@@ -776,7 +804,13 @@ class GenericPattern : public ov::pass::pattern::op::Pattern {
         // strictly requires pattern & graph value to come from output port with same index,
         // this is absolute necessary when pattern contains split node connections.
         if (pattern_value.get_index() != graph_value.get_index()) {
-            _VERBOSE_LOG(level, "X output index mismatch: ", pattern_value.get_index(), "!=", graph_value.get_index());
+            _VERBOSE_LOG(level,
+                         "X output index mismatch:(",
+                         m_signature,
+                         "): ",
+                         pattern_value.get_index(),
+                         "!=",
+                         graph_value.get_index());
             return false;
         }
 
@@ -1018,7 +1052,9 @@ template <class T>
 std::shared_ptr<Node> makePattern(const std::vector<detail::PatternNode>& inputs,
                                   detail::AttrMap attrmap = {},
                                   const char* vt = nullptr,
-                                  const char* friendly_name = nullptr) {
+                                  const char* friendly_name = nullptr,
+                                  int line_no = CURRENT_LINE_NO,
+                                  const char* file = CURRENT_FILE) {
     OutputVector args;
     for (auto& in : inputs)
         args.push_back(in.get_output());
@@ -1026,7 +1062,8 @@ std::shared_ptr<Node> makePattern(const std::vector<detail::PatternNode>& inputs
     // pattern nodes are better for pattern matching because
     //  - it can be generic/incomplete, so normal OP node is not working properly
     //  - it has predicate to correctly decide which branch to take (in Or pattern)
-    auto pattern_node = std::make_shared<detail::GenericPattern>(T::get_type_info_static(), args, attrmap, vt);
+    auto pattern_node =
+        std::make_shared<detail::GenericPattern>(T::get_type_info_static(), args, attrmap, vt, line_no, file);
 
     if (friendly_name)
         pattern_node->set_friendly_name(friendly_name);
@@ -1120,7 +1157,9 @@ inline std::shared_ptr<Node> GenStridedSlice(detail::PatternNode data,
                                              detail::PatternNode start,
                                              detail::PatternNode stop,
                                              detail::PatternNode step,
-                                             size_t axis) {
+                                             size_t axis,
+                                             int line_no = CURRENT_LINE_NO,
+                                             const char* file = CURRENT_FILE) {
     std::vector<int64_t> begin_mask(axis + 1, 1);
     std::vector<int64_t> end_mask(axis + 1, 1);
     std::vector<int64_t> new_axis_mask;
@@ -1135,12 +1174,27 @@ inline std::shared_ptr<Node> GenStridedSlice(detail::PatternNode data,
                                                    {"end_mask", end_mask},
                                                    {"new_axis_mask", new_axis_mask},
                                                    {"shrink_axis_mask", shrink_axis_mask},
-                                                   {"ellipsis_mask", ellipsis_mask}});
+                                                   {"ellipsis_mask", ellipsis_mask}},
+                                                  nullptr,
+                                                  nullptr,
+                                                  line_no,
+                                                  file);
     return opt2;
 }
 
-inline std::shared_ptr<Node> GenSlice(detail::PatternNode data, Symbol start, Symbol stop, Symbol step, size_t axis) {
-    auto opt1 = makePattern<opset8::Slice>({data, {start}, {stop}, {step}, {static_cast<int>(axis)}});
+inline std::shared_ptr<Node> GenSlice(detail::PatternNode data,
+                                      Symbol start,
+                                      Symbol stop,
+                                      Symbol step,
+                                      size_t axis,
+                                      int line_no = CURRENT_LINE_NO,
+                                      const char* file = CURRENT_FILE) {
+    auto opt1 = makePattern<opset8::Slice>({data, {start}, {stop}, {step}, {static_cast<int>(axis)}},
+                                           {},
+                                           nullptr,
+                                           nullptr,
+                                           line_no,
+                                           file);
 
     std::vector<Symbol> vbegin(axis + 1, Symbol(0));
     std::vector<Symbol> vend(axis + 1, Symbol(0));
@@ -1168,7 +1222,11 @@ inline std::shared_ptr<Node> GenSlice(detail::PatternNode data, Symbol start, Sy
                                                    {"end_mask", end_mask},
                                                    {"new_axis_mask", new_axis_mask},
                                                    {"shrink_axis_mask", shrink_axis_mask},
-                                                   {"ellipsis_mask", ellipsis_mask}});
+                                                   {"ellipsis_mask", ellipsis_mask}},
+                                                  nullptr,
+                                                  nullptr,
+                                                  line_no,
+                                                  file);
     return opt1 | opt2;
 }
 
@@ -1329,7 +1387,9 @@ class PatternValidator {
                 auto id = sym.get_id();
                 if (symbol_value_map.count(id)) {
                     if (symbol_value_map[id] != value) {
-                        _VERBOSE_LOG(" in-consistency between multiple references of same symbol : ",
+                        _VERBOSE_LOG(" in-consistency between multiple references of same symbol(",
+                                     sym.get_name(),
+                                     "): ",
                                      symbol_value_map[id],
                                      " != ",
                                      value);
@@ -1345,7 +1405,12 @@ class PatternValidator {
             if (sym.is_literal_const()) {
                 auto literal = sym.eval(symbol_value_map);
                 if (literal != value) {
-                    _VERBOSE_LOG(" mismatch between literal symbol & value : ", literal, " != ", value);
+                    _VERBOSE_LOG(" mismatch between literal symbol & value(",
+                                 sym.get_name(),
+                                 "): ",
+                                 literal,
+                                 " != ",
+                                 value);
                     return false;
                 }
                 // no need to put literal into value map to eval them.
@@ -1373,7 +1438,9 @@ class PatternValidator {
                     }
                 }
                 if (!is_match) {
-                    _VERBOSE_LOG(" mismatch between derived & value : ",
+                    _VERBOSE_LOG(" mismatch between derived & value(",
+                                 sym.get_name(),
+                                 "): ",
                                  std::setprecision(std::numeric_limits<float>::max_digits10),
                                  derived,
                                  " != ",
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 447adb0b2fe23f..08b5ec14f32e1c 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -12,6 +12,7 @@
 #include <openvino/opsets/opset13.hpp>
 #include <openvino/opsets/opset6.hpp>
 #include <openvino/opsets/opset8.hpp>
+#include <openvino/pass/manager.hpp>
 #include <openvino/pass/pattern/op/or.hpp>
 #include <openvino/pass/pattern/op/wrap_type.hpp>
 #include <transformations/utils/gen_pattern.hpp>
@@ -20,7 +21,11 @@
 #include "itt.hpp"
 #include "openvino/opsets/opset1.hpp"
 #include "ov_ops/type_relaxed.hpp"
+#include "transformations/common_optimizations/simplify_shape_of_sub_graph.hpp"
 #include "transformations/cpu_opset/common/op/sdpa.hpp"
+#include "transformations/defs.hpp"
+#include "transformations/op_conversions/convert_broadcast3.hpp"
+#include "transformations/transpose_sinking/ts_shape_of.hpp"
 using namespace ov::gen_pattern;
 
 namespace ov {
@@ -57,7 +62,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
     std::shared_ptr<Node> computed_bcst_k, computed_bcst_v, multiply_k, multiply_v;
     std::shared_ptr<Node> mq_reshape_k, mq_reshape_v;
     auto multi_query_bcst = [](const std::shared_ptr<Node>& kv) {
-        auto reshape_kv = wrap_type<opset6::Reshape>({kv, any_input()});
+        auto reshape_kv = makePattern<opset6::Reshape>({kv, any_input()});
         auto unsqueeze_kv = makePattern<opset1::Unsqueeze>({kv, any_input()});
 
         auto check_one = [](Output<Node> output) -> bool {
@@ -73,8 +78,8 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
             makePattern<opset1::Broadcast>({wrap_type<opset1::Constant>(check_one), any_input(), any_input()},
                                            {{"mode", "numpy"}});
 
-        auto multiply_kv = wrap_type<opset6::Multiply>({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst});
-        auto result = wrap_type<opset6::Reshape>({multiply_kv, any_input()});
+        auto multiply_kv = makePattern<opset6::Multiply>({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst});
+        auto result = makePattern<opset6::Reshape>({multiply_kv, any_input()});
         return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv);
     };
 
@@ -178,15 +183,19 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
 
         opset6::Assign *assign_k_node = nullptr, *assign_v_node = nullptr;
         opset1::Convert *assign_cvt_k_node = nullptr, *assign_cvt_v_node = nullptr;
-        if (!find_assign(concat_k_node, assign_k_node, assign_cvt_k_node))
+        if (!find_assign(concat_k_node, assign_k_node, assign_cvt_k_node)) {
             return false;
-        if (past_k_node->get_variable_id() != assign_k_node->get_variable_id())
+        }
+        if (past_k_node->get_variable_id() != assign_k_node->get_variable_id()) {
             return false;
+        }
 
-        if (!find_assign(concat_v_node, assign_v_node, assign_cvt_v_node))
+        if (!find_assign(concat_v_node, assign_v_node, assign_cvt_v_node)) {
             return false;
-        if (past_v_node->get_variable_id() != assign_v_node->get_variable_id())
+        }
+        if (past_v_node->get_variable_id() != assign_v_node->get_variable_id()) {
             return false;
+        }
 
         auto is_optional_one_child = [&pattern_map](const std::vector<std::shared_ptr<Node>>& nodes) {
             for (auto&& node : nodes) {
@@ -284,5 +293,21 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
     this->register_matcher(m, callback);
 }
 
+bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
+    RUN_ON_FUNCTION_SCOPE(SDPASubgraphFusion);
+    using namespace ov::pass::pattern;
+    ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion");
+    manager.set_per_pass_validation(false);
+
+    CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true);
+    CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertBroadcast3);
+    CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
+    CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
+    CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate);
+
+    manager.run_passes(f);
+    return false;
+}
+
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp
index 96028402aa9f92..59494736bb2c2e 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp
@@ -14,5 +14,12 @@ class StatefulSDPAFusion : public ov::pass::MatcherPass {
     StatefulSDPAFusion();
 };
 
+class SDPASubgraphFusion : public ov::pass::ModelPass {
+public:
+    OPENVINO_RTTI("SDPASubgraphFusion", "0");
+
+    bool run_on_model(const std::shared_ptr<ov::Model>& f) override;
+};
+
 }  // namespace intel_cpu
 }  // namespace ov
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 164568c3b51188..21413addc3187a 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -434,8 +434,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     auto p = std::getenv("USE_OLD");
     bool use_old = p && p[0] == '1';
     if (!use_old) {
-        CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
-        CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
+        CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion);
     }
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
     CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true);
@@ -953,10 +952,10 @@ void Transformations::PostLpt() {
     }
 #endif  // OPENVINO_ARCH_X86_64
 
-    CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward);
     auto p = std::getenv("USE_OLD");
     bool use_old = p && p[0] == '1';
     if (use_old) {
+        CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward);
         CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion);
         CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape);
     }

From e95bedc483cff3c5386f5f49d9e7c18f5eb48e03 Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Tue, 24 Dec 2024 09:31:21 +0100
Subject: [PATCH 05/13] fix mixtral failure

---
 .../common/pass/stateful_sdpa_fusion.cpp       | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 08b5ec14f32e1c..45ea04b8fc753a 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -61,6 +61,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
     std::shared_ptr<Node> reshape_k, reshape_v, unsqueeze_k, unsqueeze_v;
     std::shared_ptr<Node> computed_bcst_k, computed_bcst_v, multiply_k, multiply_v;
     std::shared_ptr<Node> mq_reshape_k, mq_reshape_v;
+    std::shared_ptr<Node> computed_bcst3_k, computed_bcst3_v;
     auto multi_query_bcst = [](const std::shared_ptr<Node>& kv) {
         auto reshape_kv = makePattern<opset6::Reshape>({kv, any_input()});
         auto unsqueeze_kv = makePattern<opset1::Unsqueeze>({kv, any_input()});
@@ -79,12 +80,16 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
                                            {{"mode", "numpy"}});
 
         auto multiply_kv = makePattern<opset6::Multiply>({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst});
-        auto result = makePattern<opset6::Reshape>({multiply_kv, any_input()});
-        return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv);
+        auto computed_bcst3 =
+            makePattern<opset3::Broadcast>({unsqueeze_kv, any_input()},
+                                           {{"mode", "bidirectional"}});
+
+        auto result = makePattern<opset6::Reshape>({multiply_kv | computed_bcst3, any_input()});
+        return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv, computed_bcst3);
     };
 
-    std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k) = multi_query_bcst(concat_k);
-    std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v) = multi_query_bcst(concat_v);
+    std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) = multi_query_bcst(concat_k);
+    std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) = multi_query_bcst(concat_v);
     auto present_k = concat_k | mq_reshape_k;
     auto present_v = concat_v | mq_reshape_v;
 
@@ -221,7 +226,9 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
                                     computed_bcst_v,
                                     multiply_v,
                                     mq_reshape_k,
-                                    mq_reshape_v})) {
+                                    mq_reshape_v,
+                                    computed_bcst3_k,
+                                    computed_bcst3_v})) {
             return false;
         }
 
@@ -300,7 +307,6 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     manager.set_per_pass_validation(false);
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true);
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertBroadcast3);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate);

From bbc20a2616a4e4ec393accdb524a5f76a7966e3e Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Tue, 24 Dec 2024 09:36:13 +0100
Subject: [PATCH 06/13] code clean

---
 src/plugins/intel_cpu/src/graph.cpp           |   14 +-
 .../intel_cpu/src/nodes/linux_perf.hpp        | 1242 -----------------
 .../common/pass/stateful_sdpa_fusion.cpp      |   10 +-
 .../transformation_pipeline.cpp               |   39 +-
 4 files changed, 10 insertions(+), 1295 deletions(-)
 delete mode 100644 src/plugins/intel_cpu/src/nodes/linux_perf.hpp

diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp
index 92b541f9b2543a..aab78a4d5f15bd 100644
--- a/src/plugins/intel_cpu/src/graph.cpp
+++ b/src/plugins/intel_cpu/src/graph.cpp
@@ -45,7 +45,6 @@
 #include "utils/node_dumper.h"
 #include "utils/precision_support.h"
 #include "utils/verbose.h"
-#include "nodes/linux_perf.hpp"
 
 #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO)
 #    include <tbb/task.h>
@@ -109,7 +108,6 @@ void Graph::Replicate(const std::shared_ptr<const ov::Model>& model,
     OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model");
 
     this->_name = model->get_friendly_name();
-    LinuxPerf::Init();
 
     // Map data object onto producer node
     std::map<std::shared_ptr<ov::Node>, NodePtr> op2node;
@@ -1164,7 +1162,6 @@ VecMemoryDescs Graph::getOutputMemoryDescriptors() const {
 
 void Graph::InferStatic(SyncInferRequest* request, int numaId) {
     for (const auto& node : m_executableGraphNodes) {
-        auto perf1 = LinuxPerf::Profile(node->getTypeStr());
         ExecuteNodeWithCatch(node, request, numaId);
     }
 }
@@ -1440,15 +1437,11 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r
 template <typename UpdateStrategy>
 void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) {
     size_t inferCounter = 0;
-    auto perf = LinuxPerf::Profile(std::string("Graph::InferDynamic_#") + std::to_string(infer_count));
     for (auto stopIndx : m_executableSyncNodesInds) {
-        {
-            auto perf1 = LinuxPerf::Profile("update");
-            update(stopIndx);
-        }
+        update(stopIndx);
+
         for (; inferCounter < stopIndx; ++inferCounter) {
             auto& node = m_executableGraphNodes[inferCounter];
-            auto perf1 = LinuxPerf::Profile(node->getTypeStr()); // + "_" + node->getName());
 
             ExecuteNodeWithCatch(node, request, numaId);
         }
@@ -1494,7 +1487,8 @@ void Graph::Infer(SyncInferRequest* request) {
                         static_cast<int>(status));
     }
 
-    infer_count++;
+    if (infer_count != -1)
+        infer_count++;
 }
 
 void Graph::SortTopologically() {
diff --git a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
deleted file mode 100644
index f3c3e4304ec3da..00000000000000
--- a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp
+++ /dev/null
@@ -1,1242 +0,0 @@
-
-#include <linux/perf_event.h>
-#include <time.h>
-//#include <linux/time.h>
-#include <unistd.h>
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-
-#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
-#include <sys/syscall.h>
-#define gettid() syscall(SYS_gettid)
-#endif
-
-inline int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
-	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
-}
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstdint>
-#include <cstring>
-#include <vector>
-#include <atomic>
-#include <x86intrin.h>
-#include <sys/mman.h>
-#include <thread>
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <deque>
-#include <mutex>
-#include <set>
-#include <iomanip>
-#include <functional>
-#include <limits>
-
-namespace LinuxPerf {
-
-#define _LINE_STRINGIZE(x) _LINE_STRINGIZE2(x)
-#define _LINE_STRINGIZE2(x) #x
-#define LINE_STRING _LINE_STRINGIZE(__LINE__)
-
-#define LINUX_PERF_ "\e[33m[LINUX_PERF:" LINE_STRING "]\e[0m "
-
-inline uint64_t get_time_ns() {
-    struct timespec tp0;
-    if (clock_gettime(CLOCK_MONOTONIC_RAW, &tp0) != 0) {
-        perror(LINUX_PERF_"clock_gettime(CLOCK_MONOTONIC_RAW,...) failed!");
-        abort();
-    }
-    return (tp0.tv_sec * 1000000000) + tp0.tv_nsec;    
-}
-
-struct TscCounter {
-    uint64_t tsc_ticks_per_second;
-    uint64_t tsc_ticks_base;
-    double tsc_to_usec(uint64_t tsc_ticks) const {
-        if (tsc_ticks < tsc_ticks_base)
-            return 0;
-        return (tsc_ticks - tsc_ticks_base) * 1000000.0 / tsc_ticks_per_second;
-    }
-    double tsc_to_usec(uint64_t tsc_ticks0, uint64_t tsc_ticks1) const {
-        if (tsc_ticks1 < tsc_ticks0)
-            return 0;
-        return (tsc_ticks1 - tsc_ticks0) * 1000000.0 / tsc_ticks_per_second;
-    }
-    TscCounter() {
-        uint64_t start_ticks = __rdtsc();
-        std::this_thread::sleep_for(std::chrono::seconds(1));
-        tsc_ticks_per_second = (__rdtsc() - start_ticks);
-        std::cout << LINUX_PERF_"tsc_ticks_per_second = " << tsc_ticks_per_second << std::endl;
-        tsc_ticks_base = __rdtsc();
-
-        // use CLOCK_MONOTONIC_RAW instead of TSC
-        tsc_ticks_per_second = 1000000000; // ns
-        tsc_ticks_base = get_time_ns();
-    }
-};
-
-class IPerfEventDumper {
-public:
-    virtual void dump_json(std::ofstream& fw, TscCounter& tsc) = 0;
-};
-
-struct PerfEventJsonDumper {
-    std::mutex g_mutex;
-    std::set<IPerfEventDumper*> all_dumpers;
-    const char* dump_file_name = "perf_dump.json";
-    bool dump_file_over = false;
-    bool not_finalized = true;
-    std::ofstream fw;
-    std::atomic_int totalProfilerManagers{0};
-    TscCounter tsc;
-
-    ~PerfEventJsonDumper() {
-        if (not_finalized)
-            finalize();
-    }
-
-    void finalize() {
-        if (!not_finalized)
-            return;
-        std::lock_guard<std::mutex> guard(g_mutex);
-        if (dump_file_over || all_dumpers.empty())
-            return;
-
-        // start dump
-        fw.open(dump_file_name, std::ios::out);
-        fw << "{\n";
-        fw << "\"schemaVersion\": 1,\n";
-        fw << "\"traceEvents\": [\n";
-        fw.flush();
-
-        for (auto& pthis : all_dumpers) {
-            pthis->dump_json(fw, tsc);
-        }
-        all_dumpers.clear();
-
-        fw << R"({
-            "name": "Profiler End",
-            "ph": "i",
-            "s": "g",
-            "pid": "Traces",
-            "tid": "Trace OV Profiler",
-            "ts":)"
-           << tsc.tsc_to_usec(get_time_ns()) << "}",
-            fw << "]\n";
-        fw << "}\n";
-        auto total_size = fw.tellp();
-        fw.close();
-        dump_file_over = true;
-        not_finalized = false;
-
-        std::cout << LINUX_PERF_"Dumpped ";
-        
-        if (total_size < 1024) std::cout << total_size << " bytes ";
-        else if (total_size < 1024*1024) std::cout << total_size/1024 << " KB ";
-        else std::cout << total_size/(1024 * 1024) << " MB ";
-        std::cout << " to " << dump_file_name << std::endl;
-    }
-
-    int register_manager(IPerfEventDumper* pthis) {
-        std::lock_guard<std::mutex> guard(g_mutex);
-        std::stringstream ss;
-        auto serial_id = totalProfilerManagers.fetch_add(1);
-        ss << LINUX_PERF_"#" << serial_id << "(" << pthis << ") : is registed." << std::endl;
-        std::cout << ss.str();
-        all_dumpers.emplace(pthis);
-        return serial_id;
-    }
-
-    static PerfEventJsonDumper& get() {
-        static PerfEventJsonDumper inst;
-        return inst;
-    }
-};
-
-inline std::vector<std::string> str_split(const std::string& s, std::string delimiter) {
-    std::vector<std::string> ret;
-    size_t last = 0;
-    size_t next = 0;
-    while ((next = s.find(delimiter, last)) != std::string::npos) {
-        //std::cout << last << "," << next << "=" << s.substr(last, next-last) << "\n";
-        ret.push_back(s.substr(last, next-last));
-        last = next + 1;
-    }
-    ret.push_back(s.substr(last));
-    return ret;
-}
-
-template<typename T>
-T& read_ring_buffer(perf_event_mmap_page& meta, uint64_t& offset) {
-    auto offset0 = offset;
-    offset += sizeof(T);
-    return *reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(&meta) + meta.data_offset + (offset0)%meta.data_size);
-}
-
-struct PerfRawConfig {
-    PerfRawConfig() {
-        // env var defined raw events
-        const char* str_raw_config = std::getenv("LINUX_PERF");
-        if (str_raw_config) {
-            CPU_ZERO(&cpu_mask);
-            // options are separated by ":" as PATH
-            auto options = str_split(str_raw_config, ":");
-            for(auto& opt : options) {
-                auto items = str_split(opt, "=");
-                if (items.size() == 2) {
-                    if (items[0] == "dump") {
-                        // limit the number of dumps per thread
-                        dump = strtoll(&items[1][0], nullptr, 0);
-                    } else if (items[0] == "cpus") {
-                        // thread's affinity (cpu-binding) can be changed by threading-libs(TBB/OpenMP) anytime
-                        // sched_getaffinity() can only get correct binding at start-up time, another way is to specify it 
-                        // also too many events may generate if per-thread event is used, cpus can limit
-                        // cpus=56
-                        // cpus=56.57.59
-                        auto cpus = str_split(items[1], ",");
-                        CPU_ZERO(&cpu_mask);
-                        for(auto& cpu : cpus) {
-                            CPU_SET(std::atoi(cpu.c_str()), &cpu_mask);
-                        }
-                    } else {
-                        auto config = strtoul(&items[1][0], nullptr, 0);
-                        if (config > 0)
-                            raw_configs.emplace_back(items[0], config);
-                    }
-                }
-                if (items.size() == 1) {
-                    if (items[0] == "switch-cpu") {
-                        // get cpu_mask as early as possible
-                        switch_cpu = true;
-                        CPU_ZERO(&cpu_mask);
-                        if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &cpu_mask)) {
-                            perror(LINUX_PERF_"sched_getaffinity failed:");
-                            abort();
-                        }
-                    }
-                    if (items[0] == "dump")
-                        dump = std::numeric_limits<int64_t>::max(); // no limit to number of dumps
-                }
-            }
-
-            for(auto& cfg : raw_configs) {
-                printf(LINUX_PERF_" config: %s=0x%lx\n", cfg.first.c_str(), cfg.second);
-            }
-            if (switch_cpu) {
-                printf(LINUX_PERF_" config: switch_cpu\n");
-            }
-            if (dump)
-                printf(LINUX_PERF_" config: dump=%ld\n", dump);
-            if (CPU_COUNT(&cpu_mask)) {
-                printf(LINUX_PERF_" config: cpus=");
-                for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++)
-                    if(CPU_ISSET(cpu, &cpu_mask)) printf("%d,", cpu);
-                printf("\n");
-            }
-        } else {
-            printf(LINUX_PERF_" LINUX_PERF is unset, example: LINUX_PERF=dump,switch-cpu,L2_MISS=0x10d1\n");
-        }
-    }
-
-    bool dump_on_cpu(int cpu) {
-        if (dump == 0)
-            return false;
-        if (CPU_COUNT(&cpu_mask))
-            return CPU_ISSET(cpu, &cpu_mask);
-        return true;
-    }
-
-    int64_t dump = 0;
-    cpu_set_t cpu_mask;
-    bool switch_cpu = false;
-    std::vector<int> dump_cpus;
-    std::vector<std::pair<std::string, uint64_t>> raw_configs;
-
-    static PerfRawConfig& get() {
-        static PerfRawConfig inst;
-        return inst;
-    }
-};
-
-
-// context switch events
-// this will visualize 
-struct PerfEventCtxSwitch : public IPerfEventDumper {
-    bool is_enabled;
-
-    struct event {
-        int fd;
-        perf_event_mmap_page * meta;
-        int cpu;
-        uint64_t ctx_switch_in_time;
-        uint64_t ctx_switch_in_tid;
-        uint64_t ctx_last_time;
-
-        event(int fd, perf_event_mmap_page * meta): fd(fd), meta(meta) {}
-    };
-    std::vector<event> events;
-
-    PerfEventCtxSwitch() {
-        is_enabled = PerfRawConfig::get().switch_cpu;
-        if (is_enabled) {
-            // make sure TSC in PerfEventJsonDumper is the very first thing to initialize
-            PerfEventJsonDumper::get().register_manager(this);
-
-            // open fd for each CPU
-            cpu_set_t mask = PerfRawConfig::get().cpu_mask;
-
-            long number_of_processors = sysconf(_SC_NPROCESSORS_ONLN);
-            printf(LINUX_PERF_"sizeof(cpu_set_t):%lu: _SC_NPROCESSORS_ONLN=%ld CPU_COUNT=%d\n", sizeof(cpu_set_t), number_of_processors, CPU_COUNT(&mask));
-            if (CPU_COUNT(&mask) >= number_of_processors) {
-                printf(LINUX_PERF_" no affinity is set, will not enable PerfEventCtxSwitch\n");
-                is_enabled = false;
-                return;
-            }
-
-            for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) {
-                auto is_set = CPU_ISSET(cpu, &mask);
-                if (!is_set) continue;
-
-                perf_event_attr pea;
-                memset(&pea, 0, sizeof(struct perf_event_attr));
-                pea.type = PERF_TYPE_HARDWARE;
-                pea.size = sizeof(struct perf_event_attr);
-                pea.config = PERF_COUNT_HW_REF_CPU_CYCLES;  // not the point, can be any
-                pea.disabled = 0;
-                pea.exclude_kernel = 1;
-                pea.exclude_hv = 1;
-                pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
-                // pinned: It applies only to hardware counters and only to group leaders
-                pea.pinned = 1;
-                pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-                // for group master, generate PERF_RECORD_SWITCH into ring-buffer
-                // is helpful to visualize context switch
-                pea.context_switch = 1;
-                // then TID, TIME, ID, STREAM_ID, and CPU can additionally be included in non-PERF_RECORD_SAMPLEs
-                // if the  corresponding sample_type is selected
-                pea.sample_id_all = 1;
-                pea.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_TID | PERF_SAMPLE_CPU;
-                auto mmap_length = sysconf(_SC_PAGESIZE) * (1024 + 1);
-                pea.use_clockid = 1;
-                pea.clockid = CLOCK_MONOTONIC_RAW;
-
-                // calling thread on any processor
-                pid_t pid = -1;
-                // measures all processes/threads on the specified CPU
-                int ctx_switch_fd = perf_event_open(&pea, pid, cpu, -1, 0);
-                if (ctx_switch_fd < 0) {
-                    perror(LINUX_PERF_"PerfEventCtxSwitch perf_event_open failed (check /proc/sys/kernel/perf_event_paranoid please)");
-                    abort();
-                }
-
-                auto* ctx_switch_pmeta = reinterpret_cast<perf_event_mmap_page*>(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ctx_switch_fd, 0));
-                if (ctx_switch_pmeta == MAP_FAILED) {
-                    perror(LINUX_PERF_"mmap perf_event_mmap_page failed:");
-                    close(ctx_switch_fd);
-                    abort();
-                }
-                printf(LINUX_PERF_"perf_event_open CPU_WIDE context_switch on cpu %d, ctx_switch_fd=%d\n", cpu, ctx_switch_fd);
-                events.emplace_back(ctx_switch_fd, ctx_switch_pmeta);
-                events.back().ctx_switch_in_time = get_time_ns();
-                events.back().ctx_last_time = get_time_ns();
-                events.back().cpu = cpu;
-            }
-            my_pid = getpid();
-            my_tid = gettid();
-        }
-    }
-
-    ~PerfEventCtxSwitch() {
-        if (is_enabled) {
-            PerfEventJsonDumper::get().finalize();
-        }
-        for(auto& ev : events) {
-            close(ev.fd);
-        }
-    }
-
-    struct ProfileData {
-        uint64_t tsc_start;
-        uint64_t tsc_end;
-        uint32_t tid;
-        uint32_t cpu;
-        bool preempt;   // preempt means current TID preempts previous thread
-    };
-
-    std::deque<ProfileData> all_dump_data;
-
-    void dump_json(std::ofstream& fw, TscCounter& tsc) override {
-        static std::atomic_uint64_t async_evid{0};
-        if (!is_enabled) return;
-
-        updateRingBuffer();
-
-        auto data_size = all_dump_data.size();
-        if (!data_size) return;
-
-        for (auto& ev : events) {
-            if (ev.ctx_switch_in_time == 0) continue;
-            all_dump_data.emplace_back();
-            auto* pd = &all_dump_data.back();
-            pd->tid = ev.ctx_switch_in_tid;
-            pd->cpu = ev.cpu;
-            pd->tsc_start = ev.ctx_switch_in_time;
-            pd->tsc_end = get_time_ns();
-            ev.ctx_switch_in_time = 0;
-        }
-
-        auto pid = 9999;    // fake pid for CPU
-        auto cat = "TID";
-        
-        // TID is used for CPU id instead
-        for (auto& d : all_dump_data) {
-            auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end);
-            auto start = tsc.tsc_to_usec(d.tsc_start);
-            //auto end = tsc.tsc_to_usec(d.tsc_end);
-            auto cpu_id = d.cpu;
-
-            fw << "{\"ph\": \"X\", \"name\": \"" << d.tid << "\", \"cat\":\"" << cat << "\","
-                << "\"pid\": " << pid << ", \"tid\": \"CPU" << cpu_id <<  "\","
-                << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << "},\n";
-        }
-    }
-
-    bool ring_buffer_verbose = false;
-    uint32_t my_pid = 0;
-    uint32_t my_tid = 0;
-    std::atomic<int> atom_gard{0};
-
-    void updateRingBuffer() {
-        // only one thread can enter
-        const int lock_value = atom_gard.exchange(1);
-        if (lock_value == 1) {
-            // has been locked, return;
-            return;
-        }
-
-        // only update when any ring-buffer is half loaded
-        bool need_update = false;
-        for(auto& ev : events) {
-            auto& mmap_meta = *ev.meta;
-            auto used_size = (mmap_meta.data_tail - mmap_meta.data_head) % mmap_meta.data_size;
-            if (used_size > (mmap_meta.data_size >> 1)) {
-                need_update = true;
-                break;
-            }
-        }
-
-        if (!need_update) {
-            // unlock
-            atom_gard.exchange(0);
-            return;
-        }
-
-        for(auto& ev : events) {
-            auto& mmap_meta = *ev.meta;
-            uint64_t head0 = mmap_meta.data_tail;
-            uint64_t head1 = mmap_meta.data_head;
-            //printf("ring-buffer@end: %lu~%lu %llu %llu %llu\n", head0, head1, group_meta.data_tail, group_meta.data_offset, group_meta.data_size);
-
-            if (head0 != head1) {
-                if (ring_buffer_verbose) {
-                    printf("PERF_RECORD_SWITCH = %d\n", PERF_RECORD_SWITCH);
-                    printf("PERF_RECORD_SWITCH_CPU_WIDE = %d\n", PERF_RECORD_SWITCH_CPU_WIDE);
-                    printf("PERF_RECORD_MISC_SWITCH_OUT = %d\n", PERF_RECORD_MISC_SWITCH_OUT);
-                    printf("PERF_RECORD_MISC_SWITCH_OUT_PREEMPT  = %d\n", PERF_RECORD_MISC_SWITCH_OUT_PREEMPT);
-                }
-
-                while(head0 < head1) {
-                    auto h0 = head0;
-                    auto type = read_ring_buffer<__u32>(mmap_meta, head0);
-                    auto misc = read_ring_buffer<__u16>(mmap_meta, head0);
-                    auto size = read_ring_buffer<__u16>(mmap_meta, head0);
-                    uint32_t next_prev_pid = 0, next_prev_tid = 0;
-                    if (type == PERF_RECORD_SWITCH_CPU_WIDE) {
-                        // previous PID/TID if switching-in
-                        // next PID/TID if switching-out
-                        next_prev_pid = read_ring_buffer<__u32>(mmap_meta, head0);
-                        next_prev_tid = read_ring_buffer<__u32>(mmap_meta, head0);
-                    }
-                    auto pid = read_ring_buffer<__u32>(mmap_meta, head0);
-                    auto tid = read_ring_buffer<__u32>(mmap_meta, head0);
-                    auto time = read_ring_buffer<uint64_t>(mmap_meta, head0);
-                    auto cpu = read_ring_buffer<__u32>(mmap_meta, head0);
-                    auto reserved0 = read_ring_buffer<__u32>(mmap_meta, head0);
-                    (void)reserved0;
-                    (void)next_prev_pid;
-                    (void)pid;
-
-                    // skip idle process (with TID 0)
-                    if (tid > 0 && ring_buffer_verbose) {
-                        printf("event: %lu/%lu\ttype,misc,size=(%u,%u,%u) cpu%u,next_prev_tid=%u,tid=%u  time:(%lu), (+%lu)\n",
-                            h0, head1,
-                            type, misc, size,
-                            cpu, next_prev_tid, tid,
-                            time,
-                            time - ev.ctx_last_time);
-                    }
-
-                    if (type == PERF_RECORD_SWITCH_CPU_WIDE && tid > 0) {
-                        if (misc & PERF_RECORD_MISC_SWITCH_OUT || misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) {
-                            // switch out
-                            // generate a log
-                            all_dump_data.emplace_back();
-                            auto* pd = &all_dump_data.back();
-                            pd->tid = tid;
-                            pd->cpu = cpu;
-                            pd->preempt = (misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT);
-                            //printf("ctx_switch_in_time=%lu\n", ctx_switch_in_time);
-                            pd->tsc_start = ev.ctx_switch_in_time;
-                            pd->tsc_end = time;
-
-                            if (ring_buffer_verbose) printf("\t  cpu: %u tid: %u  %lu (+%lu)\n", cpu, tid, ev.ctx_switch_in_time, time-ev.ctx_switch_in_time);
-
-                            ev.ctx_switch_in_time = 0;
-                        } else {
-                            // switch in
-                            ev.ctx_switch_in_time = time;
-                            ev.ctx_switch_in_tid = tid;
-                        }
-                    }
-
-                    ev.ctx_last_time = time;
-                    head0 += size - (head0 - h0);
-                }
-
-                if (head0 != head1) {
-                    printf("head0(%lu) != head1(%lu)\n", head0, head1);
-                    abort();
-                }
-
-                // update tail so kernel can keep generate event records
-                mmap_meta.data_tail = head0;
-                std::atomic_thread_fence(std::memory_order_seq_cst);
-            }
-        }
-        atom_gard.exchange(0);
-    }
-
-    static PerfEventCtxSwitch& get() {
-        static PerfEventCtxSwitch inst;
-        return inst;
-    }
-};
-
-/*
-RAW HARDWARE EVENT DESCRIPTOR
-       Even when an event is not available in a symbolic form within perf right now, it can be encoded in a per processor specific way.
-
-       For instance For x86 CPUs NNN represents the raw register encoding with the layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3B: System Programming Guide] Figure 30-1
-       Layout of IA32_PERFEVTSELx MSRs) or AMD’s PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344, Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
-
-       Note: Only the following bit fields can be set in x86 counter registers: event, umask, edge, inv, cmask. Esp. guest/host only and OS/user mode flags must be setup using EVENT MODIFIERS.
-
- event 7:0
- umask 15:8
- edge  18
- inv   23
- cmask 31:24
-*/
-#define X86_RAW_EVENT(EventSel, UMask, CMask) ((CMask << 24) | (UMask << 8) | (EventSel))
-
-struct PerfEventGroup : public IPerfEventDumper {
-    int group_fd = -1;
-    uint64_t read_format;
-
-    struct event {
-        int fd = -1;
-        uint64_t id = 0;
-        uint64_t pmc_index = 0;
-        perf_event_mmap_page* pmeta = nullptr;
-        std::string name = "?";
-        char format[32];
-    };
-    std::vector<event> events;
-
-    uint64_t read_buf[512]; // 4KB
-    uint64_t time_enabled;
-    uint64_t time_running;
-    uint64_t pmc_width;
-    uint64_t pmc_mask;
-    uint64_t values[32];
-    uint32_t tsc_time_shift;
-    uint32_t tsc_time_mult;
-
-    // ref_cpu_cycles even id
-    // this event is fixed function counter provided by most x86 CPU
-    // and it provides TSC clock which is:
-    //    - very high-resolution (<1ns or >1GHz)
-    //    - independent of CPU-frequency throttling
-    int ref_cpu_cycles_evid = -1;
-    int sw_task_clock_evid = -1;
-    int hw_cpu_cycles_evid = -1;
-    int hw_instructions_evid = -1;
-
-    struct ProfileData {
-        uint64_t tsc_start;
-        uint64_t tsc_end;
-        std::string title;
-        const char * cat;
-        int32_t id;
-        static const int data_size = 16; // 4(fixed) + 8(PMU) + 4(software)
-        uint64_t data[data_size] = {0};
-        // f/i/u/p
-        char extra_data_type[data_size] = {0};
-        union {
-            double f;
-            int64_t i;
-            void * p;
-        } extra_data[data_size];
-
-        template<typename T>
-        char get_extra_type(T t) {
-            if (std::is_pointer<T>::value) return 'p';
-            if (std::is_floating_point<T>::value) return 'f';
-            if (std::is_integral<T>::value) return 'i';
-            return '\0';
-        }
-        template<typename T>
-        void set_extra_data(int i, T* t) { extra_data[i].p = t; }
-        void set_extra_data(int i, float t) { extra_data[i].f = t; }
-        void set_extra_data(int i, double t) { extra_data[i].f = t; }
-        template<typename T>
-        void set_extra_data(int i, T t) {
-            static_assert(std::is_integral<T>::value);
-            extra_data[i].i = t;
-        }
-
-        template <typename ... Values>
-        void set_extra_data(Values... vals) {
-            static_assert(data_size >= sizeof...(vals));
-            int j = 0;
-            int unused1[] = { 0, (set_extra_data(j++, vals), 0)... };
-            (void)unused1;
-            j = 0;
-            int unused2[] = { 0, (extra_data_type[j++] = get_extra_type(vals), 0)... };
-            (void)unused2;
-            extra_data_type[j] = '\0';
-        }
-
-        ProfileData(const std::string& title) : title(title) {
-            start();
-        }
-        void start() {
-            tsc_start = get_time_ns();
-        }
-        void stop() {
-            tsc_end = get_time_ns();
-        }
-    };
-
-    bool enable_dump_json = false;
-    int64_t dump_limit = 0;
-    std::deque<ProfileData> all_dump_data;
-    int serial;
-
-    using CallBackEventArgsSerializer = std::function<void(std::ostream& fw, double usec, uint64_t* counters)>;
-    CallBackEventArgsSerializer fn_evt_args_serializer;
-
-    void dump_json(std::ofstream& fw, TscCounter& tsc) override {
-        static std::atomic_uint64_t async_evid{0};
-        if (!enable_dump_json)
-            return;
-        auto data_size = all_dump_data.size();
-        if (!data_size)
-            return;
-
-        for (auto& d : all_dump_data) {
-            auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end);
-            auto title = std::string(d.title) + "_" + std::to_string(d.id);
-            auto cat = d.cat;
-            //auto pid = serial;
-            auto start = tsc.tsc_to_usec(d.tsc_start);
-            //auto end = tsc.tsc_to_usec(d.tsc_end);
-
-            if (d.id < 0) {
-                // async events
-                // {"cat": "foo", "name": "async_read2", "pid": 4092243, "id": 4092246, "ph": "b", "ts": 23819.718},
-                fw << "{\"ph\": \"b\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\","
-                    << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << ","
-                    << "\"ts\": " << std::setprecision (15) << start << "},";
-
-                fw << "{\"ph\": \"e\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\","
-                    << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << ","
-                    << "\"ts\": " << std::setprecision (15) << tsc.tsc_to_usec(d.tsc_end) << ",";
-            } else {
-                fw << "{\"ph\": \"X\", \"name\": \"" << title << "\", \"cat\":\"" << cat << "\","
-                    << "\"pid\": " << my_pid << ", \"tid\": " << my_tid << ","
-                    << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << ",";
-            }
-
-            fw << "\"args\":{";
-            {
-                std::stringstream ss;
-                if (fn_evt_args_serializer)
-                    fn_evt_args_serializer(ss, duration, d.data);
-                if (sw_task_clock_evid >= 0) {
-                    // PERF_COUNT_SW_TASK_CLOCK in nano-seconds
-                    ss << "\"CPU Usage\":" << (d.data[sw_task_clock_evid] * 1e-3)/duration << ",";
-                }
-                if (hw_cpu_cycles_evid >= 0) {
-                    if (sw_task_clock_evid >= 0 && d.data[sw_task_clock_evid] > 0) {
-                        ss << "\"CPU Freq(GHz)\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])/d.data[sw_task_clock_evid] << ",";
-                    } else {
-                        ss << "\"CPU Freq(GHz)\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])*1e-3/duration << ",";
-                    }
-                    if (hw_instructions_evid >= 0 && d.data[hw_instructions_evid] > 0) {
-                        ss << "\"CPI\":" << static_cast<double>(d.data[hw_cpu_cycles_evid])/d.data[hw_instructions_evid] << ",";
-                    }
-                }
-                auto prev_locale = ss.imbue(std::locale(""));
-                const char * sep = "";
-                for(size_t i = 0; i < events.size() && i < d.data_size; i++) {
-                    ss << sep << "\"" << events[i].name << "\":\"" << d.data[i] << "\"";
-                    sep = ",";
-                }
-                ss.imbue(prev_locale);
-                if (d.extra_data_type[0] != 0) {
-                    sep = "";
-                    ss << ",\"Extra Data\":[";
-                    for(size_t i = 0; i < d.data_size && (d.extra_data_type[i] != 0); i++) {
-                        if (d.extra_data_type[i] == 'f') ss << sep << d.extra_data[i].f;
-                        else if (d.extra_data_type[i] == 'i') ss << sep << d.extra_data[i].i;
-                        else if (d.extra_data_type[i] == 'p') ss << sep << "\"" << d.extra_data[i].p << "\"";
-                        else ss << sep << "\"?\"";
-                        sep = ",";
-                    }
-                    ss << "]";
-                }
-                fw << ss.str();
-            }
-            fw << "}},\n";
-        }
-        all_dump_data.clear();
-        std::cout << LINUX_PERF_"#" << serial << "(" << this << ") finalize: dumpped " << data_size << std::endl;
-    }
-
-    uint64_t operator[](size_t i) {
-        if (i < events.size()) {
-            return values[i];
-        } else {
-            printf(LINUX_PERF_"PerfEventGroup: operator[] with index %lu oveflow (>%lu)\n", i, events.size());
-            abort();
-        }
-        return 0;
-    }
-    
-    PerfEventGroup() = default;
-
-    struct Config {
-        uint32_t type;
-        uint64_t config;
-        const char * name;
-        Config(uint32_t type, uint64_t config, const char * name = "?") : type(type), config(config), name(name) {}
-    };
-
-    uint32_t my_pid = 0;
-    uint32_t my_tid = 0;
-
-    PerfEventGroup(const std::vector<Config> type_configs, CallBackEventArgsSerializer fn = {}) : fn_evt_args_serializer(fn) {
-        for(auto& tc : type_configs) {
-            if (tc.type == PERF_TYPE_SOFTWARE) {
-                add_sw(tc.config);
-            }
-            if (tc.type == PERF_TYPE_HARDWARE) {
-                add_hw(tc.config);
-            }
-            if (tc.type == PERF_TYPE_RAW) {
-                add_raw(tc.config);
-            }
-            events.back().name = tc.name;
-            snprintf(events.back().format, sizeof(events.back().format), "%%%lulu, ", strlen(tc.name));
-        }
-
-        // env var defined raw events
-        for (auto raw_cfg : PerfRawConfig::get().raw_configs) {
-            add_raw(raw_cfg.second);
-            events.back().name = raw_cfg.first;
-        }
-
-        dump_limit = PerfRawConfig::get().dump;
-        enable_dump_json = PerfRawConfig::get().dump_on_cpu(sched_getcpu());
-        serial = 0;
-        if (enable_dump_json) {
-            serial = PerfEventJsonDumper::get().register_manager(this);
-        }
-        my_pid = getpid();
-        my_tid = gettid();
-
-        enable();
-    }
-
-    ~PerfEventGroup() {
-        if (enable_dump_json)
-            PerfEventJsonDumper::get().finalize();
-        disable();
-        for(auto & ev : events) {
-            close(ev.fd);
-        }
-    }
-
-    void show_header() {
-        std::stringstream ss;
-        ss << "\e[33m";
-        ss << "#" << serial << ":";
-        for(auto& ev : events) {
-            ss << ev.name << ", ";
-        }
-        ss << "\e[0m\n";
-        std::cout << ss.str();
-    }
-
-    void add_raw(uint64_t config, bool pinned=false) {
-        perf_event_attr pea;
-        memset(&pea, 0, sizeof(struct perf_event_attr));
-        pea.type = PERF_TYPE_RAW;
-        pea.size = sizeof(struct perf_event_attr);
-        pea.config = config;
-        pea.disabled = 1;
-        pea.exclude_kernel = 1;
-        pea.exclude_hv = 1;
-        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
-        if (pinned && group_fd == -1) {
-            // pinned: It applies only to hardware counters and only to group leaders
-            pea.pinned = 1;
-        }
-        if (group_fd == -1) {
-            pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
-        }
-        add(&pea);
-    }
-
-    void add_hw(uint64_t config, bool pinned=false) {
-        perf_event_attr pea;
-        memset(&pea, 0, sizeof(struct perf_event_attr));
-        pea.type = PERF_TYPE_HARDWARE;
-        pea.size = sizeof(struct perf_event_attr);
-        pea.config = config;
-        pea.disabled = 1;
-        pea.exclude_kernel = 1;
-        pea.exclude_hv = 1;
-        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;        
-        if (pinned && group_fd == -1) {
-            // pinned: It applies only to hardware counters and only to group leaders
-            pea.pinned = 1;
-        }
-        if (group_fd == -1) {
-            pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
-        }
-        add(&pea);
-    }
-
-    void add_sw(uint64_t config) {
-        perf_event_attr pea;
-        memset(&pea, 0, sizeof(struct perf_event_attr));
-        pea.type = PERF_TYPE_SOFTWARE;
-        pea.size = sizeof(struct perf_event_attr);
-        pea.config = config;
-        pea.disabled = 1;
-        pea.exclude_kernel = 0; // some SW events are counted as kernel
-        pea.exclude_hv = 1;
-        //pea.pinned = 1;   //sw event cannot set pinned!!!
-        pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID ;
-        add(&pea);
-    }
-
-    void add(perf_event_attr* pev_attr, pid_t pid = 0, int cpu = -1) {
-        event ev;
-
-        size_t mmap_length = sysconf(_SC_PAGESIZE) * 1;
-        // clockid must consistent within group
-        pev_attr->use_clockid = 1;
-        // can be synched with clock_gettime(CLOCK_MONOTONIC_RAW)
-        pev_attr->clockid = CLOCK_MONOTONIC_RAW;
-
-        RETRY:
-        ev.fd = perf_event_open(pev_attr, pid, cpu, group_fd, 0);
-        if (ev.fd < 0) {
-            if (!pev_attr->exclude_kernel) {
-                printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) with exclude_kernel=0 failed (due to /proc/sys/kernel/perf_event_paranoid is 2),  set exclude_kernel=1 and retry...\n",
-                       pev_attr->type, pev_attr->config);
-                pev_attr->exclude_kernel = 1;
-                goto RETRY;
-            } else {
-                printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) failed", pev_attr->type, pev_attr->config);
-                perror("");
-                abort();
-            }
-        }
-        ioctl(ev.fd, PERF_EVENT_IOC_ID, &ev.id);
-
-        ev.pmeta = reinterpret_cast<perf_event_mmap_page*>(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ev.fd, 0));
-        if (ev.pmeta == MAP_FAILED) {
-            perror(LINUX_PERF_"mmap perf_event_mmap_page failed:");
-            close(ev.fd);
-            abort();
-        }
-
-        if (group_fd == -1) {
-            group_fd = ev.fd;
-            read_format = pev_attr->read_format;
-        }
-        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_REF_CPU_CYCLES) {
-            ref_cpu_cycles_evid = events.size();
-        }
-        if (pev_attr->type == PERF_TYPE_SOFTWARE && pev_attr->config == PERF_COUNT_SW_TASK_CLOCK) {
-            sw_task_clock_evid = events.size();
-        }
-        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_CPU_CYCLES) {
-            hw_cpu_cycles_evid = events.size();
-        }
-        if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_INSTRUCTIONS) {
-            hw_instructions_evid = events.size();
-        }
-        //printf("perf_event_open : fd=%d, id=%lu\n", ev.fd, ev.id);
-
-        events.push_back(ev);
-    }
-
-    bool event_group_enabled = false;
-    uint32_t num_events_no_pmc;
-
-    void enable() {
-        if (event_group_enabled)
-            return;
-        ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
-        ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
-        // PMC index is only valid when being enabled
-        num_events_no_pmc = 0;
-        for(auto& ev : events) {
-            if (ev.pmc_index == 0 && ev.pmeta->cap_user_rdpmc) {
-                uint32_t seqlock;
-                do {
-                    seqlock = ev.pmeta->lock;
-                    std::atomic_thread_fence(std::memory_order_seq_cst);
-                    ev.pmc_index = ev.pmeta->index;
-                    pmc_width = ev.pmeta->pmc_width;
-                    pmc_mask = 1;
-                    pmc_mask = (pmc_mask << pmc_width) - 1;
-                    if (ev.pmeta->cap_user_time) {
-                        tsc_time_shift = ev.pmeta->time_shift;
-                        tsc_time_mult = ev.pmeta->time_mult;
-                        //printf("time: %u,%u\n", tsc_time_shift, tsc_time_mult);
-                    }
-                    std::atomic_thread_fence(std::memory_order_seq_cst);
-                } while (ev.pmeta->lock != seqlock || (seqlock & 1));
-            }
-            // some events like PERF_TYPE_SOFTWARE cannot read using rdpmc()
-            if (ev.pmc_index == 0)
-                num_events_no_pmc ++;
-        }
-        event_group_enabled = true;
-    }
-
-    uint64_t tsc2nano(uint64_t cyc) {
-        uint64_t quot, rem;
-        quot  = cyc >> tsc_time_shift;
-        rem   = cyc & (((uint64_t)1 << tsc_time_shift) - 1);
-        return quot * tsc_time_mult + ((rem * tsc_time_mult) >> tsc_time_shift);
-    }
-
-    void disable() {
-        if (!event_group_enabled)
-            return;
-
-        ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
-
-        for(auto& ev : events) {
-            ev.pmc_index = 0;
-        }
-        event_group_enabled = false;
-    }
-
-    uint64_t rdpmc(int i, uint64_t base = 0) {
-        return (_rdpmc(events[i].pmc_index - 1) - base) & pmc_mask;
-    }
-
-    template<class FN>
-    std::vector<uint64_t> rdpmc(FN fn, std::string name = {}, int64_t loop_cnt = 0, std::function<void(uint64_t, uint64_t*, char*&)> addinfo = {}) {
-        int cnt = events.size();
-        std::vector<uint64_t> pmc(cnt, 0);
-
-        bool use_pmc = (num_events_no_pmc == 0);
-        if (use_pmc) {
-            for(int i = 0; i < cnt; i++) {
-                if (events[i].pmc_index)
-                    pmc[i] = _rdpmc(events[i].pmc_index - 1);
-                else
-                    pmc[i] = 0;
-            }
-        } else {
-            read();
-            for(int i = 0; i < cnt; i++) {
-                pmc[i] = values[i];
-            }
-        }
-
-        auto tsc0 = __rdtsc();
-        fn();
-        auto tsc1 = __rdtsc();
-
-        if (use_pmc) {
-            for(int i = 0; i < cnt; i++) {
-                if (events[i].pmc_index)
-                    pmc[i] = (_rdpmc(events[i].pmc_index - 1) - pmc[i]) & pmc_mask;
-                else
-                    pmc[i] = 0;
-            }
-        } else {
-            read();
-            for(int i = 0; i < cnt; i++) {
-                pmc[i] -= values[i];
-            }
-        }
-
-        if (!name.empty()) {
-            char log_buff[1024];
-            char * log = log_buff;
-            log += sprintf(log, "\e[33m");
-            for(int i = 0; i < cnt; i++) {
-                log += sprintf(log, events[i].format, pmc[i]);
-            }
-            auto duration_ns = tsc2nano(tsc1 - tsc0);
-            
-            log += sprintf(log, "\e[0m [%16s] %.3f us", name.c_str(), duration_ns/1e3);
-            if (hw_cpu_cycles_evid >= 0) {
-                log += sprintf(log, " CPU:%.2f(GHz)", 1.0 * pmc[hw_cpu_cycles_evid] / duration_ns);
-                if (hw_instructions_evid >= 0) {
-                    log += sprintf(log, " CPI:%.2f", 1.0 * pmc[hw_cpu_cycles_evid] / pmc[hw_instructions_evid]);
-                }
-                if (loop_cnt > 0) {
-                    // cycles per kernel (or per-iteration)
-                    log += sprintf(log, " CPK:%.1fx%d", 1.0 * pmc[hw_cpu_cycles_evid] / loop_cnt, loop_cnt);
-                }
-            }
-            if (addinfo) {
-                addinfo(duration_ns, &pmc[0], log);
-            }
-            log += sprintf(log, "\n");
-            printf(log_buff);
-        }
-        return pmc;
-    }
-
-    void read(bool verbose = false) {
-        for(size_t i = 0; i < events.size(); i++) values[i] = 0;
-
-        if (::read(group_fd, read_buf, sizeof(read_buf)) == -1) {
-            perror(LINUX_PERF_"read perf event failed:");
-            abort();
-        }
-
-        uint64_t * readv = read_buf;
-        auto nr = *readv++;
-        if (verbose) printf("number of counters:\t%lu\n", nr);
-        time_enabled = 0;
-        time_running = 0;
-        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
-            time_enabled = *readv++;
-            if (verbose) printf("time_enabled:\t%lu\n", time_enabled);
-        }
-        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
-            time_running = *readv++;
-            if (verbose) printf("time_running:\t%lu\n", time_running);
-        }
-
-        for (size_t i = 0; i < nr; i++) {
-            auto value = *readv++;
-            auto id = *readv++;
-            for (size_t k = 0; k < events.size(); k++) {
-                if (id == events[k].id) {
-                    values[k] = value;
-                }
-            }
-        }
-
-        if (verbose) {
-            for (size_t k = 0; k < events.size(); k++) {
-                printf("\t[%lu]: %lu\n", k, values[k]);
-            }
-        }
-    }
-
-    //================================================================================
-    // profiler API with json_dump capability
-    struct ProfileScope {
-        PerfEventGroup* pevg = nullptr;
-        ProfileData* pd = nullptr;
-        bool do_unlock = false;
-        ProfileScope() = default;
-        ProfileScope(PerfEventGroup* pevg, ProfileData* pd, bool do_unlock = false) : pevg(pevg), pd(pd), do_unlock(do_unlock) {}
-
-        // Move only
-        ProfileScope(const ProfileScope&) = delete;
-        ProfileScope& operator=(const ProfileScope&) = delete;
-
-        ProfileScope(ProfileScope&& other) {
-            pevg = other.pevg;
-            pd = other.pd;
-            other.pevg = nullptr;
-            other.pd = nullptr;
-        }
-
-        ProfileScope& operator=(ProfileScope&& other) {
-            if (&other != this) {
-                pevg = other.pevg;
-                pd = other.pd;
-                other.pevg = nullptr;
-                other.pd = nullptr;
-            }
-
-            return *this;
-        }
-
-        uint64_t* finish() {
-            if (do_unlock) {
-                PerfEventGroup::get_sampling_lock() --;
-            }
-            if (!pevg || !pd)
-                return nullptr;
-
-            pd->stop();
-            bool use_pmc = (pevg->num_events_no_pmc == 0);
-            if (use_pmc) {
-                for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++)
-                    if (pevg->events[i].pmc_index)
-                        pd->data[i] = (_rdpmc(pevg->events[i].pmc_index - 1) - pd->data[i]) & pevg->pmc_mask;
-                    else
-                        pd->data[i] = 0;
-            } else {
-                pevg->read();
-                for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++)
-                    pd->data[i] = pevg->values[i] - pd->data[i];
-            }
-            pevg = nullptr;
-            return pd->data;
-        }
-
-        ~ProfileScope() {
-            finish();
-        }
-    };
-
-    ProfileData* _profile(const std::string& title, int id = 0) {
-        if (get_sampling_lock().load() != 0)
-            return nullptr;
-        if (dump_limit == 0)
-            return nullptr;
-        dump_limit --;
-
-        PerfEventCtxSwitch::get().updateRingBuffer();
-
-        all_dump_data.emplace_back(title);
-        auto* pd = &all_dump_data.back();
-        pd->cat = "enable";
-        pd->id = id;
-
-        // use rdpmc if possible
-        bool use_pmc = (num_events_no_pmc == 0);
-        if (use_pmc) {
-            for (size_t i =0; i < events.size() && i < pd->data_size; i++)
-                if (events[i].pmc_index)
-                    pd->data[i] = _rdpmc(events[i].pmc_index - 1);
-        } else {
-            read();
-            for (size_t i =0; i < events.size() && i < pd->data_size; i++)
-                pd->data[i] = values[i];
-        }
-
-        return pd;
-    }
-
-    static PerfEventGroup& get() {
-        thread_local PerfEventGroup pevg({
-            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "HW_CPU_CYCLES"},
-            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "HW_INSTRUCTIONS"},
-            {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, "HW_CACHE_MISSES"},
-            //{PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, "HW_REF_CPU_CYCLES"},
-            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, "SW_CONTEXT_SWITCHES"},
-            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, "SW_TASK_CLOCK"},
-            {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, "SW_PAGE_FAULTS"},
-
-            // XSNP_NONE                : ... were hits in L3 without snoops required                (data is not owned by any other core's local cache)
-            // XSNP_FWD   /XSNP_HITM    : ... were HitM responses from shared L3                     (data was exclusivly/dirty owned by another core's local cache)
-            // XSNP_NO_FWD/XSNP_HIT     : ... were L3 and cross-core snoop hits in on-pkg core cache (data was shared/clean in another core's local cache)
-
-            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x01, 0x00), "XSNP_MISS"},
-            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x02, 0x00), "XSNP_NO_FWD"},
-            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x04, 0x00), "XSNP_FWD"},
-            {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x08, 0x00), "XSNP_NONE"},              
-        });
-        return pevg;
-    }
-
-    // this lock is global, affect all threads
-    static std::atomic_int& get_sampling_lock() {
-        static std::atomic_int sampling_lock{0};
-        return sampling_lock;
-    }
-};
-
-using ProfileScope = PerfEventGroup::ProfileScope;
-
-#if 0
-// pwe-thread event group with default events pre-selected
-template <typename ... Args>
-ProfileScope Profile(const std::string& title, int id = 0, Args&&... args) {
-    auto& pevg = PerfEventGroup::get();
-    auto* pd = pevg._profile(title, id);
-    if (pd) {
-        pd->set_extra_data(std::forward<Args>(args)...);
-    }
-    return {&pevg, pd};
-}
-
-// overload accept sampling_probability, which can be used to disable profile in scope 
-template <typename ... Args>
-ProfileScope Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) {
-    auto& pevg = PerfEventGroup::get();
-    auto* pd = pevg._profile(title, id);
-    if (pd) {
-        pd->set_extra_data(std::forward<Args>(args)...);
-    }
-
-    bool disable_profile = ((std::rand() % 1000)*0.001f >= sampling_probability);
-    if (disable_profile) {
-        PerfEventGroup::get_sampling_lock() ++;
-    }
-    return {&pevg, pd, disable_profile};
-}
-
-inline int Init() {
-    // this is for capture all context switching events
-    PerfEventCtxSwitch::get();
-
-    // this is for making main threads the first process
-    auto dummy = Profile("start");
-    return 0;
-}
-
-#else
-
-template <typename ... Args>
-int Profile(const std::string& title, int id = 0, Args&&... args) {
-    return 0;
-}
-
-// overload accept sampling_probability, which can be used to disable profile in scope 
-template <typename ... Args>
-int Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) {
-    return 0;
-}
-
-inline int Init() {
-    return 0;
-}
-
-#endif
-
-} // namespace LinuxPerf
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 45ea04b8fc753a..e52c2494d82c86 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -80,16 +80,16 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
                                            {{"mode", "numpy"}});
 
         auto multiply_kv = makePattern<opset6::Multiply>({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst});
-        auto computed_bcst3 =
-            makePattern<opset3::Broadcast>({unsqueeze_kv, any_input()},
-                                           {{"mode", "bidirectional"}});
+        auto computed_bcst3 = makePattern<opset3::Broadcast>({unsqueeze_kv, any_input()}, {{"mode", "bidirectional"}});
 
         auto result = makePattern<opset6::Reshape>({multiply_kv | computed_bcst3, any_input()});
         return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv, computed_bcst3);
     };
 
-    std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) = multi_query_bcst(concat_k);
-    std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) = multi_query_bcst(concat_v);
+    std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) =
+        multi_query_bcst(concat_k);
+    std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) =
+        multi_query_bcst(concat_v);
     auto present_k = concat_k | mq_reshape_k;
     auto present_v = concat_v | mq_reshape_v;
 
diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
index 21413addc3187a..3ebfc6f8e4658b 100644
--- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
+++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp
@@ -431,11 +431,7 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
         ov::pass::KeepConstAndDecompression);
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion);
-    auto p = std::getenv("USE_OLD");
-    bool use_old = p && p[0] == '1';
-    if (!use_old) {
-        CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion);
-    }
+    CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations);
     CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true);
     CPU_SET_CALLBACK_X64(
@@ -659,18 +655,6 @@ void Transformations::PreLpt(const std::vector<ov::element::Type>& defaultPrecis
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE);
     CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE);
-    if (use_old) {
-        CPU_SET_CALLBACK_COMMON(
-            manager,
-            [this](const_node_ptr& node) -> bool {
-                std::string errorMsg;
-                // Current SDPA impl is optimized only for LLM models, so we decompose it for others to avoid perf
-                // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes.
-                return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) &&
-                    model->get_variables().size() > 0;
-            },
-            ov::pass::ScaledDotProductAttentionDecomposition);
-    }
 
     // List of enabled/disabled transformations
 
@@ -952,13 +936,6 @@ void Transformations::PostLpt() {
     }
 #endif  // OPENVINO_ARCH_X86_64
 
-    auto p = std::getenv("USE_OLD");
-    bool use_old = p && p[0] == '1';
-    if (use_old) {
-        CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward);
-        CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion);
-        CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape);
-    }
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false);
     CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm);
     CPU_SET_CALLBACK_X64(
@@ -981,20 +958,6 @@ void Transformations::PostLpt() {
     symbolic_pipeline->get_manager()->register_pass<NgramFusion>();
 
     postLPTPassManager.run_passes(model);
-    p = std::getenv("CHECK_SDPA");
-    bool check_sdpa = p && p[0] == '1';
-    if (check_sdpa) {
-        size_t count = 0;
-        for (auto&& node : model->get_ordered_ops()) {
-            if (node->get_type_name() == std::string("ScaledDotProductAttentionWithKVCache")) {
-                count++;
-            }
-        }
-        // char buf[128] = {0};
-        // sprintf(buf, "KVCACHE=%ld", count);
-        // std::cout << buf << std::endl;
-        setenv("KVCACHE", std::to_string(count).c_str(), true);
-    }
 }
 
 void Transformations::MainSnippets(void) {

From 6b2e86bae29a59a60fc35260e9da3c3a72c98eee Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Tue, 24 Dec 2024 19:56:36 +0800
Subject: [PATCH 07/13] use least amount of transformations

---
 .../cpu_opset/common/pass/stateful_sdpa_fusion.cpp              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index e52c2494d82c86..546f0ba397f54d 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -306,7 +306,7 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion");
     manager.set_per_pass_validation(false);
 
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true);
+    CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate);

From 657d6add701ce73ee520c95287f4c6a989dc3969 Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Wed, 25 Dec 2024 17:04:38 +0800
Subject: [PATCH 08/13] fix ci error

---
 .../common/pass/stateful_sdpa_fusion.cpp      |   4 +-
 .../x64/fuse_reshape_transpose_to_sdpa.cpp    | 414 +++++++++---------
 2 files changed, 208 insertions(+), 210 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index 546f0ba397f54d..f3b1d926a3dbc7 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -302,9 +302,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() {
 
 bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     RUN_ON_FUNCTION_SCOPE(SDPASubgraphFusion);
-    using namespace ov::pass::pattern;
-    ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion");
-    manager.set_per_pass_validation(false);
+    ov::pass::Manager manager("SDPASubgraphFusion");
 
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
index a75156c0f69fcb..0da3732c295b5c 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
@@ -34,212 +34,212 @@ namespace test {
  */
 
 // <Input_shapes, [H,S]>
-using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
-using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
-class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
-                                     public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
-                                     public CPUTestsBase {
-public:
-    static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
-        ElementType inType;
-        InputShapeAndReshapeOrder inputShapeAndOrders;
-        std::tie(inType, inputShapeAndOrders) = obj.param;
-        std::ostringstream result;
-        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-        auto& reshapeOrderHS = inputShapeAndOrders.second;
-        result << "IS=";
-        for (const auto& shape : inputShapes) {
-            result << ov::test::utils::partialShape2str({shape.first}) << "_";
-        }
-        result << "TS=";
-        for (const auto& shape : inputShapes) {
-            result << "(";
-            if (!shape.second.empty()) {
-                for (const auto& itr : shape.second) {
-                    result << ov::test::utils::vec2str(itr);
-                }
-            }
-            result << ")_";
-        }
-        result << "Prc=" << inType << "_";
-        result << "ReshapeOrderHS=";
-        result << "(";
-        for (const auto& itr : reshapeOrderHS) {
-            result << itr << ",";
-        }
-        result << ")";
-
-        return result.str();
-    }
-
-    void SetUp() override {
-        ElementType inType;
-        InputShapeAndReshapeOrder inputShapeAndOrders;
-        std::tie(inType, inputShapeAndOrders) = this->GetParam();
-        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-        auto& reshapeOrderHS = inputShapeAndOrders.second;
-        targetDevice = ov::test::utils::DEVICE_CPU;
-        rel_threshold = 1e-2f;
-        configuration[ov::hint::inference_precision.name()] = ov::element::f32;
-        if (inType == ElementType::bf16) {
-            configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
-            rel_threshold = 0.01f;
-        }
-        init_input_shapes(inputShapes);
-
-        // pre SDPA reshape->transpose
-        ov::ParameterVector inputParams(3);
-        ov::SinkVector sinkNodes;
-        OutputVector transposes(3);
-        for (size_t i = 0; i < 3u; i++) {
-            inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
-
-            auto reshape_axis =
-                ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
-
-            std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
-            if (i > 0) {
-                auto var = std::make_shared<ov::op::util::Variable>(
-                    ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
-                auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
-                auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
-                sinkNodes.emplace_back(assign);
-                reshape_input_1 = readvalue;
-            }
-
-            auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
-            auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
-            transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
-        }
-
-        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
-        sdpa->set_friendly_name("mha");
-
-        // post SDPA transpose + reshape
-        auto postOrder =
-            ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
-        auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
-
-        auto constReshape =
-            ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
-        auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
-
-        function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
-                                               sinkNodes,
-                                               inputParams,
-                                               "FuseSDPAReshapeTranspose");
-        targetDevice = ov::test::utils::DEVICE_CPU;
-        functionRefs = function->clone();
-        pass::Manager manager;
-        // decompose ScaledDotProductAttention
-        manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
-        manager.run_passes(functionRefs);
-    }
-
-    template <typename IT, typename T>
-    static void strided_iota(IT first, size_t n, T value, T stride) {
-        for (size_t i = 0; i < n; i++) {
-            *first++ = value;
-            value += stride;
-        }
-    }
-    void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
-        inputs.clear();
-        auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
-            if (param->get_element_type() == ov::element::i32) {
-                ov::Tensor t{ov::element::i32, shape};
-                auto size = ov::shape_size<ov::Shape>(shape);
-                auto* p = static_cast<int*>(t.data());
-                auto start = static_cast<int>(val);
-                for (size_t i = 0; i < size; i++) {
-                    p[i] = (start + i) % size;
-                }
-                inputs.insert({param, t});
-            } else if (param->get_element_type() == ov::element::f32) {
-                ov::Tensor t{ov::element::f32, shape};
-                strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
-                inputs.insert({param, t});
-            } else {
-                ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
-                ov::Tensor t{ov::element::bf16, shape};
-                strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
-                inputs.insert({param, t});
-            }
-        };
-        // q, k, v
-        create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
-        create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
-        create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
-    }
-    void prepare() {
-        compile_model();
-        inferRequest = compiledModel.create_infer_request();
-        ASSERT_TRUE(inferRequest);
-    }
-    void reset() {
-        for (auto&& state : inferRequest.query_state()) {
-            state.reset();
-        }
-    }
-
-    std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
-        function = model;
-        prepare();
-        std::vector<ov::Tensor> outputs;
-        int idx = 0;
-        for (auto&& shapes : targetStaticShapes) {
-            generate(idx++, shapes);
-            for (const auto& input : inputs) {
-                inferRequest.set_tensor(input.first, input.second);
-            }
-            inferRequest.infer();
-            auto outputTensor = inferRequest.get_output_tensor(0);
-            ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
-            outputTensor.copy_to(copy);
-            outputs.push_back(copy);
-            reset();
-        }
-        return outputs;
-    }
-};
-
-TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
-    SKIP_IF_CURRENT_TEST_IS_DISABLED();
-    bool reshape_transpose_fused = false;
-    auto actualOutputs = run_test(function);
-    CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
-    CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
-    CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
-    for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
-        if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
-            reshape_transpose_fused = true;
-        }
-    }
-    ASSERT_TRUE(reshape_transpose_fused);
-
-    auto expectedOutputs = run_test(functionRefs);
-    for (size_t i = 0; i < actualOutputs.size(); i++) {
-        ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
-    }
-}
-
-namespace {
-const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
-    // <Input_shapes, [H,S]>
-    {
-        {{
-             // Q,K,V:[B, L, H*S]
-             {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
-         },
-         // reshapeOrderHS
-         {4, 16}},
-    }};
-
-INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
-                         FuseSDPAReshapeTransposeTest,
-                         ::testing::Combine(::testing::Values(ElementType::f32),
-                                            ::testing::ValuesIn(inputShapeAndReshapeOrders)),
-                         FuseSDPAReshapeTransposeTest::getTestCaseName);
-}  // namespace
+// using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
+// using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
+// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
+//                                      public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
+//                                      public CPUTestsBase {
+// public:
+//     static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
+//         ElementType inType;
+//         InputShapeAndReshapeOrder inputShapeAndOrders;
+//         std::tie(inType, inputShapeAndOrders) = obj.param;
+//         std::ostringstream result;
+//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+//         auto& reshapeOrderHS = inputShapeAndOrders.second;
+//         result << "IS=";
+//         for (const auto& shape : inputShapes) {
+//             result << ov::test::utils::partialShape2str({shape.first}) << "_";
+//         }
+//         result << "TS=";
+//         for (const auto& shape : inputShapes) {
+//             result << "(";
+//             if (!shape.second.empty()) {
+//                 for (const auto& itr : shape.second) {
+//                     result << ov::test::utils::vec2str(itr);
+//                 }
+//             }
+//             result << ")_";
+//         }
+//         result << "Prc=" << inType << "_";
+//         result << "ReshapeOrderHS=";
+//         result << "(";
+//         for (const auto& itr : reshapeOrderHS) {
+//             result << itr << ",";
+//         }
+//         result << ")";
+
+//         return result.str();
+//     }
+
+//     void SetUp() override {
+//         ElementType inType;
+//         InputShapeAndReshapeOrder inputShapeAndOrders;
+//         std::tie(inType, inputShapeAndOrders) = this->GetParam();
+//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+//         auto& reshapeOrderHS = inputShapeAndOrders.second;
+//         targetDevice = ov::test::utils::DEVICE_CPU;
+//         rel_threshold = 1e-2f;
+//         configuration[ov::hint::inference_precision.name()] = ov::element::f32;
+//         if (inType == ElementType::bf16) {
+//             configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
+//             rel_threshold = 0.01f;
+//         }
+//         init_input_shapes(inputShapes);
+
+//         // pre SDPA reshape->transpose
+//         ov::ParameterVector inputParams(3);
+//         ov::SinkVector sinkNodes;
+//         OutputVector transposes(3);
+//         for (size_t i = 0; i < 3u; i++) {
+//             inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
+
+//             auto reshape_axis =
+//                 ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
+
+//             std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
+//             if (i > 0) {
+//                 auto var = std::make_shared<ov::op::util::Variable>(
+//                     ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
+//                 auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
+//                 auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
+//                 sinkNodes.emplace_back(assign);
+//                 reshape_input_1 = readvalue;
+//             }
+
+//             auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
+//             auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
+//             transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
+//         }
+
+//         auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
+//         sdpa->set_friendly_name("mha");
+
+//         // post SDPA transpose + reshape
+//         auto postOrder =
+//             ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
+//         auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
+
+//         auto constReshape =
+//             ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
+//         auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
+
+//         function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
+//                                                sinkNodes,
+//                                                inputParams,
+//                                                "FuseSDPAReshapeTranspose");
+//         targetDevice = ov::test::utils::DEVICE_CPU;
+//         functionRefs = function->clone();
+//         pass::Manager manager;
+//         // decompose ScaledDotProductAttention
+//         manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
+//         manager.run_passes(functionRefs);
+//     }
+
+//     template <typename IT, typename T>
+//     static void strided_iota(IT first, size_t n, T value, T stride) {
+//         for (size_t i = 0; i < n; i++) {
+//             *first++ = value;
+//             value += stride;
+//         }
+//     }
+//     void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
+//         inputs.clear();
+//         auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
+//             if (param->get_element_type() == ov::element::i32) {
+//                 ov::Tensor t{ov::element::i32, shape};
+//                 auto size = ov::shape_size<ov::Shape>(shape);
+//                 auto* p = static_cast<int*>(t.data());
+//                 auto start = static_cast<int>(val);
+//                 for (size_t i = 0; i < size; i++) {
+//                     p[i] = (start + i) % size;
+//                 }
+//                 inputs.insert({param, t});
+//             } else if (param->get_element_type() == ov::element::f32) {
+//                 ov::Tensor t{ov::element::f32, shape};
+//                 strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
+//                 inputs.insert({param, t});
+//             } else {
+//                 ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
+//                 ov::Tensor t{ov::element::bf16, shape};
+//                 strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
+//                 inputs.insert({param, t});
+//             }
+//         };
+//         // q, k, v
+//         create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
+//         create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
+//         create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
+//     }
+//     void prepare() {
+//         compile_model();
+//         inferRequest = compiledModel.create_infer_request();
+//         ASSERT_TRUE(inferRequest);
+//     }
+//     void reset() {
+//         for (auto&& state : inferRequest.query_state()) {
+//             state.reset();
+//         }
+//     }
+
+//     std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
+//         function = model;
+//         prepare();
+//         std::vector<ov::Tensor> outputs;
+//         int idx = 0;
+//         for (auto&& shapes : targetStaticShapes) {
+//             generate(idx++, shapes);
+//             for (const auto& input : inputs) {
+//                 inferRequest.set_tensor(input.first, input.second);
+//             }
+//             inferRequest.infer();
+//             auto outputTensor = inferRequest.get_output_tensor(0);
+//             ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
+//             outputTensor.copy_to(copy);
+//             outputs.push_back(copy);
+//             reset();
+//         }
+//         return outputs;
+//     }
+// };
+
+// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
+//     SKIP_IF_CURRENT_TEST_IS_DISABLED();
+//     bool reshape_transpose_fused = false;
+//     auto actualOutputs = run_test(function);
+//     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
+//     CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
+//     CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
+//     for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
+//         if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
+//             reshape_transpose_fused = true;
+//         }
+//     }
+//     ASSERT_TRUE(reshape_transpose_fused);
+
+//     auto expectedOutputs = run_test(functionRefs);
+//     for (size_t i = 0; i < actualOutputs.size(); i++) {
+//         ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
+//     }
+// }
+
+// namespace {
+// const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
+//     // <Input_shapes, [H,S]>
+//     {
+//         {{
+//              // Q,K,V:[B, L, H*S]
+//              {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
+//          },
+//          // reshapeOrderHS
+//          {4, 16}},
+//     }};
+
+// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
+//                          FuseSDPAReshapeTransposeTest,
+//                          ::testing::Combine(::testing::Values(ElementType::f32),
+//                                             ::testing::ValuesIn(inputShapeAndReshapeOrders)),
+//                          FuseSDPAReshapeTransposeTest::getTestCaseName);
+// }  // namespace
 }  // namespace test
 }  // namespace ov

From 2c7b9642d98818eb385449e42d17a7b8bffe5e7c Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Fri, 27 Dec 2024 05:44:51 +0100
Subject: [PATCH 09/13] add SDPAFuseTransposeReshape back

---
 .../common/pass/stateful_sdpa_fusion.cpp      |   3 +-
 .../x64/fuse_reshape_transpose_to_sdpa.cpp    | 414 +++++++++---------
 2 files changed, 209 insertions(+), 208 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index f3b1d926a3dbc7..fe4a71f44be958 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -23,6 +23,7 @@
 #include "ov_ops/type_relaxed.hpp"
 #include "transformations/common_optimizations/simplify_shape_of_sub_graph.hpp"
 #include "transformations/cpu_opset/common/op/sdpa.hpp"
+#include "transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.hpp"
 #include "transformations/defs.hpp"
 #include "transformations/op_conversions/convert_broadcast3.hpp"
 #include "transformations/transpose_sinking/ts_shape_of.hpp"
@@ -307,7 +308,7 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
-    CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate);
+    CPU_REGISTER_PASS_X64(manager, SDPAFuseTransposeReshape);
 
     manager.run_passes(f);
     return false;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
index 0da3732c295b5c..a75156c0f69fcb 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
@@ -34,212 +34,212 @@ namespace test {
  */
 
 // <Input_shapes, [H,S]>
-// using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
-// using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
-// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
-//                                      public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
-//                                      public CPUTestsBase {
-// public:
-//     static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
-//         ElementType inType;
-//         InputShapeAndReshapeOrder inputShapeAndOrders;
-//         std::tie(inType, inputShapeAndOrders) = obj.param;
-//         std::ostringstream result;
-//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-//         auto& reshapeOrderHS = inputShapeAndOrders.second;
-//         result << "IS=";
-//         for (const auto& shape : inputShapes) {
-//             result << ov::test::utils::partialShape2str({shape.first}) << "_";
-//         }
-//         result << "TS=";
-//         for (const auto& shape : inputShapes) {
-//             result << "(";
-//             if (!shape.second.empty()) {
-//                 for (const auto& itr : shape.second) {
-//                     result << ov::test::utils::vec2str(itr);
-//                 }
-//             }
-//             result << ")_";
-//         }
-//         result << "Prc=" << inType << "_";
-//         result << "ReshapeOrderHS=";
-//         result << "(";
-//         for (const auto& itr : reshapeOrderHS) {
-//             result << itr << ",";
-//         }
-//         result << ")";
-
-//         return result.str();
-//     }
-
-//     void SetUp() override {
-//         ElementType inType;
-//         InputShapeAndReshapeOrder inputShapeAndOrders;
-//         std::tie(inType, inputShapeAndOrders) = this->GetParam();
-//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-//         auto& reshapeOrderHS = inputShapeAndOrders.second;
-//         targetDevice = ov::test::utils::DEVICE_CPU;
-//         rel_threshold = 1e-2f;
-//         configuration[ov::hint::inference_precision.name()] = ov::element::f32;
-//         if (inType == ElementType::bf16) {
-//             configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
-//             rel_threshold = 0.01f;
-//         }
-//         init_input_shapes(inputShapes);
-
-//         // pre SDPA reshape->transpose
-//         ov::ParameterVector inputParams(3);
-//         ov::SinkVector sinkNodes;
-//         OutputVector transposes(3);
-//         for (size_t i = 0; i < 3u; i++) {
-//             inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
-
-//             auto reshape_axis =
-//                 ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
-
-//             std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
-//             if (i > 0) {
-//                 auto var = std::make_shared<ov::op::util::Variable>(
-//                     ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
-//                 auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
-//                 auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
-//                 sinkNodes.emplace_back(assign);
-//                 reshape_input_1 = readvalue;
-//             }
-
-//             auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
-//             auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
-//             transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
-//         }
-
-//         auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
-//         sdpa->set_friendly_name("mha");
-
-//         // post SDPA transpose + reshape
-//         auto postOrder =
-//             ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
-//         auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
-
-//         auto constReshape =
-//             ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
-//         auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
-
-//         function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
-//                                                sinkNodes,
-//                                                inputParams,
-//                                                "FuseSDPAReshapeTranspose");
-//         targetDevice = ov::test::utils::DEVICE_CPU;
-//         functionRefs = function->clone();
-//         pass::Manager manager;
-//         // decompose ScaledDotProductAttention
-//         manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
-//         manager.run_passes(functionRefs);
-//     }
-
-//     template <typename IT, typename T>
-//     static void strided_iota(IT first, size_t n, T value, T stride) {
-//         for (size_t i = 0; i < n; i++) {
-//             *first++ = value;
-//             value += stride;
-//         }
-//     }
-//     void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
-//         inputs.clear();
-//         auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
-//             if (param->get_element_type() == ov::element::i32) {
-//                 ov::Tensor t{ov::element::i32, shape};
-//                 auto size = ov::shape_size<ov::Shape>(shape);
-//                 auto* p = static_cast<int*>(t.data());
-//                 auto start = static_cast<int>(val);
-//                 for (size_t i = 0; i < size; i++) {
-//                     p[i] = (start + i) % size;
-//                 }
-//                 inputs.insert({param, t});
-//             } else if (param->get_element_type() == ov::element::f32) {
-//                 ov::Tensor t{ov::element::f32, shape};
-//                 strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
-//                 inputs.insert({param, t});
-//             } else {
-//                 ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
-//                 ov::Tensor t{ov::element::bf16, shape};
-//                 strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
-//                 inputs.insert({param, t});
-//             }
-//         };
-//         // q, k, v
-//         create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
-//         create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
-//         create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
-//     }
-//     void prepare() {
-//         compile_model();
-//         inferRequest = compiledModel.create_infer_request();
-//         ASSERT_TRUE(inferRequest);
-//     }
-//     void reset() {
-//         for (auto&& state : inferRequest.query_state()) {
-//             state.reset();
-//         }
-//     }
-
-//     std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
-//         function = model;
-//         prepare();
-//         std::vector<ov::Tensor> outputs;
-//         int idx = 0;
-//         for (auto&& shapes : targetStaticShapes) {
-//             generate(idx++, shapes);
-//             for (const auto& input : inputs) {
-//                 inferRequest.set_tensor(input.first, input.second);
-//             }
-//             inferRequest.infer();
-//             auto outputTensor = inferRequest.get_output_tensor(0);
-//             ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
-//             outputTensor.copy_to(copy);
-//             outputs.push_back(copy);
-//             reset();
-//         }
-//         return outputs;
-//     }
-// };
-
-// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
-//     SKIP_IF_CURRENT_TEST_IS_DISABLED();
-//     bool reshape_transpose_fused = false;
-//     auto actualOutputs = run_test(function);
-//     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
-//     CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
-//     CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
-//     for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
-//         if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
-//             reshape_transpose_fused = true;
-//         }
-//     }
-//     ASSERT_TRUE(reshape_transpose_fused);
-
-//     auto expectedOutputs = run_test(functionRefs);
-//     for (size_t i = 0; i < actualOutputs.size(); i++) {
-//         ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
-//     }
-// }
-
-// namespace {
-// const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
-//     // <Input_shapes, [H,S]>
-//     {
-//         {{
-//              // Q,K,V:[B, L, H*S]
-//              {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
-//          },
-//          // reshapeOrderHS
-//          {4, 16}},
-//     }};
-
-// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
-//                          FuseSDPAReshapeTransposeTest,
-//                          ::testing::Combine(::testing::Values(ElementType::f32),
-//                                             ::testing::ValuesIn(inputShapeAndReshapeOrders)),
-//                          FuseSDPAReshapeTransposeTest::getTestCaseName);
-// }  // namespace
+using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
+using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
+class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
+                                     public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
+                                     public CPUTestsBase {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
+        ElementType inType;
+        InputShapeAndReshapeOrder inputShapeAndOrders;
+        std::tie(inType, inputShapeAndOrders) = obj.param;
+        std::ostringstream result;
+        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+        auto& reshapeOrderHS = inputShapeAndOrders.second;
+        result << "IS=";
+        for (const auto& shape : inputShapes) {
+            result << ov::test::utils::partialShape2str({shape.first}) << "_";
+        }
+        result << "TS=";
+        for (const auto& shape : inputShapes) {
+            result << "(";
+            if (!shape.second.empty()) {
+                for (const auto& itr : shape.second) {
+                    result << ov::test::utils::vec2str(itr);
+                }
+            }
+            result << ")_";
+        }
+        result << "Prc=" << inType << "_";
+        result << "ReshapeOrderHS=";
+        result << "(";
+        for (const auto& itr : reshapeOrderHS) {
+            result << itr << ",";
+        }
+        result << ")";
+
+        return result.str();
+    }
+
+    void SetUp() override {
+        ElementType inType;
+        InputShapeAndReshapeOrder inputShapeAndOrders;
+        std::tie(inType, inputShapeAndOrders) = this->GetParam();
+        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+        auto& reshapeOrderHS = inputShapeAndOrders.second;
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        rel_threshold = 1e-2f;
+        configuration[ov::hint::inference_precision.name()] = ov::element::f32;
+        if (inType == ElementType::bf16) {
+            configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
+            rel_threshold = 0.01f;
+        }
+        init_input_shapes(inputShapes);
+
+        // pre SDPA reshape->transpose
+        ov::ParameterVector inputParams(3);
+        ov::SinkVector sinkNodes;
+        OutputVector transposes(3);
+        for (size_t i = 0; i < 3u; i++) {
+            inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
+
+            auto reshape_axis =
+                ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
+
+            std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
+            if (i > 0) {
+                auto var = std::make_shared<ov::op::util::Variable>(
+                    ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
+                auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
+                auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
+                sinkNodes.emplace_back(assign);
+                reshape_input_1 = readvalue;
+            }
+
+            auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
+            auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
+            transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
+        }
+
+        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
+        sdpa->set_friendly_name("mha");
+
+        // post SDPA transpose + reshape
+        auto postOrder =
+            ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
+        auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
+
+        auto constReshape =
+            ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
+        auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
+
+        function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
+                                               sinkNodes,
+                                               inputParams,
+                                               "FuseSDPAReshapeTranspose");
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        functionRefs = function->clone();
+        pass::Manager manager;
+        // decompose ScaledDotProductAttention
+        manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
+        manager.run_passes(functionRefs);
+    }
+
+    template <typename IT, typename T>
+    static void strided_iota(IT first, size_t n, T value, T stride) {
+        for (size_t i = 0; i < n; i++) {
+            *first++ = value;
+            value += stride;
+        }
+    }
+    void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
+        inputs.clear();
+        auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
+            if (param->get_element_type() == ov::element::i32) {
+                ov::Tensor t{ov::element::i32, shape};
+                auto size = ov::shape_size<ov::Shape>(shape);
+                auto* p = static_cast<int*>(t.data());
+                auto start = static_cast<int>(val);
+                for (size_t i = 0; i < size; i++) {
+                    p[i] = (start + i) % size;
+                }
+                inputs.insert({param, t});
+            } else if (param->get_element_type() == ov::element::f32) {
+                ov::Tensor t{ov::element::f32, shape};
+                strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
+                inputs.insert({param, t});
+            } else {
+                ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
+                ov::Tensor t{ov::element::bf16, shape};
+                strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
+                inputs.insert({param, t});
+            }
+        };
+        // q, k, v
+        create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
+        create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
+        create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
+    }
+    void prepare() {
+        compile_model();
+        inferRequest = compiledModel.create_infer_request();
+        ASSERT_TRUE(inferRequest);
+    }
+    void reset() {
+        for (auto&& state : inferRequest.query_state()) {
+            state.reset();
+        }
+    }
+
+    std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
+        function = model;
+        prepare();
+        std::vector<ov::Tensor> outputs;
+        int idx = 0;
+        for (auto&& shapes : targetStaticShapes) {
+            generate(idx++, shapes);
+            for (const auto& input : inputs) {
+                inferRequest.set_tensor(input.first, input.second);
+            }
+            inferRequest.infer();
+            auto outputTensor = inferRequest.get_output_tensor(0);
+            ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
+            outputTensor.copy_to(copy);
+            outputs.push_back(copy);
+            reset();
+        }
+        return outputs;
+    }
+};
+
+TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
+    SKIP_IF_CURRENT_TEST_IS_DISABLED();
+    bool reshape_transpose_fused = false;
+    auto actualOutputs = run_test(function);
+    CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
+    CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
+    CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
+    for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
+        if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
+            reshape_transpose_fused = true;
+        }
+    }
+    ASSERT_TRUE(reshape_transpose_fused);
+
+    auto expectedOutputs = run_test(functionRefs);
+    for (size_t i = 0; i < actualOutputs.size(); i++) {
+        ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
+    }
+}
+
+namespace {
+const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
+    // <Input_shapes, [H,S]>
+    {
+        {{
+             // Q,K,V:[B, L, H*S]
+             {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
+         },
+         // reshapeOrderHS
+         {4, 16}},
+    }};
+
+INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
+                         FuseSDPAReshapeTransposeTest,
+                         ::testing::Combine(::testing::Values(ElementType::f32),
+                                            ::testing::ValuesIn(inputShapeAndReshapeOrders)),
+                         FuseSDPAReshapeTransposeTest::getTestCaseName);
+}  // namespace
 }  // namespace test
 }  // namespace ov

From e2857157c1beb84cbcbd3a20fb4432f8345ec4e1 Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Fri, 27 Dec 2024 08:20:34 +0100
Subject: [PATCH 10/13] modify test to cover the change

---
 .../src/common/concat_multiple_query_sdp.cpp       |  6 +++---
 .../src/common/concat_transpose_sdp_transpose.cpp  | 14 +++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp
index d74ab99fb3d5ab..fe5ba2b7eac5e7 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp
@@ -152,9 +152,9 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface<ConcatMultiQu
         auto unsqueezeK = std::make_shared<ov::op::v0::Unsqueeze>(concatK, unsquezeAxis);
         auto unsqueezeV = std::make_shared<ov::op::v0::Unsqueeze>(concatV, unsquezeAxis);
 
-        auto targetShape = ov::op::v0::Constant::create(qkvType, {1, 1, 1, 4, 1}, {1});
-        auto broadcastK = std::make_shared<ov::op::v1::Multiply>(unsqueezeK, targetShape);
-        auto broadcastV = std::make_shared<ov::op::v1::Multiply>(unsqueezeV, targetShape);
+        auto targetShape = ov::op::v0::Constant::create(element::i32, {5}, {1, 1, 1, 4, 1});
+        auto broadcastK = std::make_shared<ov::op::v3::Broadcast>(unsqueezeK, targetShape, op::BroadcastType::BIDIRECTIONAL);
+        auto broadcastV = std::make_shared<ov::op::v3::Broadcast>(unsqueezeV, targetShape, op::BroadcastType::BIDIRECTIONAL);
 
         auto target4D = ov::op::v0::Constant::create(ov::element::i32, {4}, {0, 0, 8, 64});
 
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
index f4166544af2bf2..8ba978e32c4b9c 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp
@@ -71,7 +71,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPT
             result << ")_";
         }
         result << "Prc=" << inType << "_";
-        result << "HasShapeOf=" << hasShapeof;
+        result << "HasShapeOf=" << hasShapeof << "_";
         result << "TransposeOrder=";
         result << "(";
         for (const auto& itr : transposeOrder) {
@@ -85,7 +85,6 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPT
     void SetUp() override {
         ElementType inType;
         InputShapeAndTransposeOrder inputShapeAndOrders;
-        bool hasShapeOf;
         std::tie(inType, inputShapeAndOrders, hasShapeOf) = this->GetParam();
         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
         transposeOrder = inputShapeAndOrders.second;
@@ -124,6 +123,10 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPT
         // pre SDPA transpose
         auto preOrder = ov::op::v0::Constant::create(ov::element::i32, {4}, transposeOrder);
         auto transposeQ = std::make_shared<ov::op::v1::Transpose>(inputParams[0], preOrder);
+        std::shared_ptr<ov::Node> transposeQ_shapeof;
+        if (hasShapeOf) {
+            transposeQ_shapeof = std::make_shared<ov::op::v0::ShapeOf>(transposeQ);
+        }
 
         auto concat_axis = transposeOrder[2];
         auto beam_idx = std::make_shared<ov::op::v0::Parameter>(ElementType::i32, ov::PartialShape{-1});
@@ -166,6 +169,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPT
         if (hasShapeOf) {
             results.push_back(pastk_shapeof);
             results.push_back(pastv_shapeof);
+            results.push_back(transposeQ_shapeof);
         }
         ov::SinkVector sinks{pastk_assign, pastv_assign};
         function = std::make_shared<ov::Model>(results, sinks, inputParams, "ConcatTranposeSDP");
@@ -237,6 +241,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface<ConcatSDPT
         }
     }
     std::vector<size_t> transposeOrder;
+    bool hasShapeOf;
 };
 
 class ConcatSDPTransposeTest : public ConcatSDPTransposeTestBase {
@@ -287,7 +292,10 @@ TEST_P(ConcatSDPTransposeTest, CompareWithRefs) {
     CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0);
     CheckNumberOfNodesWithType(compiledModel, "Reorder", 0);
     CheckNumberOfNodesWithType(compiledModel, "Transpose", 1);
-    CheckNumberOfNodesWithType(compiledModel, "Gather", 0);
+    // Transformation TSShapeOfForward will change:
+    // ?->transpose->shapeof ==> ?-->shapeof->gather
+    //                            |->transpose
+    CheckNumberOfNodesWithType(compiledModel, "Gather", hasShapeOf ? 1 : 0);
     auto expectedOutputs = run_test(functionRefs);
     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0);
     for (size_t i = 0; i < actualOutputs.size(); i++) {

From 02c2d1938704d956275c1c1d2fad6cc7726c688a Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Mon, 30 Dec 2024 11:57:01 +0100
Subject: [PATCH 11/13] disable SDPAFuseTransposeReshape

---
 .../common/pass/stateful_sdpa_fusion.cpp      |   1 -
 .../x64/fuse_reshape_transpose_to_sdpa.cpp    | 414 +++++++++---------
 2 files changed, 207 insertions(+), 208 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index fe4a71f44be958..e930abf1102a8c 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -308,7 +308,6 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
-    CPU_REGISTER_PASS_X64(manager, SDPAFuseTransposeReshape);
 
     manager.run_passes(f);
     return false;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
index a75156c0f69fcb..0da3732c295b5c 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
@@ -34,212 +34,212 @@ namespace test {
  */
 
 // <Input_shapes, [H,S]>
-using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
-using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
-class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
-                                     public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
-                                     public CPUTestsBase {
-public:
-    static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
-        ElementType inType;
-        InputShapeAndReshapeOrder inputShapeAndOrders;
-        std::tie(inType, inputShapeAndOrders) = obj.param;
-        std::ostringstream result;
-        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-        auto& reshapeOrderHS = inputShapeAndOrders.second;
-        result << "IS=";
-        for (const auto& shape : inputShapes) {
-            result << ov::test::utils::partialShape2str({shape.first}) << "_";
-        }
-        result << "TS=";
-        for (const auto& shape : inputShapes) {
-            result << "(";
-            if (!shape.second.empty()) {
-                for (const auto& itr : shape.second) {
-                    result << ov::test::utils::vec2str(itr);
-                }
-            }
-            result << ")_";
-        }
-        result << "Prc=" << inType << "_";
-        result << "ReshapeOrderHS=";
-        result << "(";
-        for (const auto& itr : reshapeOrderHS) {
-            result << itr << ",";
-        }
-        result << ")";
-
-        return result.str();
-    }
-
-    void SetUp() override {
-        ElementType inType;
-        InputShapeAndReshapeOrder inputShapeAndOrders;
-        std::tie(inType, inputShapeAndOrders) = this->GetParam();
-        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-        auto& reshapeOrderHS = inputShapeAndOrders.second;
-        targetDevice = ov::test::utils::DEVICE_CPU;
-        rel_threshold = 1e-2f;
-        configuration[ov::hint::inference_precision.name()] = ov::element::f32;
-        if (inType == ElementType::bf16) {
-            configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
-            rel_threshold = 0.01f;
-        }
-        init_input_shapes(inputShapes);
-
-        // pre SDPA reshape->transpose
-        ov::ParameterVector inputParams(3);
-        ov::SinkVector sinkNodes;
-        OutputVector transposes(3);
-        for (size_t i = 0; i < 3u; i++) {
-            inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
-
-            auto reshape_axis =
-                ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
-
-            std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
-            if (i > 0) {
-                auto var = std::make_shared<ov::op::util::Variable>(
-                    ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
-                auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
-                auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
-                sinkNodes.emplace_back(assign);
-                reshape_input_1 = readvalue;
-            }
-
-            auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
-            auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
-            transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
-        }
-
-        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
-        sdpa->set_friendly_name("mha");
-
-        // post SDPA transpose + reshape
-        auto postOrder =
-            ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
-        auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
-
-        auto constReshape =
-            ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
-        auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
-
-        function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
-                                               sinkNodes,
-                                               inputParams,
-                                               "FuseSDPAReshapeTranspose");
-        targetDevice = ov::test::utils::DEVICE_CPU;
-        functionRefs = function->clone();
-        pass::Manager manager;
-        // decompose ScaledDotProductAttention
-        manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
-        manager.run_passes(functionRefs);
-    }
-
-    template <typename IT, typename T>
-    static void strided_iota(IT first, size_t n, T value, T stride) {
-        for (size_t i = 0; i < n; i++) {
-            *first++ = value;
-            value += stride;
-        }
-    }
-    void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
-        inputs.clear();
-        auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
-            if (param->get_element_type() == ov::element::i32) {
-                ov::Tensor t{ov::element::i32, shape};
-                auto size = ov::shape_size<ov::Shape>(shape);
-                auto* p = static_cast<int*>(t.data());
-                auto start = static_cast<int>(val);
-                for (size_t i = 0; i < size; i++) {
-                    p[i] = (start + i) % size;
-                }
-                inputs.insert({param, t});
-            } else if (param->get_element_type() == ov::element::f32) {
-                ov::Tensor t{ov::element::f32, shape};
-                strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
-                inputs.insert({param, t});
-            } else {
-                ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
-                ov::Tensor t{ov::element::bf16, shape};
-                strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
-                inputs.insert({param, t});
-            }
-        };
-        // q, k, v
-        create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
-        create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
-        create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
-    }
-    void prepare() {
-        compile_model();
-        inferRequest = compiledModel.create_infer_request();
-        ASSERT_TRUE(inferRequest);
-    }
-    void reset() {
-        for (auto&& state : inferRequest.query_state()) {
-            state.reset();
-        }
-    }
-
-    std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
-        function = model;
-        prepare();
-        std::vector<ov::Tensor> outputs;
-        int idx = 0;
-        for (auto&& shapes : targetStaticShapes) {
-            generate(idx++, shapes);
-            for (const auto& input : inputs) {
-                inferRequest.set_tensor(input.first, input.second);
-            }
-            inferRequest.infer();
-            auto outputTensor = inferRequest.get_output_tensor(0);
-            ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
-            outputTensor.copy_to(copy);
-            outputs.push_back(copy);
-            reset();
-        }
-        return outputs;
-    }
-};
-
-TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
-    SKIP_IF_CURRENT_TEST_IS_DISABLED();
-    bool reshape_transpose_fused = false;
-    auto actualOutputs = run_test(function);
-    CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
-    CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
-    CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
-    for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
-        if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
-            reshape_transpose_fused = true;
-        }
-    }
-    ASSERT_TRUE(reshape_transpose_fused);
-
-    auto expectedOutputs = run_test(functionRefs);
-    for (size_t i = 0; i < actualOutputs.size(); i++) {
-        ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
-    }
-}
-
-namespace {
-const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
-    // <Input_shapes, [H,S]>
-    {
-        {{
-             // Q,K,V:[B, L, H*S]
-             {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
-         },
-         // reshapeOrderHS
-         {4, 16}},
-    }};
-
-INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
-                         FuseSDPAReshapeTransposeTest,
-                         ::testing::Combine(::testing::Values(ElementType::f32),
-                                            ::testing::ValuesIn(inputShapeAndReshapeOrders)),
-                         FuseSDPAReshapeTransposeTest::getTestCaseName);
-}  // namespace
+// using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
+// using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
+// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
+//                                      public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
+//                                      public CPUTestsBase {
+// public:
+//     static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
+//         ElementType inType;
+//         InputShapeAndReshapeOrder inputShapeAndOrders;
+//         std::tie(inType, inputShapeAndOrders) = obj.param;
+//         std::ostringstream result;
+//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+//         auto& reshapeOrderHS = inputShapeAndOrders.second;
+//         result << "IS=";
+//         for (const auto& shape : inputShapes) {
+//             result << ov::test::utils::partialShape2str({shape.first}) << "_";
+//         }
+//         result << "TS=";
+//         for (const auto& shape : inputShapes) {
+//             result << "(";
+//             if (!shape.second.empty()) {
+//                 for (const auto& itr : shape.second) {
+//                     result << ov::test::utils::vec2str(itr);
+//                 }
+//             }
+//             result << ")_";
+//         }
+//         result << "Prc=" << inType << "_";
+//         result << "ReshapeOrderHS=";
+//         result << "(";
+//         for (const auto& itr : reshapeOrderHS) {
+//             result << itr << ",";
+//         }
+//         result << ")";
+
+//         return result.str();
+//     }
+
+//     void SetUp() override {
+//         ElementType inType;
+//         InputShapeAndReshapeOrder inputShapeAndOrders;
+//         std::tie(inType, inputShapeAndOrders) = this->GetParam();
+//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+//         auto& reshapeOrderHS = inputShapeAndOrders.second;
+//         targetDevice = ov::test::utils::DEVICE_CPU;
+//         rel_threshold = 1e-2f;
+//         configuration[ov::hint::inference_precision.name()] = ov::element::f32;
+//         if (inType == ElementType::bf16) {
+//             configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
+//             rel_threshold = 0.01f;
+//         }
+//         init_input_shapes(inputShapes);
+
+//         // pre SDPA reshape->transpose
+//         ov::ParameterVector inputParams(3);
+//         ov::SinkVector sinkNodes;
+//         OutputVector transposes(3);
+//         for (size_t i = 0; i < 3u; i++) {
+//             inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
+
+//             auto reshape_axis =
+//                 ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
+
+//             std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
+//             if (i > 0) {
+//                 auto var = std::make_shared<ov::op::util::Variable>(
+//                     ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
+//                 auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
+//                 auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
+//                 sinkNodes.emplace_back(assign);
+//                 reshape_input_1 = readvalue;
+//             }
+
+//             auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
+//             auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
+//             transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
+//         }
+
+//         auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
+//         sdpa->set_friendly_name("mha");
+
+//         // post SDPA transpose + reshape
+//         auto postOrder =
+//             ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
+//         auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
+
+//         auto constReshape =
+//             ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
+//         auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
+
+//         function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
+//                                                sinkNodes,
+//                                                inputParams,
+//                                                "FuseSDPAReshapeTranspose");
+//         targetDevice = ov::test::utils::DEVICE_CPU;
+//         functionRefs = function->clone();
+//         pass::Manager manager;
+//         // decompose ScaledDotProductAttention
+//         manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
+//         manager.run_passes(functionRefs);
+//     }
+
+//     template <typename IT, typename T>
+//     static void strided_iota(IT first, size_t n, T value, T stride) {
+//         for (size_t i = 0; i < n; i++) {
+//             *first++ = value;
+//             value += stride;
+//         }
+//     }
+//     void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
+//         inputs.clear();
+//         auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
+//             if (param->get_element_type() == ov::element::i32) {
+//                 ov::Tensor t{ov::element::i32, shape};
+//                 auto size = ov::shape_size<ov::Shape>(shape);
+//                 auto* p = static_cast<int*>(t.data());
+//                 auto start = static_cast<int>(val);
+//                 for (size_t i = 0; i < size; i++) {
+//                     p[i] = (start + i) % size;
+//                 }
+//                 inputs.insert({param, t});
+//             } else if (param->get_element_type() == ov::element::f32) {
+//                 ov::Tensor t{ov::element::f32, shape};
+//                 strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
+//                 inputs.insert({param, t});
+//             } else {
+//                 ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
+//                 ov::Tensor t{ov::element::bf16, shape};
+//                 strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
+//                 inputs.insert({param, t});
+//             }
+//         };
+//         // q, k, v
+//         create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
+//         create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
+//         create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
+//     }
+//     void prepare() {
+//         compile_model();
+//         inferRequest = compiledModel.create_infer_request();
+//         ASSERT_TRUE(inferRequest);
+//     }
+//     void reset() {
+//         for (auto&& state : inferRequest.query_state()) {
+//             state.reset();
+//         }
+//     }
+
+//     std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
+//         function = model;
+//         prepare();
+//         std::vector<ov::Tensor> outputs;
+//         int idx = 0;
+//         for (auto&& shapes : targetStaticShapes) {
+//             generate(idx++, shapes);
+//             for (const auto& input : inputs) {
+//                 inferRequest.set_tensor(input.first, input.second);
+//             }
+//             inferRequest.infer();
+//             auto outputTensor = inferRequest.get_output_tensor(0);
+//             ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
+//             outputTensor.copy_to(copy);
+//             outputs.push_back(copy);
+//             reset();
+//         }
+//         return outputs;
+//     }
+// };
+
+// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
+//     SKIP_IF_CURRENT_TEST_IS_DISABLED();
+//     bool reshape_transpose_fused = false;
+//     auto actualOutputs = run_test(function);
+//     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
+//     CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
+//     CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
+//     for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
+//         if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
+//             reshape_transpose_fused = true;
+//         }
+//     }
+//     ASSERT_TRUE(reshape_transpose_fused);
+
+//     auto expectedOutputs = run_test(functionRefs);
+//     for (size_t i = 0; i < actualOutputs.size(); i++) {
+//         ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
+//     }
+// }
+
+// namespace {
+// const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
+//     // <Input_shapes, [H,S]>
+//     {
+//         {{
+//              // Q,K,V:[B, L, H*S]
+//              {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
+//          },
+//          // reshapeOrderHS
+//          {4, 16}},
+//     }};
+
+// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
+//                          FuseSDPAReshapeTransposeTest,
+//                          ::testing::Combine(::testing::Values(ElementType::f32),
+//                                             ::testing::ValuesIn(inputShapeAndReshapeOrders)),
+//                          FuseSDPAReshapeTransposeTest::getTestCaseName);
+// }  // namespace
 }  // namespace test
 }  // namespace ov

From 1b9357717af73003e9e3a3c53e3238b151df5bcc Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Fri, 3 Jan 2025 05:31:50 +0100
Subject: [PATCH 12/13] apply review comments

---
 .../common/pass/stateful_sdpa_fusion.cpp      |   2 +
 .../x64/fuse_reshape_transpose_to_sdpa.cpp    | 414 +++++++++---------
 2 files changed, 209 insertions(+), 207 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index e930abf1102a8c..adc590b41cc948 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -308,6 +308,8 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
+    // TODO: SDPAFuseTransposeReshape may cause regressions in icx.
+    // CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
 
     manager.run_passes(f);
     return false;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
index 0da3732c295b5c..a646eb03df1a31 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
@@ -34,212 +34,212 @@ namespace test {
  */
 
 // <Input_shapes, [H,S]>
-// using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
-// using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
-// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
-//                                      public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
-//                                      public CPUTestsBase {
-// public:
-//     static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
-//         ElementType inType;
-//         InputShapeAndReshapeOrder inputShapeAndOrders;
-//         std::tie(inType, inputShapeAndOrders) = obj.param;
-//         std::ostringstream result;
-//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-//         auto& reshapeOrderHS = inputShapeAndOrders.second;
-//         result << "IS=";
-//         for (const auto& shape : inputShapes) {
-//             result << ov::test::utils::partialShape2str({shape.first}) << "_";
-//         }
-//         result << "TS=";
-//         for (const auto& shape : inputShapes) {
-//             result << "(";
-//             if (!shape.second.empty()) {
-//                 for (const auto& itr : shape.second) {
-//                     result << ov::test::utils::vec2str(itr);
-//                 }
-//             }
-//             result << ")_";
-//         }
-//         result << "Prc=" << inType << "_";
-//         result << "ReshapeOrderHS=";
-//         result << "(";
-//         for (const auto& itr : reshapeOrderHS) {
-//             result << itr << ",";
-//         }
-//         result << ")";
-
-//         return result.str();
-//     }
-
-//     void SetUp() override {
-//         ElementType inType;
-//         InputShapeAndReshapeOrder inputShapeAndOrders;
-//         std::tie(inType, inputShapeAndOrders) = this->GetParam();
-//         std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
-//         auto& reshapeOrderHS = inputShapeAndOrders.second;
-//         targetDevice = ov::test::utils::DEVICE_CPU;
-//         rel_threshold = 1e-2f;
-//         configuration[ov::hint::inference_precision.name()] = ov::element::f32;
-//         if (inType == ElementType::bf16) {
-//             configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
-//             rel_threshold = 0.01f;
-//         }
-//         init_input_shapes(inputShapes);
-
-//         // pre SDPA reshape->transpose
-//         ov::ParameterVector inputParams(3);
-//         ov::SinkVector sinkNodes;
-//         OutputVector transposes(3);
-//         for (size_t i = 0; i < 3u; i++) {
-//             inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
-
-//             auto reshape_axis =
-//                 ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
-
-//             std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
-//             if (i > 0) {
-//                 auto var = std::make_shared<ov::op::util::Variable>(
-//                     ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
-//                 auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
-//                 auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
-//                 sinkNodes.emplace_back(assign);
-//                 reshape_input_1 = readvalue;
-//             }
-
-//             auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
-//             auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
-//             transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
-//         }
-
-//         auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
-//         sdpa->set_friendly_name("mha");
-
-//         // post SDPA transpose + reshape
-//         auto postOrder =
-//             ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
-//         auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
-
-//         auto constReshape =
-//             ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
-//         auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
-
-//         function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
-//                                                sinkNodes,
-//                                                inputParams,
-//                                                "FuseSDPAReshapeTranspose");
-//         targetDevice = ov::test::utils::DEVICE_CPU;
-//         functionRefs = function->clone();
-//         pass::Manager manager;
-//         // decompose ScaledDotProductAttention
-//         manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
-//         manager.run_passes(functionRefs);
-//     }
-
-//     template <typename IT, typename T>
-//     static void strided_iota(IT first, size_t n, T value, T stride) {
-//         for (size_t i = 0; i < n; i++) {
-//             *first++ = value;
-//             value += stride;
-//         }
-//     }
-//     void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
-//         inputs.clear();
-//         auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
-//             if (param->get_element_type() == ov::element::i32) {
-//                 ov::Tensor t{ov::element::i32, shape};
-//                 auto size = ov::shape_size<ov::Shape>(shape);
-//                 auto* p = static_cast<int*>(t.data());
-//                 auto start = static_cast<int>(val);
-//                 for (size_t i = 0; i < size; i++) {
-//                     p[i] = (start + i) % size;
-//                 }
-//                 inputs.insert({param, t});
-//             } else if (param->get_element_type() == ov::element::f32) {
-//                 ov::Tensor t{ov::element::f32, shape};
-//                 strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
-//                 inputs.insert({param, t});
-//             } else {
-//                 ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
-//                 ov::Tensor t{ov::element::bf16, shape};
-//                 strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
-//                 inputs.insert({param, t});
-//             }
-//         };
-//         // q, k, v
-//         create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
-//         create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
-//         create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
-//     }
-//     void prepare() {
-//         compile_model();
-//         inferRequest = compiledModel.create_infer_request();
-//         ASSERT_TRUE(inferRequest);
-//     }
-//     void reset() {
-//         for (auto&& state : inferRequest.query_state()) {
-//             state.reset();
-//         }
-//     }
-
-//     std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
-//         function = model;
-//         prepare();
-//         std::vector<ov::Tensor> outputs;
-//         int idx = 0;
-//         for (auto&& shapes : targetStaticShapes) {
-//             generate(idx++, shapes);
-//             for (const auto& input : inputs) {
-//                 inferRequest.set_tensor(input.first, input.second);
-//             }
-//             inferRequest.infer();
-//             auto outputTensor = inferRequest.get_output_tensor(0);
-//             ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
-//             outputTensor.copy_to(copy);
-//             outputs.push_back(copy);
-//             reset();
-//         }
-//         return outputs;
-//     }
-// };
-
-// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
-//     SKIP_IF_CURRENT_TEST_IS_DISABLED();
-//     bool reshape_transpose_fused = false;
-//     auto actualOutputs = run_test(function);
-//     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
-//     CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
-//     CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
-//     for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
-//         if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
-//             reshape_transpose_fused = true;
-//         }
-//     }
-//     ASSERT_TRUE(reshape_transpose_fused);
-
-//     auto expectedOutputs = run_test(functionRefs);
-//     for (size_t i = 0; i < actualOutputs.size(); i++) {
-//         ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
-//     }
-// }
-
-// namespace {
-// const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
-//     // <Input_shapes, [H,S]>
-//     {
-//         {{
-//              // Q,K,V:[B, L, H*S]
-//              {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
-//          },
-//          // reshapeOrderHS
-//          {4, 16}},
-//     }};
-
-// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
-//                          FuseSDPAReshapeTransposeTest,
-//                          ::testing::Combine(::testing::Values(ElementType::f32),
-//                                             ::testing::ValuesIn(inputShapeAndReshapeOrders)),
-//                          FuseSDPAReshapeTransposeTest::getTestCaseName);
-// }  // namespace
+using InputShapeAndReshapeOrder = std::pair<std::vector<InputShape>, std::vector<int32_t>>;
+using FuseSDPAReshapeTransposeTestParams = std::tuple<ElementType, InputShapeAndReshapeOrder>;
+class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
+                                     public testing::WithParamInterface<FuseSDPAReshapeTransposeTestParams>,
+                                     public CPUTestsBase {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<FuseSDPAReshapeTransposeTestParams>& obj) {
+        ElementType inType;
+        InputShapeAndReshapeOrder inputShapeAndOrders;
+        std::tie(inType, inputShapeAndOrders) = obj.param;
+        std::ostringstream result;
+        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+        auto& reshapeOrderHS = inputShapeAndOrders.second;
+        result << "IS=";
+        for (const auto& shape : inputShapes) {
+            result << ov::test::utils::partialShape2str({shape.first}) << "_";
+        }
+        result << "TS=";
+        for (const auto& shape : inputShapes) {
+            result << "(";
+            if (!shape.second.empty()) {
+                for (const auto& itr : shape.second) {
+                    result << ov::test::utils::vec2str(itr);
+                }
+            }
+            result << ")_";
+        }
+        result << "Prc=" << inType << "_";
+        result << "ReshapeOrderHS=";
+        result << "(";
+        for (const auto& itr : reshapeOrderHS) {
+            result << itr << ",";
+        }
+        result << ")";
+
+        return result.str();
+    }
+
+    void SetUp() override {
+        ElementType inType;
+        InputShapeAndReshapeOrder inputShapeAndOrders;
+        std::tie(inType, inputShapeAndOrders) = this->GetParam();
+        std::vector<InputShape>& inputShapes = inputShapeAndOrders.first;
+        auto& reshapeOrderHS = inputShapeAndOrders.second;
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        rel_threshold = 1e-2f;
+        configuration[ov::hint::inference_precision.name()] = ov::element::f32;
+        if (inType == ElementType::bf16) {
+            configuration[ov::hint::inference_precision.name()] = ov::element::bf16;
+            rel_threshold = 0.01f;
+        }
+        init_input_shapes(inputShapes);
+
+        // pre SDPA reshape->transpose
+        ov::ParameterVector inputParams(3);
+        ov::SinkVector sinkNodes;
+        OutputVector transposes(3);
+        for (size_t i = 0; i < 3u; i++) {
+            inputParams[i] = std::make_shared<ov::op::v0::Parameter>(inType, inputDynamicShapes[0]);
+
+            auto reshape_axis =
+                ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]});
+
+            std::shared_ptr<ov::Node> reshape_input_1 = inputParams[i];
+            if (i > 0) {
+                auto var = std::make_shared<ov::op::util::Variable>(
+                    ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)});
+                auto readvalue = std::make_shared<ov::op::v6::ReadValue>(inputParams[i], var);
+                auto assign = std::make_shared<ov::op::v6::Assign>(readvalue, var);
+                sinkNodes.emplace_back(assign);
+                reshape_input_1 = readvalue;
+            }
+
+            auto reshape = std::make_shared<ov::op::v1::Reshape>(reshape_input_1, reshape_axis, true);
+            auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3});
+            transposes[i] = std::make_shared<ov::op::v1::Transpose>(reshape, transposeOrder);
+        }
+
+        auto sdpa = std::make_shared<ov::op::v13::ScaledDotProductAttention>(transposes, false);
+        sdpa->set_friendly_name("mha");
+
+        // post SDPA transpose + reshape
+        auto postOrder =
+            ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector<size_t>{0, 2, 1, 3});  // BHLS -> BLHS
+        auto transposeSDPA = std::make_shared<ov::op::v1::Transpose>(sdpa, postOrder);
+
+        auto constReshape =
+            ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]});
+        auto reshapeSDPA = std::make_shared<ov::op::v1::Reshape>(transposeSDPA, constReshape, true);  // BLHS -> B,L,HxS
+
+        function = std::make_shared<ov::Model>(ov::OutputVector{reshapeSDPA},
+                                               sinkNodes,
+                                               inputParams,
+                                               "FuseSDPAReshapeTranspose");
+        targetDevice = ov::test::utils::DEVICE_CPU;
+        functionRefs = function->clone();
+        pass::Manager manager;
+        // decompose ScaledDotProductAttention
+        manager.register_pass<ov::pass::ScaledDotProductAttentionDecomposition>();
+        manager.run_passes(functionRefs);
+    }
+
+    template <typename IT, typename T>
+    static void strided_iota(IT first, size_t n, T value, T stride) {
+        for (size_t i = 0; i < n; i++) {
+            *first++ = value;
+            value += stride;
+        }
+    }
+    void generate(int idx, const std::vector<ov::Shape>& targetInputStaticShapes) {
+        inputs.clear();
+        auto create_input = [this] (std::shared_ptr<ov::op::v0::Parameter> param, ov::Shape shape, float val) {
+            if (param->get_element_type() == ov::element::i32) {
+                ov::Tensor t{ov::element::i32, shape};
+                auto size = ov::shape_size<ov::Shape>(shape);
+                auto* p = static_cast<int*>(t.data());
+                auto start = static_cast<int>(val);
+                for (size_t i = 0; i < size; i++) {
+                    p[i] = (start + i) % size;
+                }
+                inputs.insert({param, t});
+            } else if (param->get_element_type() == ov::element::f32) {
+                ov::Tensor t{ov::element::f32, shape};
+                strided_iota(static_cast<float*>(t.data()), t.get_size(), val, 0.1f);
+                inputs.insert({param, t});
+            } else {
+                ASSERT_TRUE(param->get_element_type() == ov::element::bf16);
+                ov::Tensor t{ov::element::bf16, shape};
+                strided_iota(static_cast<ov::bfloat16*>(t.data()), t.get_size(), val, 0.1f);
+                inputs.insert({param, t});
+            }
+        };
+        // q, k, v
+        create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f);
+        create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f);
+        create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f);
+    }
+    void prepare() {
+        compile_model();
+        inferRequest = compiledModel.create_infer_request();
+        ASSERT_TRUE(inferRequest);
+    }
+    void reset() {
+        for (auto&& state : inferRequest.query_state()) {
+            state.reset();
+        }
+    }
+
+    std::vector<ov::Tensor> run_test(std::shared_ptr<ov::Model> model) {
+        function = model;
+        prepare();
+        std::vector<ov::Tensor> outputs;
+        int idx = 0;
+        for (auto&& shapes : targetStaticShapes) {
+            generate(idx++, shapes);
+            for (const auto& input : inputs) {
+                inferRequest.set_tensor(input.first, input.second);
+            }
+            inferRequest.infer();
+            auto outputTensor = inferRequest.get_output_tensor(0);
+            ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()};
+            outputTensor.copy_to(copy);
+            outputs.push_back(copy);
+            reset();
+        }
+        return outputs;
+    }
+};
+
+TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
+    GTEST_SKIP() << "TODO: investigate perf-regression on ICX." << std::endl;
+    bool reshape_transpose_fused = false;
+    auto actualOutputs = run_test(function);
+    CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);
+    CheckNumberOfNodesWithType(compiledModel, "Reshape", 0);
+    CheckNumberOfNodesWithType(compiledModel, "Transpose", 0);
+    for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) {
+        if (n->get_friendly_name() == "mha/fused_reshape_transpose") {
+            reshape_transpose_fused = true;
+        }
+    }
+    ASSERT_TRUE(reshape_transpose_fused);
+
+    auto expectedOutputs = run_test(functionRefs);
+    for (size_t i = 0; i < actualOutputs.size(); i++) {
+        ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold);
+    }
+}
+
+namespace {
+const std::vector<InputShapeAndReshapeOrder> inputShapeAndReshapeOrders = {
+    // <Input_shapes, [H,S]>
+    {
+        {{
+             // Q,K,V:[B, L, H*S]
+             {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}},
+         },
+         // reshapeOrderHS
+         {4, 16}},
+    }};
+
+INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest,
+                         FuseSDPAReshapeTransposeTest,
+                         ::testing::Combine(::testing::Values(ElementType::f32),
+                                            ::testing::ValuesIn(inputShapeAndReshapeOrders)),
+                         FuseSDPAReshapeTransposeTest::getTestCaseName);
+}  // namespace
 }  // namespace test
 }  // namespace ov

From a8750e045fca7d4efc79c3f12ef9d699c0f6ae30 Mon Sep 17 00:00:00 2001
From: Luo Cheng <cheng.luo@intel.com>
Date: Mon, 6 Jan 2025 07:54:22 +0000
Subject: [PATCH 13/13] enable SDPAFuseTransposeReshape with stateful

---
 .../cpu_opset/common/pass/stateful_sdpa_fusion.cpp   |  4 ++--
 .../x64/pass/sdpa_fuse_transpose_reshape.cpp         | 12 ++++++------
 .../src/x64/fuse_reshape_transpose_to_sdpa.cpp       |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
index adc590b41cc948..9b9aa4f4b34e48 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp
@@ -308,8 +308,8 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr<ov::Model>& f) {
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward);
     CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion);
-    // TODO: SDPAFuseTransposeReshape may cause regressions in icx.
-    // CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
+    // TODO: remove the following after snippets support patterns with dynamic shapes
+    CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape);
 
     manager.run_passes(f);
     return false;
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp
index 9b48708bc8ed5a..e33b468917c51a 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp
@@ -18,13 +18,13 @@
  * Description: SDPA fuse transpose and reshape.
  *           Original pattern                            Fused pattern
  *
- *  input1         input2       input3
+ *  input1        readvalue      readvalue
  *     |             |             |
  * q_reshape     k_reshape     v_reshap
  *     |             |             |                         (qkv transpose and reshape's orders)
- * q_transpose  k_transpose   v_transpose                                     |
- *         \         |        /                      input1  input2  input3   |
- *          \        |       /                          \      |       /      /
+ * q_transpose  k_transpose   v_transpose                                        |
+ *         \         |        /                      input1 ReadValue ReadValue  |
+ *          \        |       /                          \      |       /        /
  *       ScaledDotProductAttention   --------->        SDPAWithTransposeReshape
  *                   |                                         |
  *              out_transpose                                  |
@@ -41,8 +41,8 @@ intel_cpu::SDPAFuseTransposeReshape::SDPAFuseTransposeReshape() {
     MATCHER_SCOPE(SDPAFuseTransposeReshape);
 
     auto q_reshape_node = wrap_type<op::v1::Reshape>({any_input(), any_input()});
-    auto k_reshape_node = wrap_type<op::v1::Reshape>({any_input(), any_input()});
-    auto v_reshape_node = wrap_type<op::v1::Reshape>({any_input(), any_input()});
+    auto k_reshape_node = wrap_type<op::v1::Reshape>({wrap_type<op::v6::ReadValue>(), any_input()});
+    auto v_reshape_node = wrap_type<op::v1::Reshape>({wrap_type<op::v6::ReadValue>(), any_input()});
 
     auto q_transpose_order_node = wrap_type<op::v0::Constant>();
     auto k_transpose_order_node = wrap_type<op::v0::Constant>();
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
index a646eb03df1a31..a75156c0f69fcb 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp
@@ -204,7 +204,7 @@ class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest,
 };
 
 TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) {
-    GTEST_SKIP() << "TODO: investigate perf-regression on ICX." << std::endl;
+    SKIP_IF_CURRENT_TEST_IS_DISABLED();
     bool reshape_transpose_fused = false;
     auto actualOutputs = run_test(function);
     CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);