From 7a57f7abb4f7e600e3c168135335ca2ee2df657b Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Thu, 29 Aug 2024 12:01:14 +0800 Subject: [PATCH 01/13] add linux-perf --- src/plugins/intel_cpu/src/graph.cpp | 14 +- .../intel_cpu/src/nodes/linux_perf.hpp | 1242 +++++++++++++++++ 2 files changed, 1252 insertions(+), 4 deletions(-) create mode 100644 src/plugins/intel_cpu/src/nodes/linux_perf.hpp diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index aab78a4d5f15bd..92b541f9b2543a 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -45,6 +45,7 @@ #include "utils/node_dumper.h" #include "utils/precision_support.h" #include "utils/verbose.h" +#include "nodes/linux_perf.hpp" #if (OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) # include @@ -108,6 +109,7 @@ void Graph::Replicate(const std::shared_ptr& model, OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); this->_name = model->get_friendly_name(); + LinuxPerf::Init(); // Map data object onto producer node std::map, NodePtr> op2node; @@ -1162,6 +1164,7 @@ VecMemoryDescs Graph::getOutputMemoryDescriptors() const { void Graph::InferStatic(SyncInferRequest* request, int numaId) { for (const auto& node : m_executableGraphNodes) { + auto perf1 = LinuxPerf::Profile(node->getTypeStr()); ExecuteNodeWithCatch(node, request, numaId); } } @@ -1437,11 +1440,15 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r template void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) { size_t inferCounter = 0; + auto perf = LinuxPerf::Profile(std::string("Graph::InferDynamic_#") + std::to_string(infer_count)); for (auto stopIndx : m_executableSyncNodesInds) { - update(stopIndx); - + { + auto perf1 = LinuxPerf::Profile("update"); + update(stopIndx); + } for (; inferCounter < stopIndx; ++inferCounter) { auto& node = 
// glibc only gained a gettid() wrapper in 2.30; older versions need the raw syscall.
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30
#    define gettid() syscall(SYS_gettid)
#endif

/// Invokes the perf_event_open system call directly via syscall().
/// Arguments are forwarded unchanged; the syscall result is returned as int.
inline int perf_event_open(struct perf_event_attr* attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
    return static_cast<int>(syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags));
}

namespace LinuxPerf {

// Turn __LINE__ into a string literal so log prefixes carry the emitting source line.
#define _LINE_STRINGIZE(x)  _LINE_STRINGIZE2(x)
#define _LINE_STRINGIZE2(x) #x
#define LINE_STRING         _LINE_STRINGIZE(__LINE__)

// Yellow "[LINUX_PERF:<line>] " prefix; "\033" is the portable spelling of the ESC byte.
#define LINUX_PERF_ "\033[33m[LINUX_PERF:" LINE_STRING "]\033[0m "

/// Returns CLOCK_MONOTONIC_RAW in nanoseconds; aborts the process if the clock cannot be read.
inline uint64_t get_time_ns() {
    struct timespec now;
    if (clock_gettime(CLOCK_MONOTONIC_RAW, &now) != 0) {
        perror(LINUX_PERF_ "clock_gettime(CLOCK_MONOTONIC_RAW,...) failed!");
        abort();
    }
    // widen before multiplying so the product cannot overflow a narrow time_t
    return static_cast<uint64_t>(now.tv_sec) * 1000000000ULL + static_cast<uint64_t>(now.tv_nsec);
}

/// Converts raw timestamps into microseconds relative to the moment this object was built.
/// Timestamps are CLOCK_MONOTONIC_RAW nanoseconds (see get_time_ns()), so the tick rate is
/// the constant 1e9. The constructor no longer sleeps for one second to calibrate a TSC
/// frequency, because that measurement was overwritten before it was ever used.
struct TscCounter {
    uint64_t tsc_ticks_per_second;
    uint64_t tsc_ticks_base;

    /// Microseconds elapsed between the base timestamp and tsc_ticks (0 if tsc_ticks is older).
    double tsc_to_usec(uint64_t tsc_ticks) const {
        if (tsc_ticks < tsc_ticks_base)
            return 0;
        return (tsc_ticks - tsc_ticks_base) * 1000000.0 / tsc_ticks_per_second;
    }

    /// Microseconds between two timestamps (0 if they are out of order).
    double tsc_to_usec(uint64_t tsc_ticks0, uint64_t tsc_ticks1) const {
        if (tsc_ticks1 < tsc_ticks0)
            return 0;
        return (tsc_ticks1 - tsc_ticks0) * 1000000.0 / tsc_ticks_per_second;
    }

    TscCounter() : tsc_ticks_per_second(1000000000), tsc_ticks_base(get_time_ns()) {}
};

/// Interface of every profiler that contributes events to the JSON trace.
class IPerfEventDumper {
public:
    virtual ~IPerfEventDumper() = default;
    /// Appends this profiler's events to fw, converting timestamps with tsc.
    virtual void dump_json(std::ofstream& fw, TscCounter& tsc) = 0;
};

/// Process-wide collector that writes all registered dumpers into "perf_dump.json".
struct PerfEventJsonDumper {
    std::mutex g_mutex;
    std::set<IPerfEventDumper*> all_dumpers;
    const char* dump_file_name = "perf_dump.json";
    bool dump_file_over = false;
    bool not_finalized = true;
    std::ofstream fw;
    std::atomic_int totalProfilerManagers{0};
    TscCounter tsc;

    ~PerfEventJsonDumper() {
        finalize();
    }

    /// Writes the trace file once. Does nothing if it was already written or if no dumper
    /// is registered. All state flags are inspected under the mutex, so concurrent callers
    /// cannot both start writing.
    void finalize() {
        std::lock_guard<std::mutex> guard(g_mutex);
        if (!not_finalized || dump_file_over || all_dumpers.empty())
            return;

        fw.open(dump_file_name, std::ios::out);
        if (!fw.is_open()) {
            perror(LINUX_PERF_ "cannot open dump file");
            return;
        }
        // microsecond timestamps need more than the default 6 significant digits
        fw.precision(15);
        fw << "{\n";
        fw << "\"schemaVersion\": 1,\n";
        fw << "\"traceEvents\": [\n";
        fw.flush();

        for (auto* dumper : all_dumpers) {
            dumper->dump_json(fw, tsc);
        }
        all_dumpers.clear();

        // closing marker event; it also terminates the comma-separated event list
        fw << R"({
            "name": "Profiler End",
            "ph": "i",
            "s": "g",
            "pid": "Traces",
            "tid": "Trace OV Profiler",
            "ts":)"
           << tsc.tsc_to_usec(get_time_ns()) << "}";
        fw << "]\n";
        fw << "}\n";
        const auto total_size = static_cast<std::streamoff>(fw.tellp());
        fw.close();
        dump_file_over = true;
        not_finalized = false;

        std::cout << LINUX_PERF_ "Dumpped ";
        if (total_size < 1024)
            std::cout << total_size << " bytes ";
        else if (total_size < 1024 * 1024)
            std::cout << total_size / 1024 << " KB ";
        else
            std::cout << total_size / (1024 * 1024) << " MB ";
        std::cout << " to " << dump_file_name << std::endl;
    }

    /// Registers a dumper (not owned) and returns its serial number.
    int register_manager(IPerfEventDumper* pthis) {
        std::lock_guard<std::mutex> guard(g_mutex);
        std::stringstream ss;
        auto serial_id = totalProfilerManagers.fetch_add(1);
        ss << LINUX_PERF_ "#" << serial_id << "(" << pthis << ") : is registed." << std::endl;
        std::cout << ss.str();
        all_dumpers.emplace(pthis);
        return serial_id;
    }

    static PerfEventJsonDumper& get() {
        static PerfEventJsonDumper inst;
        return inst;
    }
};

/// Splits s at every occurrence of delimiter; empty fields are kept.
/// The whole delimiter is skipped (the previous code skipped only one character, which
/// corrupted fields for multi-character delimiters). An empty delimiter yields {s}.
inline std::vector<std::string> str_split(const std::string& s, std::string delimiter) {
    std::vector<std::string> ret;
    if (delimiter.empty()) {
        ret.push_back(s);
        return ret;
    }
    size_t last = 0;
    size_t next = 0;
    while ((next = s.find(delimiter, last)) != std::string::npos) {
        ret.push_back(s.substr(last, next - last));
        last = next + delimiter.size();
    }
    ret.push_back(s.substr(last));
    return ret;
}

/// Returns a reference to the T stored at logical position `offset` of the perf data area
/// and advances `offset` by sizeof(T). The position is wrapped with data_size.
template <typename T>
T& read_ring_buffer(perf_event_mmap_page& meta, uint64_t& offset) {
    auto offset0 = offset;
    offset += sizeof(T);
    return *reinterpret_cast<T*>(reinterpret_cast<uint8_t*>(&meta) + meta.data_offset + (offset0) % meta.data_size);
}

/// Profiler configuration parsed once from the LINUX_PERF environment variable.
/// Options are separated by ':' :
///   dump            enable JSON dump without limit
///   dump=N          limit the number of dumps per thread
///   cpus=A,B,C      restrict dumping (and context-switch tracing) to these CPUs
///   switch-cpu      trace context switches on the CPUs of the start-up affinity mask
///   NAME=0xNNNN     add a raw PMU event called NAME
struct PerfRawConfig {
    PerfRawConfig() {
        // the mask is read by dump_on_cpu() and others even when LINUX_PERF is unset
        CPU_ZERO(&cpu_mask);

        const char* str_raw_config = std::getenv("LINUX_PERF");
        if (!str_raw_config) {
            printf(LINUX_PERF_ " LINUX_PERF is unset, example: LINUX_PERF=dump:switch-cpu:L2_MISS=0x10d1\n");
            return;
        }

        for (const auto& opt : str_split(str_raw_config, ":")) {
            const auto items = str_split(opt, "=");
            if (items.size() == 2) {
                if (items[0] == "dump") {
                    // limit the number of dumps per thread
                    dump = std::strtoll(items[1].c_str(), nullptr, 0);
                } else if (items[0] == "cpus") {
                    // thread affinity can be changed by threading libraries (TBB/OpenMP) at any
                    // time, so an explicit list is more reliable than sched_getaffinity(); it
                    // also limits the number of generated events. Example: cpus=56,57,59
                    CPU_ZERO(&cpu_mask);
                    for (const auto& cpu : str_split(items[1], ",")) {
                        char* end = nullptr;
                        const long id = std::strtol(cpu.c_str(), &end, 10);
                        // skip empty / non-numeric / out-of-range entries instead of selecting CPU 0
                        if (end == cpu.c_str() || id < 0 || id >= CPU_SETSIZE)
                            continue;
                        CPU_SET(static_cast<int>(id), &cpu_mask);
                    }
                } else {
                    const auto config = std::strtoul(items[1].c_str(), nullptr, 0);
                    if (config > 0)
                        raw_configs.emplace_back(items[0], config);
                }
            } else if (items.size() == 1) {
                if (items[0] == "switch-cpu") {
                    // get cpu_mask as early as possible
                    switch_cpu = true;
                    CPU_ZERO(&cpu_mask);
                    if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &cpu_mask)) {
                        perror(LINUX_PERF_ "sched_getaffinity failed:");
                        abort();
                    }
                }
                if (items[0] == "dump")
                    dump = std::numeric_limits<int64_t>::max();  // no limit to number of dumps
            }
        }

        for (const auto& cfg : raw_configs) {
            printf(LINUX_PERF_ " config: %s=0x%lx\n", cfg.first.c_str(), static_cast<unsigned long>(cfg.second));
        }
        if (switch_cpu) {
            printf(LINUX_PERF_ " config: switch_cpu\n");
        }
        if (dump)
            printf(LINUX_PERF_ " config: dump=%ld\n", static_cast<long>(dump));
        if (CPU_COUNT(&cpu_mask)) {
            printf(LINUX_PERF_ " config: cpus=");
            for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &cpu_mask))
                    printf("%d,", cpu);
            printf("\n");
        }
    }

    /// True when dumping is enabled and `cpu` passes the CPU filter (an empty filter accepts all).
    bool dump_on_cpu(int cpu) const {
        if (dump == 0)
            return false;
        if (CPU_COUNT(&cpu_mask))
            return CPU_ISSET(cpu, &cpu_mask);
        return true;
    }

    int64_t dump = 0;
    cpu_set_t cpu_mask;
    bool switch_cpu = false;
    std::vector<int> dump_cpus;
    std::vector<std::pair<std::string, uint64_t>> raw_configs;

    static PerfRawConfig& get() {
        static PerfRawConfig inst;
        return inst;
    }
};

}  // namespace LinuxPerf
by threading-libs(TBB/OpenMP) anytime + // sched_getaffinity() can only get correct binding at start-up time, another way is to specify it + // also too many events may generate if per-thread event is used, cpus can limit + // cpus=56 + // cpus=56.57.59 + auto cpus = str_split(items[1], ","); + CPU_ZERO(&cpu_mask); + for(auto& cpu : cpus) { + CPU_SET(std::atoi(cpu.c_str()), &cpu_mask); + } + } else { + auto config = strtoul(&items[1][0], nullptr, 0); + if (config > 0) + raw_configs.emplace_back(items[0], config); + } + } + if (items.size() == 1) { + if (items[0] == "switch-cpu") { + // get cpu_mask as early as possible + switch_cpu = true; + CPU_ZERO(&cpu_mask); + if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &cpu_mask)) { + perror(LINUX_PERF_"sched_getaffinity failed:"); + abort(); + } + } + if (items[0] == "dump") + dump = std::numeric_limits::max(); // no limit to number of dumps + } + } + + for(auto& cfg : raw_configs) { + printf(LINUX_PERF_" config: %s=0x%lx\n", cfg.first.c_str(), cfg.second); + } + if (switch_cpu) { + printf(LINUX_PERF_" config: switch_cpu\n"); + } + if (dump) + printf(LINUX_PERF_" config: dump=%ld\n", dump); + if (CPU_COUNT(&cpu_mask)) { + printf(LINUX_PERF_" config: cpus="); + for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) + if(CPU_ISSET(cpu, &cpu_mask)) printf("%d,", cpu); + printf("\n"); + } + } else { + printf(LINUX_PERF_" LINUX_PERF is unset, example: LINUX_PERF=dump,switch-cpu,L2_MISS=0x10d1\n"); + } + } + + bool dump_on_cpu(int cpu) { + if (dump == 0) + return false; + if (CPU_COUNT(&cpu_mask)) + return CPU_ISSET(cpu, &cpu_mask); + return true; + } + + int64_t dump = 0; + cpu_set_t cpu_mask; + bool switch_cpu = false; + std::vector dump_cpus; + std::vector> raw_configs; + + static PerfRawConfig& get() { + static PerfRawConfig inst; + return inst; + } +}; + + +// context switch events +// this will visualize +struct PerfEventCtxSwitch : public IPerfEventDumper { + bool is_enabled; + + struct event { + int fd; + 
perf_event_mmap_page * meta; + int cpu; + uint64_t ctx_switch_in_time; + uint64_t ctx_switch_in_tid; + uint64_t ctx_last_time; + + event(int fd, perf_event_mmap_page * meta): fd(fd), meta(meta) {} + }; + std::vector events; + + PerfEventCtxSwitch() { + is_enabled = PerfRawConfig::get().switch_cpu; + if (is_enabled) { + // make sure TSC in PerfEventJsonDumper is the very first thing to initialize + PerfEventJsonDumper::get().register_manager(this); + + // open fd for each CPU + cpu_set_t mask = PerfRawConfig::get().cpu_mask; + + long number_of_processors = sysconf(_SC_NPROCESSORS_ONLN); + printf(LINUX_PERF_"sizeof(cpu_set_t):%lu: _SC_NPROCESSORS_ONLN=%ld CPU_COUNT=%d\n", sizeof(cpu_set_t), number_of_processors, CPU_COUNT(&mask)); + if (CPU_COUNT(&mask) >= number_of_processors) { + printf(LINUX_PERF_" no affinity is set, will not enable PerfEventCtxSwitch\n"); + is_enabled = false; + return; + } + + for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) { + auto is_set = CPU_ISSET(cpu, &mask); + if (!is_set) continue; + + perf_event_attr pea; + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_HARDWARE; + pea.size = sizeof(struct perf_event_attr); + pea.config = PERF_COUNT_HW_REF_CPU_CYCLES; // not the point, can be any + pea.disabled = 0; + pea.exclude_kernel = 1; + pea.exclude_hv = 1; + pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + // pinned: It applies only to hardware counters and only to group leaders + pea.pinned = 1; + pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + + // for group master, generate PERF_RECORD_SWITCH into ring-buffer + // is helpful to visualize context switch + pea.context_switch = 1; + // then TID, TIME, ID, STREAM_ID, and CPU can additionally be included in non-PERF_RECORD_SAMPLEs + // if the corresponding sample_type is selected + pea.sample_id_all = 1; + pea.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_TID | PERF_SAMPLE_CPU; + auto mmap_length = sysconf(_SC_PAGESIZE) * 
(1024 + 1); + pea.use_clockid = 1; + pea.clockid = CLOCK_MONOTONIC_RAW; + + // calling thread on any processor + pid_t pid = -1; + // measures all processes/threads on the specified CPU + int ctx_switch_fd = perf_event_open(&pea, pid, cpu, -1, 0); + if (ctx_switch_fd < 0) { + perror(LINUX_PERF_"PerfEventCtxSwitch perf_event_open failed (check /proc/sys/kernel/perf_event_paranoid please)"); + abort(); + } + + auto* ctx_switch_pmeta = reinterpret_cast(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ctx_switch_fd, 0)); + if (ctx_switch_pmeta == MAP_FAILED) { + perror(LINUX_PERF_"mmap perf_event_mmap_page failed:"); + close(ctx_switch_fd); + abort(); + } + printf(LINUX_PERF_"perf_event_open CPU_WIDE context_switch on cpu %d, ctx_switch_fd=%d\n", cpu, ctx_switch_fd); + events.emplace_back(ctx_switch_fd, ctx_switch_pmeta); + events.back().ctx_switch_in_time = get_time_ns(); + events.back().ctx_last_time = get_time_ns(); + events.back().cpu = cpu; + } + my_pid = getpid(); + my_tid = gettid(); + } + } + + ~PerfEventCtxSwitch() { + if (is_enabled) { + PerfEventJsonDumper::get().finalize(); + } + for(auto& ev : events) { + close(ev.fd); + } + } + + struct ProfileData { + uint64_t tsc_start; + uint64_t tsc_end; + uint32_t tid; + uint32_t cpu; + bool preempt; // preempt means current TID preempts previous thread + }; + + std::deque all_dump_data; + + void dump_json(std::ofstream& fw, TscCounter& tsc) override { + static std::atomic_uint64_t async_evid{0}; + if (!is_enabled) return; + + updateRingBuffer(); + + auto data_size = all_dump_data.size(); + if (!data_size) return; + + for (auto& ev : events) { + if (ev.ctx_switch_in_time == 0) continue; + all_dump_data.emplace_back(); + auto* pd = &all_dump_data.back(); + pd->tid = ev.ctx_switch_in_tid; + pd->cpu = ev.cpu; + pd->tsc_start = ev.ctx_switch_in_time; + pd->tsc_end = get_time_ns(); + ev.ctx_switch_in_time = 0; + } + + auto pid = 9999; // fake pid for CPU + auto cat = "TID"; + + // TID is used for CPU id instead 
+ for (auto& d : all_dump_data) { + auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end); + auto start = tsc.tsc_to_usec(d.tsc_start); + //auto end = tsc.tsc_to_usec(d.tsc_end); + auto cpu_id = d.cpu; + + fw << "{\"ph\": \"X\", \"name\": \"" << d.tid << "\", \"cat\":\"" << cat << "\"," + << "\"pid\": " << pid << ", \"tid\": \"CPU" << cpu_id << "\"," + << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << "},\n"; + } + } + + bool ring_buffer_verbose = false; + uint32_t my_pid = 0; + uint32_t my_tid = 0; + std::atomic atom_gard{0}; + + void updateRingBuffer() { + // only one thread can enter + const int lock_value = atom_gard.exchange(1); + if (lock_value == 1) { + // has been locked, return; + return; + } + + // only update when any ring-buffer is half loaded + bool need_update = false; + for(auto& ev : events) { + auto& mmap_meta = *ev.meta; + auto used_size = (mmap_meta.data_tail - mmap_meta.data_head) % mmap_meta.data_size; + if (used_size > (mmap_meta.data_size >> 1)) { + need_update = true; + break; + } + } + + if (!need_update) { + // unlock + atom_gard.exchange(0); + return; + } + + for(auto& ev : events) { + auto& mmap_meta = *ev.meta; + uint64_t head0 = mmap_meta.data_tail; + uint64_t head1 = mmap_meta.data_head; + //printf("ring-buffer@end: %lu~%lu %llu %llu %llu\n", head0, head1, group_meta.data_tail, group_meta.data_offset, group_meta.data_size); + + if (head0 != head1) { + if (ring_buffer_verbose) { + printf("PERF_RECORD_SWITCH = %d\n", PERF_RECORD_SWITCH); + printf("PERF_RECORD_SWITCH_CPU_WIDE = %d\n", PERF_RECORD_SWITCH_CPU_WIDE); + printf("PERF_RECORD_MISC_SWITCH_OUT = %d\n", PERF_RECORD_MISC_SWITCH_OUT); + printf("PERF_RECORD_MISC_SWITCH_OUT_PREEMPT = %d\n", PERF_RECORD_MISC_SWITCH_OUT_PREEMPT); + } + + while(head0 < head1) { + auto h0 = head0; + auto type = read_ring_buffer<__u32>(mmap_meta, head0); + auto misc = read_ring_buffer<__u16>(mmap_meta, head0); + auto size = read_ring_buffer<__u16>(mmap_meta, head0); + 
uint32_t next_prev_pid = 0, next_prev_tid = 0; + if (type == PERF_RECORD_SWITCH_CPU_WIDE) { + // previous PID/TID if switching-in + // next PID/TID if switching-out + next_prev_pid = read_ring_buffer<__u32>(mmap_meta, head0); + next_prev_tid = read_ring_buffer<__u32>(mmap_meta, head0); + } + auto pid = read_ring_buffer<__u32>(mmap_meta, head0); + auto tid = read_ring_buffer<__u32>(mmap_meta, head0); + auto time = read_ring_buffer(mmap_meta, head0); + auto cpu = read_ring_buffer<__u32>(mmap_meta, head0); + auto reserved0 = read_ring_buffer<__u32>(mmap_meta, head0); + (void)reserved0; + (void)next_prev_pid; + (void)pid; + + // skip idle process (with TID 0) + if (tid > 0 && ring_buffer_verbose) { + printf("event: %lu/%lu\ttype,misc,size=(%u,%u,%u) cpu%u,next_prev_tid=%u,tid=%u time:(%lu), (+%lu)\n", + h0, head1, + type, misc, size, + cpu, next_prev_tid, tid, + time, + time - ev.ctx_last_time); + } + + if (type == PERF_RECORD_SWITCH_CPU_WIDE && tid > 0) { + if (misc & PERF_RECORD_MISC_SWITCH_OUT || misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) { + // switch out + // generate a log + all_dump_data.emplace_back(); + auto* pd = &all_dump_data.back(); + pd->tid = tid; + pd->cpu = cpu; + pd->preempt = (misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT); + //printf("ctx_switch_in_time=%lu\n", ctx_switch_in_time); + pd->tsc_start = ev.ctx_switch_in_time; + pd->tsc_end = time; + + if (ring_buffer_verbose) printf("\t cpu: %u tid: %u %lu (+%lu)\n", cpu, tid, ev.ctx_switch_in_time, time-ev.ctx_switch_in_time); + + ev.ctx_switch_in_time = 0; + } else { + // switch in + ev.ctx_switch_in_time = time; + ev.ctx_switch_in_tid = tid; + } + } + + ev.ctx_last_time = time; + head0 += size - (head0 - h0); + } + + if (head0 != head1) { + printf("head0(%lu) != head1(%lu)\n", head0, head1); + abort(); + } + + // update tail so kernel can keep generate event records + mmap_meta.data_tail = head0; + std::atomic_thread_fence(std::memory_order_seq_cst); + } + } + atom_gard.exchange(0); + } + + static 
PerfEventCtxSwitch& get() { + static PerfEventCtxSwitch inst; + return inst; + } +}; + +/* +RAW HARDWARE EVENT DESCRIPTOR + Even when an event is not available in a symbolic form within perf right now, it can be encoded in a per processor specific way. + + For instance For x86 CPUs NNN represents the raw register encoding with the layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer’s Manual Volume 3B: System Programming Guide] Figure 30-1 + Layout of IA32_PERFEVTSELx MSRs) or AMD’s PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344, Figure 13-7 Performance Event-Select Register (PerfEvtSeln)). + + Note: Only the following bit fields can be set in x86 counter registers: event, umask, edge, inv, cmask. Esp. guest/host only and OS/user mode flags must be setup using EVENT MODIFIERS. + + event 7:0 + umask 15:8 + edge 18 + inv 23 + cmask 31:24 +*/ +#define X86_RAW_EVENT(EventSel, UMask, CMask) ((CMask << 24) | (UMask << 8) | (EventSel)) + +struct PerfEventGroup : public IPerfEventDumper { + int group_fd = -1; + uint64_t read_format; + + struct event { + int fd = -1; + uint64_t id = 0; + uint64_t pmc_index = 0; + perf_event_mmap_page* pmeta = nullptr; + std::string name = "?"; + char format[32]; + }; + std::vector events; + + uint64_t read_buf[512]; // 4KB + uint64_t time_enabled; + uint64_t time_running; + uint64_t pmc_width; + uint64_t pmc_mask; + uint64_t values[32]; + uint32_t tsc_time_shift; + uint32_t tsc_time_mult; + + // ref_cpu_cycles even id + // this event is fixed function counter provided by most x86 CPU + // and it provides TSC clock which is: + // - very high-resolution (<1ns or >1GHz) + // - independent of CPU-frequency throttling + int ref_cpu_cycles_evid = -1; + int sw_task_clock_evid = -1; + int hw_cpu_cycles_evid = -1; + int hw_instructions_evid = -1; + + struct ProfileData { + uint64_t tsc_start; + uint64_t tsc_end; + std::string title; + const char * cat; + int32_t 
id; + static const int data_size = 16; // 4(fixed) + 8(PMU) + 4(software) + uint64_t data[data_size] = {0}; + // f/i/u/p + char extra_data_type[data_size] = {0}; + union { + double f; + int64_t i; + void * p; + } extra_data[data_size]; + + template + char get_extra_type(T t) { + if (std::is_pointer::value) return 'p'; + if (std::is_floating_point::value) return 'f'; + if (std::is_integral::value) return 'i'; + return '\0'; + } + template + void set_extra_data(int i, T* t) { extra_data[i].p = t; } + void set_extra_data(int i, float t) { extra_data[i].f = t; } + void set_extra_data(int i, double t) { extra_data[i].f = t; } + template + void set_extra_data(int i, T t) { + static_assert(std::is_integral::value); + extra_data[i].i = t; + } + + template + void set_extra_data(Values... vals) { + static_assert(data_size >= sizeof...(vals)); + int j = 0; + int unused1[] = { 0, (set_extra_data(j++, vals), 0)... }; + (void)unused1; + j = 0; + int unused2[] = { 0, (extra_data_type[j++] = get_extra_type(vals), 0)... 
}; + (void)unused2; + extra_data_type[j] = '\0'; + } + + ProfileData(const std::string& title) : title(title) { + start(); + } + void start() { + tsc_start = get_time_ns(); + } + void stop() { + tsc_end = get_time_ns(); + } + }; + + bool enable_dump_json = false; + int64_t dump_limit = 0; + std::deque all_dump_data; + int serial; + + using CallBackEventArgsSerializer = std::function; + CallBackEventArgsSerializer fn_evt_args_serializer; + + void dump_json(std::ofstream& fw, TscCounter& tsc) override { + static std::atomic_uint64_t async_evid{0}; + if (!enable_dump_json) + return; + auto data_size = all_dump_data.size(); + if (!data_size) + return; + + for (auto& d : all_dump_data) { + auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end); + auto title = std::string(d.title) + "_" + std::to_string(d.id); + auto cat = d.cat; + //auto pid = serial; + auto start = tsc.tsc_to_usec(d.tsc_start); + //auto end = tsc.tsc_to_usec(d.tsc_end); + + if (d.id < 0) { + // async events + // {"cat": "foo", "name": "async_read2", "pid": 4092243, "id": 4092246, "ph": "b", "ts": 23819.718}, + fw << "{\"ph\": \"b\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\"," + << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << "," + << "\"ts\": " << std::setprecision (15) << start << "},"; + + fw << "{\"ph\": \"e\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\"," + << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << "," + << "\"ts\": " << std::setprecision (15) << tsc.tsc_to_usec(d.tsc_end) << ","; + } else { + fw << "{\"ph\": \"X\", \"name\": \"" << title << "\", \"cat\":\"" << cat << "\"," + << "\"pid\": " << my_pid << ", \"tid\": " << my_tid << "," + << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << ","; + } + + fw << "\"args\":{"; + { + std::stringstream ss; + if (fn_evt_args_serializer) + fn_evt_args_serializer(ss, duration, d.data); + if (sw_task_clock_evid >= 0) { + // PERF_COUNT_SW_TASK_CLOCK in nano-seconds + ss << "\"CPU 
Usage\":" << (d.data[sw_task_clock_evid] * 1e-3)/duration << ","; + } + if (hw_cpu_cycles_evid >= 0) { + if (sw_task_clock_evid >= 0 && d.data[sw_task_clock_evid] > 0) { + ss << "\"CPU Freq(GHz)\":" << static_cast(d.data[hw_cpu_cycles_evid])/d.data[sw_task_clock_evid] << ","; + } else { + ss << "\"CPU Freq(GHz)\":" << static_cast(d.data[hw_cpu_cycles_evid])*1e-3/duration << ","; + } + if (hw_instructions_evid >= 0 && d.data[hw_instructions_evid] > 0) { + ss << "\"CPI\":" << static_cast(d.data[hw_cpu_cycles_evid])/d.data[hw_instructions_evid] << ","; + } + } + auto prev_locale = ss.imbue(std::locale("")); + const char * sep = ""; + for(size_t i = 0; i < events.size() && i < d.data_size; i++) { + ss << sep << "\"" << events[i].name << "\":\"" << d.data[i] << "\""; + sep = ","; + } + ss.imbue(prev_locale); + if (d.extra_data_type[0] != 0) { + sep = ""; + ss << ",\"Extra Data\":["; + for(size_t i = 0; i < d.data_size && (d.extra_data_type[i] != 0); i++) { + if (d.extra_data_type[i] == 'f') ss << sep << d.extra_data[i].f; + else if (d.extra_data_type[i] == 'i') ss << sep << d.extra_data[i].i; + else if (d.extra_data_type[i] == 'p') ss << sep << "\"" << d.extra_data[i].p << "\""; + else ss << sep << "\"?\""; + sep = ","; + } + ss << "]"; + } + fw << ss.str(); + } + fw << "}},\n"; + } + all_dump_data.clear(); + std::cout << LINUX_PERF_"#" << serial << "(" << this << ") finalize: dumpped " << data_size << std::endl; + } + + uint64_t operator[](size_t i) { + if (i < events.size()) { + return values[i]; + } else { + printf(LINUX_PERF_"PerfEventGroup: operator[] with index %lu oveflow (>%lu)\n", i, events.size()); + abort(); + } + return 0; + } + + PerfEventGroup() = default; + + struct Config { + uint32_t type; + uint64_t config; + const char * name; + Config(uint32_t type, uint64_t config, const char * name = "?") : type(type), config(config), name(name) {} + }; + + uint32_t my_pid = 0; + uint32_t my_tid = 0; + + PerfEventGroup(const std::vector type_configs, 
CallBackEventArgsSerializer fn = {}) : fn_evt_args_serializer(fn) { + for(auto& tc : type_configs) { + if (tc.type == PERF_TYPE_SOFTWARE) { + add_sw(tc.config); + } + if (tc.type == PERF_TYPE_HARDWARE) { + add_hw(tc.config); + } + if (tc.type == PERF_TYPE_RAW) { + add_raw(tc.config); + } + events.back().name = tc.name; + snprintf(events.back().format, sizeof(events.back().format), "%%%lulu, ", strlen(tc.name)); + } + + // env var defined raw events + for (auto raw_cfg : PerfRawConfig::get().raw_configs) { + add_raw(raw_cfg.second); + events.back().name = raw_cfg.first; + } + + dump_limit = PerfRawConfig::get().dump; + enable_dump_json = PerfRawConfig::get().dump_on_cpu(sched_getcpu()); + serial = 0; + if (enable_dump_json) { + serial = PerfEventJsonDumper::get().register_manager(this); + } + my_pid = getpid(); + my_tid = gettid(); + + enable(); + } + + ~PerfEventGroup() { + if (enable_dump_json) + PerfEventJsonDumper::get().finalize(); + disable(); + for(auto & ev : events) { + close(ev.fd); + } + } + + void show_header() { + std::stringstream ss; + ss << "\e[33m"; + ss << "#" << serial << ":"; + for(auto& ev : events) { + ss << ev.name << ", "; + } + ss << "\e[0m\n"; + std::cout << ss.str(); + } + + void add_raw(uint64_t config, bool pinned=false) { + perf_event_attr pea; + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_RAW; + pea.size = sizeof(struct perf_event_attr); + pea.config = config; + pea.disabled = 1; + pea.exclude_kernel = 1; + pea.exclude_hv = 1; + pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + if (pinned && group_fd == -1) { + // pinned: It applies only to hardware counters and only to group leaders + pea.pinned = 1; + } + if (group_fd == -1) { + pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + } + add(&pea); + } + + void add_hw(uint64_t config, bool pinned=false) { + perf_event_attr pea; + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_HARDWARE; + pea.size 
= sizeof(struct perf_event_attr); + pea.config = config; + pea.disabled = 1; + pea.exclude_kernel = 1; + pea.exclude_hv = 1; + pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; + if (pinned && group_fd == -1) { + // pinned: It applies only to hardware counters and only to group leaders + pea.pinned = 1; + } + if (group_fd == -1) { + pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; + } + add(&pea); + } + + void add_sw(uint64_t config) { + perf_event_attr pea; + memset(&pea, 0, sizeof(struct perf_event_attr)); + pea.type = PERF_TYPE_SOFTWARE; + pea.size = sizeof(struct perf_event_attr); + pea.config = config; + pea.disabled = 1; + pea.exclude_kernel = 0; // some SW events are counted as kernel + pea.exclude_hv = 1; + //pea.pinned = 1; //sw event cannot set pinned!!! + pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID ; + add(&pea); + } + + void add(perf_event_attr* pev_attr, pid_t pid = 0, int cpu = -1) { + event ev; + + size_t mmap_length = sysconf(_SC_PAGESIZE) * 1; + // clockid must consistent within group + pev_attr->use_clockid = 1; + // can be synched with clock_gettime(CLOCK_MONOTONIC_RAW) + pev_attr->clockid = CLOCK_MONOTONIC_RAW; + + RETRY: + ev.fd = perf_event_open(pev_attr, pid, cpu, group_fd, 0); + if (ev.fd < 0) { + if (!pev_attr->exclude_kernel) { + printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) with exclude_kernel=0 failed (due to /proc/sys/kernel/perf_event_paranoid is 2), set exclude_kernel=1 and retry...\n", + pev_attr->type, pev_attr->config); + pev_attr->exclude_kernel = 1; + goto RETRY; + } else { + printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) failed", pev_attr->type, pev_attr->config); + perror(""); + abort(); + } + } + ioctl(ev.fd, PERF_EVENT_IOC_ID, &ev.id); + + ev.pmeta = reinterpret_cast(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ev.fd, 0)); + if (ev.pmeta == MAP_FAILED) { + perror(LINUX_PERF_"mmap perf_event_mmap_page failed:"); + close(ev.fd); + abort(); + } + + 
if (group_fd == -1) { + group_fd = ev.fd; + read_format = pev_attr->read_format; + } + if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_REF_CPU_CYCLES) { + ref_cpu_cycles_evid = events.size(); + } + if (pev_attr->type == PERF_TYPE_SOFTWARE && pev_attr->config == PERF_COUNT_SW_TASK_CLOCK) { + sw_task_clock_evid = events.size(); + } + if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_CPU_CYCLES) { + hw_cpu_cycles_evid = events.size(); + } + if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_INSTRUCTIONS) { + hw_instructions_evid = events.size(); + } + //printf("perf_event_open : fd=%d, id=%lu\n", ev.fd, ev.id); + + events.push_back(ev); + } + + bool event_group_enabled = false; + uint32_t num_events_no_pmc; + + void enable() { + if (event_group_enabled) + return; + ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); + ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); + // PMC index is only valid when being enabled + num_events_no_pmc = 0; + for(auto& ev : events) { + if (ev.pmc_index == 0 && ev.pmeta->cap_user_rdpmc) { + uint32_t seqlock; + do { + seqlock = ev.pmeta->lock; + std::atomic_thread_fence(std::memory_order_seq_cst); + ev.pmc_index = ev.pmeta->index; + pmc_width = ev.pmeta->pmc_width; + pmc_mask = 1; + pmc_mask = (pmc_mask << pmc_width) - 1; + if (ev.pmeta->cap_user_time) { + tsc_time_shift = ev.pmeta->time_shift; + tsc_time_mult = ev.pmeta->time_mult; + //printf("time: %u,%u\n", tsc_time_shift, tsc_time_mult); + } + std::atomic_thread_fence(std::memory_order_seq_cst); + } while (ev.pmeta->lock != seqlock || (seqlock & 1)); + } + // some events like PERF_TYPE_SOFTWARE cannot read using rdpmc() + if (ev.pmc_index == 0) + num_events_no_pmc ++; + } + event_group_enabled = true; + } + + uint64_t tsc2nano(uint64_t cyc) { + uint64_t quot, rem; + quot = cyc >> tsc_time_shift; + rem = cyc & (((uint64_t)1 << tsc_time_shift) - 1); + return quot * tsc_time_mult + 
((rem * tsc_time_mult) >> tsc_time_shift); + } + + void disable() { + if (!event_group_enabled) + return; + + ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); + + for(auto& ev : events) { + ev.pmc_index = 0; + } + event_group_enabled = false; + } + + uint64_t rdpmc(int i, uint64_t base = 0) { + return (_rdpmc(events[i].pmc_index - 1) - base) & pmc_mask; + } + + template + std::vector rdpmc(FN fn, std::string name = {}, int64_t loop_cnt = 0, std::function addinfo = {}) { + int cnt = events.size(); + std::vector pmc(cnt, 0); + + bool use_pmc = (num_events_no_pmc == 0); + if (use_pmc) { + for(int i = 0; i < cnt; i++) { + if (events[i].pmc_index) + pmc[i] = _rdpmc(events[i].pmc_index - 1); + else + pmc[i] = 0; + } + } else { + read(); + for(int i = 0; i < cnt; i++) { + pmc[i] = values[i]; + } + } + + auto tsc0 = __rdtsc(); + fn(); + auto tsc1 = __rdtsc(); + + if (use_pmc) { + for(int i = 0; i < cnt; i++) { + if (events[i].pmc_index) + pmc[i] = (_rdpmc(events[i].pmc_index - 1) - pmc[i]) & pmc_mask; + else + pmc[i] = 0; + } + } else { + read(); + for(int i = 0; i < cnt; i++) { + pmc[i] -= values[i]; + } + } + + if (!name.empty()) { + char log_buff[1024]; + char * log = log_buff; + log += sprintf(log, "\e[33m"); + for(int i = 0; i < cnt; i++) { + log += sprintf(log, events[i].format, pmc[i]); + } + auto duration_ns = tsc2nano(tsc1 - tsc0); + + log += sprintf(log, "\e[0m [%16s] %.3f us", name.c_str(), duration_ns/1e3); + if (hw_cpu_cycles_evid >= 0) { + log += sprintf(log, " CPU:%.2f(GHz)", 1.0 * pmc[hw_cpu_cycles_evid] / duration_ns); + if (hw_instructions_evid >= 0) { + log += sprintf(log, " CPI:%.2f", 1.0 * pmc[hw_cpu_cycles_evid] / pmc[hw_instructions_evid]); + } + if (loop_cnt > 0) { + // cycles per kernel (or per-iteration) + log += sprintf(log, " CPK:%.1fx%d", 1.0 * pmc[hw_cpu_cycles_evid] / loop_cnt, loop_cnt); + } + } + if (addinfo) { + addinfo(duration_ns, &pmc[0], log); + } + log += sprintf(log, "\n"); + printf(log_buff); + } + return pmc; + } + + 
void read(bool verbose = false) { + for(size_t i = 0; i < events.size(); i++) values[i] = 0; + + if (::read(group_fd, read_buf, sizeof(read_buf)) == -1) { + perror(LINUX_PERF_"read perf event failed:"); + abort(); + } + + uint64_t * readv = read_buf; + auto nr = *readv++; + if (verbose) printf("number of counters:\t%lu\n", nr); + time_enabled = 0; + time_running = 0; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + time_enabled = *readv++; + if (verbose) printf("time_enabled:\t%lu\n", time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + time_running = *readv++; + if (verbose) printf("time_running:\t%lu\n", time_running); + } + + for (size_t i = 0; i < nr; i++) { + auto value = *readv++; + auto id = *readv++; + for (size_t k = 0; k < events.size(); k++) { + if (id == events[k].id) { + values[k] = value; + } + } + } + + if (verbose) { + for (size_t k = 0; k < events.size(); k++) { + printf("\t[%lu]: %lu\n", k, values[k]); + } + } + } + + //================================================================================ + // profiler API with json_dump capability + struct ProfileScope { + PerfEventGroup* pevg = nullptr; + ProfileData* pd = nullptr; + bool do_unlock = false; + ProfileScope() = default; + ProfileScope(PerfEventGroup* pevg, ProfileData* pd, bool do_unlock = false) : pevg(pevg), pd(pd), do_unlock(do_unlock) {} + + // Move only + ProfileScope(const ProfileScope&) = delete; + ProfileScope& operator=(const ProfileScope&) = delete; + + ProfileScope(ProfileScope&& other) { + pevg = other.pevg; + pd = other.pd; + other.pevg = nullptr; + other.pd = nullptr; + } + + ProfileScope& operator=(ProfileScope&& other) { + if (&other != this) { + pevg = other.pevg; + pd = other.pd; + other.pevg = nullptr; + other.pd = nullptr; + } + + return *this; + } + + uint64_t* finish() { + if (do_unlock) { + PerfEventGroup::get_sampling_lock() --; + } + if (!pevg || !pd) + return nullptr; + + pd->stop(); + bool use_pmc = (pevg->num_events_no_pmc == 0); 
+ if (use_pmc) { + for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++) + if (pevg->events[i].pmc_index) + pd->data[i] = (_rdpmc(pevg->events[i].pmc_index - 1) - pd->data[i]) & pevg->pmc_mask; + else + pd->data[i] = 0; + } else { + pevg->read(); + for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++) + pd->data[i] = pevg->values[i] - pd->data[i]; + } + pevg = nullptr; + return pd->data; + } + + ~ProfileScope() { + finish(); + } + }; + + ProfileData* _profile(const std::string& title, int id = 0) { + if (get_sampling_lock().load() != 0) + return nullptr; + if (dump_limit == 0) + return nullptr; + dump_limit --; + + PerfEventCtxSwitch::get().updateRingBuffer(); + + all_dump_data.emplace_back(title); + auto* pd = &all_dump_data.back(); + pd->cat = "enable"; + pd->id = id; + + // use rdpmc if possible + bool use_pmc = (num_events_no_pmc == 0); + if (use_pmc) { + for (size_t i =0; i < events.size() && i < pd->data_size; i++) + if (events[i].pmc_index) + pd->data[i] = _rdpmc(events[i].pmc_index - 1); + } else { + read(); + for (size_t i =0; i < events.size() && i < pd->data_size; i++) + pd->data[i] = values[i]; + } + + return pd; + } + + static PerfEventGroup& get() { + thread_local PerfEventGroup pevg({ + {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "HW_CPU_CYCLES"}, + {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "HW_INSTRUCTIONS"}, + {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, "HW_CACHE_MISSES"}, + //{PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, "HW_REF_CPU_CYCLES"}, + {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, "SW_CONTEXT_SWITCHES"}, + {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, "SW_TASK_CLOCK"}, + {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, "SW_PAGE_FAULTS"}, + + // XSNP_NONE : ... were hits in L3 without snoops required (data is not owned by any other core's local cache) + // XSNP_FWD /XSNP_HITM : ... 
were HitM responses from shared L3 (data was exclusivly/dirty owned by another core's local cache) + // XSNP_NO_FWD/XSNP_HIT : ... were L3 and cross-core snoop hits in on-pkg core cache (data was shared/clean in another core's local cache) + + {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x01, 0x00), "XSNP_MISS"}, + {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x02, 0x00), "XSNP_NO_FWD"}, + {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x04, 0x00), "XSNP_FWD"}, + {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x08, 0x00), "XSNP_NONE"}, + }); + return pevg; + } + + // this lock is global, affect all threads + static std::atomic_int& get_sampling_lock() { + static std::atomic_int sampling_lock{0}; + return sampling_lock; + } +}; + +using ProfileScope = PerfEventGroup::ProfileScope; + +#if 1 +// pwe-thread event group with default events pre-selected +template +ProfileScope Profile(const std::string& title, int id = 0, Args&&... args) { + auto& pevg = PerfEventGroup::get(); + auto* pd = pevg._profile(title, id); + if (pd) { + pd->set_extra_data(std::forward(args)...); + } + return {&pevg, pd}; +} + +// overload accept sampling_probability, which can be used to disable profile in scope +template +ProfileScope Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) { + auto& pevg = PerfEventGroup::get(); + auto* pd = pevg._profile(title, id); + if (pd) { + pd->set_extra_data(std::forward(args)...); + } + + bool disable_profile = ((std::rand() % 1000)*0.001f >= sampling_probability); + if (disable_profile) { + PerfEventGroup::get_sampling_lock() ++; + } + return {&pevg, pd, disable_profile}; +} + +inline int Init() { + // this is for capture all context switching events + PerfEventCtxSwitch::get(); + + // this is for making main threads the first process + auto dummy = Profile("start"); + return 0; +} + +#else + +template +int Profile(const std::string& title, int id = 0, Args&&... 
args) { + return 0; +} + +// overload accept sampling_probability, which can be used to disable profile in scope +template +int Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) { + return 0; +} + +inline int Init() { + return 0; +} + +#endif + +} // namespace LinuxPerf From f29f7d0f005551208686d47f6dc49f4c413909ab Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Thu, 19 Dec 2024 09:27:01 +0100 Subject: [PATCH 02/13] move StatefulSDPAFusion before CommonOptimizations --- .../transformations/transformation_pipeline.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 4d7df9a335e98a..dee31df767daa0 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -431,6 +431,8 @@ void Transformations::PreLpt(const std::vector& defaultPrecis ov::pass::KeepConstAndDecompression); CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); + CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); + CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations); CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true); CPU_SET_CALLBACK_X64( @@ -654,16 +656,6 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE); - CPU_SET_CALLBACK_COMMON( - manager, - [this](const_node_ptr& node) -> bool { - std::string errorMsg; - // Current SDPA impl is optimized only for LLM models, 
so we decompose it for others to avoid perf - // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes. - return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) && - model->get_variables().size() > 0; - }, - ov::pass::ScaledDotProductAttentionDecomposition); // List of enabled/disabled transformations @@ -946,8 +938,6 @@ void Transformations::PostLpt() { #endif // OPENVINO_ARCH_X86_64 CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); - CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion); - CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape); CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false); CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm); CPU_SET_CALLBACK_X64( From 9208d96c3a813c75103ae7e5b4b01d591eb25f49 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 20 Dec 2024 02:12:40 +0100 Subject: [PATCH 03/13] add env for test --- .../intel_cpu/src/nodes/linux_perf.hpp | 2 +- .../transformation_pipeline.cpp | 40 ++++++++++++++++++- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp index fa9498fab70e81..f3c3e4304ec3da 100644 --- a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp +++ b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp @@ -1183,7 +1183,7 @@ struct PerfEventGroup : public IPerfEventDumper { using ProfileScope = PerfEventGroup::ProfileScope; -#if 1 +#if 0 // pwe-thread event group with default events pre-selected template ProfileScope Profile(const std::string& title, int id = 0, Args&&... 
args) { diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index dee31df767daa0..164568c3b51188 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -431,8 +431,12 @@ void Transformations::PreLpt(const std::vector& defaultPrecis ov::pass::KeepConstAndDecompression); CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); - CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); - CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); + auto p = std::getenv("USE_OLD"); + bool use_old = p && p[0] == '1'; + if (!use_old) { + CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); + CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); + } CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations); CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true); CPU_SET_CALLBACK_X64( @@ -656,6 +660,18 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE); + if (use_old) { + CPU_SET_CALLBACK_COMMON( + manager, + [this](const_node_ptr& node) -> bool { + std::string errorMsg; + // Current SDPA impl is optimized only for LLM models, so we decompose it for others to avoid perf + // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes. 
+ return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) && + model->get_variables().size() > 0; + }, + ov::pass::ScaledDotProductAttentionDecomposition); + } // List of enabled/disabled transformations @@ -938,6 +954,12 @@ void Transformations::PostLpt() { #endif // OPENVINO_ARCH_X86_64 CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); + auto p = std::getenv("USE_OLD"); + bool use_old = p && p[0] == '1'; + if (use_old) { + CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion); + CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape); + } CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false); CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm); CPU_SET_CALLBACK_X64( @@ -960,6 +982,20 @@ void Transformations::PostLpt() { symbolic_pipeline->get_manager()->register_pass(); postLPTPassManager.run_passes(model); + p = std::getenv("CHECK_SDPA"); + bool check_sdpa = p && p[0] == '1'; + if (check_sdpa) { + size_t count = 0; + for (auto&& node : model->get_ordered_ops()) { + if (node->get_type_name() == std::string("ScaledDotProductAttentionWithKVCache")) { + count++; + } + } + // char buf[128] = {0}; + // sprintf(buf, "KVCACHE=%ld", count); + // std::cout << buf << std::endl; + setenv("KVCACHE", std::to_string(count).c_str(), true); + } } void Transformations::MainSnippets(void) { From 5323836f13d36d4c37f83e323ce1ac223ad468db Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Mon, 23 Dec 2024 08:25:14 +0100 Subject: [PATCH 04/13] add dependent transformations --- .../transformations/utils/gen_pattern.hpp | 97 ++++++++++++++++--- .../common/pass/stateful_sdpa_fusion.cpp | 39 ++++++-- .../common/pass/stateful_sdpa_fusion.hpp | 7 ++ .../transformation_pipeline.cpp | 5 +- 4 files changed, 123 insertions(+), 25 deletions(-) diff --git a/src/common/transformations/include/transformations/utils/gen_pattern.hpp 
b/src/common/transformations/include/transformations/utils/gen_pattern.hpp index 976561b4844a17..215825d2cd13eb 100644 --- a/src/common/transformations/include/transformations/utils/gen_pattern.hpp +++ b/src/common/transformations/include/transformations/utils/gen_pattern.hpp @@ -40,6 +40,14 @@ namespace gen_pattern { #ifdef CPU_DEBUG_CAPS +# ifdef __GNUC__ +# define CURRENT_LINE_NO __builtin_LINE() +# define CURRENT_FILE __builtin_FILE() +# else +# define CURRENT_LINE_NO -1 +# define CURRENT_FILE "" +# endif + template static inline void _verbose_log(Args&&... args) { std::stringstream ss; @@ -58,6 +66,10 @@ static bool matcher_verbose_enabled() { if (matcher_verbose_enabled()) \ _verbose_log(__VA_ARGS__) #else + +# define CURRENT_LINE_NO -1 +# define CURRENT_FILE "" + static bool matcher_verbose_enabled() { return false; } @@ -181,6 +193,8 @@ class Symbol { double literal_const_value; std::shared_ptr lhs; std::shared_ptr rhs; + const char* filename = ""; + int line_no = -1; // _,+,-,*,/ // l : literal const // n : named symbol @@ -220,10 +234,12 @@ class Symbol { entity->op = 'n'; entity->name = name; } - Symbol(const int value) { + Symbol(const int value, int line_no = CURRENT_LINE_NO, const char* file = CURRENT_FILE) { entity = std::make_shared(); entity->op = 'l'; entity->literal_const_value = value; + entity->line_no = line_no; + entity->filename = file; } Symbol(char op, const Symbol& lhs, const Symbol& rhs) { entity = std::make_shared(); @@ -246,8 +262,12 @@ class Symbol { void* get_id() const { return entity.get(); } - const char* get_name() const { - return entity->name; + std::string get_name() const { + if (entity->line_no == -1 || is_independent_var()) + return entity->name; + auto filename = strrchr(entity->filename, '/') ? 
strrchr(entity->filename, '/') + 1 : entity->filename; + std::string name(filename); // use filename:lineno instead + return name + ":" + std::to_string(entity->line_no); } bool operator<(const Symbol& rhs) const { return get_id() < rhs.get_id(); @@ -739,7 +759,9 @@ class GenericPattern : public ov::pass::pattern::op::Pattern { explicit GenericPattern(const DiscreteTypeInfo& type_info, const OutputVector& args, const detail::AttrMap& attrs, - const char* vt) + const char* vt, + const int line_no = -1, + const char* file = "") : ov::pass::pattern::op::Pattern(args), m_type_info(type_info), m_attrs(attrs), @@ -758,6 +780,12 @@ class GenericPattern : public ov::pass::pattern::op::Pattern { sep = ","; } ss << ")"; + if (line_no != -1) { + // add the code line no to the log: + // O P752(P736,P745)@fuse_rotary_positional_embeddings.cpp:551 vs ... + auto filename = strrchr(file, '/') ? strrchr(file, '/') + 1 : file; + ss << "@" << filename << ":" << line_no; + } m_signature = ss.str(); set_friendly_name(std::string("P") + std::to_string(id)); } @@ -776,7 +804,13 @@ class GenericPattern : public ov::pass::pattern::op::Pattern { // strictly requires pattern & graph value to come from output port with same index, // this is absolute necessary when pattern contains split node connections. 
if (pattern_value.get_index() != graph_value.get_index()) { - _VERBOSE_LOG(level, "X output index mismatch: ", pattern_value.get_index(), "!=", graph_value.get_index()); + _VERBOSE_LOG(level, + "X output index mismatch:(", + m_signature, + "): ", + pattern_value.get_index(), + "!=", + graph_value.get_index()); return false; } @@ -1018,7 +1052,9 @@ template std::shared_ptr makePattern(const std::vector& inputs, detail::AttrMap attrmap = {}, const char* vt = nullptr, - const char* friendly_name = nullptr) { + const char* friendly_name = nullptr, + int line_no = CURRENT_LINE_NO, + const char* file = CURRENT_FILE) { OutputVector args; for (auto& in : inputs) args.push_back(in.get_output()); @@ -1026,7 +1062,8 @@ std::shared_ptr makePattern(const std::vector& inputs // pattern nodes are better for pattern matching because // - it can be generic/incomplete, so normal OP node is not working properly // - it has predicate to correctly decide which branch to take (in Or pattern) - auto pattern_node = std::make_shared(T::get_type_info_static(), args, attrmap, vt); + auto pattern_node = + std::make_shared(T::get_type_info_static(), args, attrmap, vt, line_no, file); if (friendly_name) pattern_node->set_friendly_name(friendly_name); @@ -1120,7 +1157,9 @@ inline std::shared_ptr GenStridedSlice(detail::PatternNode data, detail::PatternNode start, detail::PatternNode stop, detail::PatternNode step, - size_t axis) { + size_t axis, + int line_no = CURRENT_LINE_NO, + const char* file = CURRENT_FILE) { std::vector begin_mask(axis + 1, 1); std::vector end_mask(axis + 1, 1); std::vector new_axis_mask; @@ -1135,12 +1174,27 @@ inline std::shared_ptr GenStridedSlice(detail::PatternNode data, {"end_mask", end_mask}, {"new_axis_mask", new_axis_mask}, {"shrink_axis_mask", shrink_axis_mask}, - {"ellipsis_mask", ellipsis_mask}}); + {"ellipsis_mask", ellipsis_mask}}, + nullptr, + nullptr, + line_no, + file); return opt2; } -inline std::shared_ptr GenSlice(detail::PatternNode data, Symbol start, 
Symbol stop, Symbol step, size_t axis) { - auto opt1 = makePattern({data, {start}, {stop}, {step}, {static_cast(axis)}}); +inline std::shared_ptr GenSlice(detail::PatternNode data, + Symbol start, + Symbol stop, + Symbol step, + size_t axis, + int line_no = CURRENT_LINE_NO, + const char* file = CURRENT_FILE) { + auto opt1 = makePattern({data, {start}, {stop}, {step}, {static_cast(axis)}}, + {}, + nullptr, + nullptr, + line_no, + file); std::vector vbegin(axis + 1, Symbol(0)); std::vector vend(axis + 1, Symbol(0)); @@ -1168,7 +1222,11 @@ inline std::shared_ptr GenSlice(detail::PatternNode data, Symbol start, Sy {"end_mask", end_mask}, {"new_axis_mask", new_axis_mask}, {"shrink_axis_mask", shrink_axis_mask}, - {"ellipsis_mask", ellipsis_mask}}); + {"ellipsis_mask", ellipsis_mask}}, + nullptr, + nullptr, + line_no, + file); return opt1 | opt2; } @@ -1329,7 +1387,9 @@ class PatternValidator { auto id = sym.get_id(); if (symbol_value_map.count(id)) { if (symbol_value_map[id] != value) { - _VERBOSE_LOG(" in-consistency between multiple references of same symbol : ", + _VERBOSE_LOG(" in-consistency between multiple references of same symbol(", + sym.get_name(), + "): ", symbol_value_map[id], " != ", value); @@ -1345,7 +1405,12 @@ class PatternValidator { if (sym.is_literal_const()) { auto literal = sym.eval(symbol_value_map); if (literal != value) { - _VERBOSE_LOG(" mismatch between literal symbol & value : ", literal, " != ", value); + _VERBOSE_LOG(" mismatch between literal symbol & value(", + sym.get_name(), + "): ", + literal, + " != ", + value); return false; } // no need to put literal into value map to eval them. 
@@ -1373,7 +1438,9 @@ class PatternValidator { } } if (!is_match) { - _VERBOSE_LOG(" mismatch between derived & value : ", + _VERBOSE_LOG(" mismatch between derived & value(", + sym.get_name(), + "): ", std::setprecision(std::numeric_limits::max_digits10), derived, " != ", diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index 447adb0b2fe23f..08b5ec14f32e1c 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -20,7 +21,11 @@ #include "itt.hpp" #include "openvino/opsets/opset1.hpp" #include "ov_ops/type_relaxed.hpp" +#include "transformations/common_optimizations/simplify_shape_of_sub_graph.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" +#include "transformations/defs.hpp" +#include "transformations/op_conversions/convert_broadcast3.hpp" +#include "transformations/transpose_sinking/ts_shape_of.hpp" using namespace ov::gen_pattern; namespace ov { @@ -57,7 +62,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() { std::shared_ptr computed_bcst_k, computed_bcst_v, multiply_k, multiply_v; std::shared_ptr mq_reshape_k, mq_reshape_v; auto multi_query_bcst = [](const std::shared_ptr& kv) { - auto reshape_kv = wrap_type({kv, any_input()}); + auto reshape_kv = makePattern({kv, any_input()}); auto unsqueeze_kv = makePattern({kv, any_input()}); auto check_one = [](Output output) -> bool { @@ -73,8 +78,8 @@ StatefulSDPAFusion::StatefulSDPAFusion() { makePattern({wrap_type(check_one), any_input(), any_input()}, {{"mode", "numpy"}}); - auto multiply_kv = wrap_type({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst}); - auto result = wrap_type({multiply_kv, any_input()}); + auto multiply_kv = 
makePattern({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst}); + auto result = makePattern({multiply_kv, any_input()}); return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv); }; @@ -178,15 +183,19 @@ StatefulSDPAFusion::StatefulSDPAFusion() { opset6::Assign *assign_k_node = nullptr, *assign_v_node = nullptr; opset1::Convert *assign_cvt_k_node = nullptr, *assign_cvt_v_node = nullptr; - if (!find_assign(concat_k_node, assign_k_node, assign_cvt_k_node)) + if (!find_assign(concat_k_node, assign_k_node, assign_cvt_k_node)) { return false; - if (past_k_node->get_variable_id() != assign_k_node->get_variable_id()) + } + if (past_k_node->get_variable_id() != assign_k_node->get_variable_id()) { return false; + } - if (!find_assign(concat_v_node, assign_v_node, assign_cvt_v_node)) + if (!find_assign(concat_v_node, assign_v_node, assign_cvt_v_node)) { return false; - if (past_v_node->get_variable_id() != assign_v_node->get_variable_id()) + } + if (past_v_node->get_variable_id() != assign_v_node->get_variable_id()) { return false; + } auto is_optional_one_child = [&pattern_map](const std::vector>& nodes) { for (auto&& node : nodes) { @@ -284,5 +293,21 @@ StatefulSDPAFusion::StatefulSDPAFusion() { this->register_matcher(m, callback); } +bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { + RUN_ON_FUNCTION_SCOPE(SDPASubgraphFusion); + using namespace ov::pass::pattern; + ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion"); + manager.set_per_pass_validation(false); + + CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertBroadcast3); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); + CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); + + manager.run_passes(f); + return false; +} + } // namespace intel_cpu } // namespace ov diff --git 
a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp index 96028402aa9f92..59494736bb2c2e 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.hpp @@ -14,5 +14,12 @@ class StatefulSDPAFusion : public ov::pass::MatcherPass { StatefulSDPAFusion(); }; +class SDPASubgraphFusion : public ov::pass::ModelPass { +public: + OPENVINO_RTTI("SDPASubgraphFusion", "0"); + + bool run_on_model(const std::shared_ptr& f) override; +}; + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 164568c3b51188..21413addc3187a 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -434,8 +434,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis auto p = std::getenv("USE_OLD"); bool use_old = p && p[0] == '1'; if (!use_old) { - CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); - CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); + CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); } CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations); CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true); @@ -953,10 +952,10 @@ void Transformations::PostLpt() { } #endif // OPENVINO_ARCH_X86_64 - CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); auto p = std::getenv("USE_OLD"); bool use_old = p && p[0] == '1'; if (use_old) { + CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); 
CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion); CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape); } From e95bedc483cff3c5386f5f49d9e7c18f5eb48e03 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Tue, 24 Dec 2024 09:31:21 +0100 Subject: [PATCH 05/13] fix mixtral failure --- .../common/pass/stateful_sdpa_fusion.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index 08b5ec14f32e1c..45ea04b8fc753a 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -61,6 +61,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() { std::shared_ptr reshape_k, reshape_v, unsqueeze_k, unsqueeze_v; std::shared_ptr computed_bcst_k, computed_bcst_v, multiply_k, multiply_v; std::shared_ptr mq_reshape_k, mq_reshape_v; + std::shared_ptr computed_bcst3_k, computed_bcst3_v; auto multi_query_bcst = [](const std::shared_ptr& kv) { auto reshape_kv = makePattern({kv, any_input()}); auto unsqueeze_kv = makePattern({kv, any_input()}); @@ -79,12 +80,16 @@ StatefulSDPAFusion::StatefulSDPAFusion() { {{"mode", "numpy"}}); auto multiply_kv = makePattern({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst}); - auto result = makePattern({multiply_kv, any_input()}); - return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv); + auto computed_bcst3 = + makePattern({unsqueeze_kv, any_input()}, + {{"mode", "bidirectional"}}); + + auto result = makePattern({multiply_kv | computed_bcst3, any_input()}); + return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv, computed_bcst3); }; - std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, 
multiply_k) = multi_query_bcst(concat_k); - std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v) = multi_query_bcst(concat_v); + std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) = multi_query_bcst(concat_k); + std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) = multi_query_bcst(concat_v); auto present_k = concat_k | mq_reshape_k; auto present_v = concat_v | mq_reshape_v; @@ -221,7 +226,9 @@ StatefulSDPAFusion::StatefulSDPAFusion() { computed_bcst_v, multiply_v, mq_reshape_k, - mq_reshape_v})) { + mq_reshape_v, + computed_bcst3_k, + computed_bcst3_v})) { return false; } @@ -300,7 +307,6 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { manager.set_per_pass_validation(false); CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::ConvertBroadcast3); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); From bbc20a2616a4e4ec393accdb524a5f76a7966e3e Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Tue, 24 Dec 2024 09:36:13 +0100 Subject: [PATCH 06/13] code clean --- src/plugins/intel_cpu/src/graph.cpp | 14 +- .../intel_cpu/src/nodes/linux_perf.hpp | 1242 ----------------- .../common/pass/stateful_sdpa_fusion.cpp | 10 +- .../transformation_pipeline.cpp | 39 +- 4 files changed, 10 insertions(+), 1295 deletions(-) delete mode 100644 src/plugins/intel_cpu/src/nodes/linux_perf.hpp diff --git a/src/plugins/intel_cpu/src/graph.cpp b/src/plugins/intel_cpu/src/graph.cpp index 92b541f9b2543a..aab78a4d5f15bd 100644 --- a/src/plugins/intel_cpu/src/graph.cpp +++ b/src/plugins/intel_cpu/src/graph.cpp @@ -45,7 +45,6 @@ #include "utils/node_dumper.h" #include "utils/precision_support.h" #include "utils/verbose.h" -#include "nodes/linux_perf.hpp" #if 
(OV_THREAD == OV_THREAD_TBB || OV_THREAD == OV_THREAD_TBB_AUTO) # include @@ -109,7 +108,6 @@ void Graph::Replicate(const std::shared_ptr& model, OV_ITT_SCOPE_CHAIN(FIRST_INFERENCE, taskChain, itt::domains::intel_cpu_LT, "Graph::Replicate", "ov::Model"); this->_name = model->get_friendly_name(); - LinuxPerf::Init(); // Map data object onto producer node std::map, NodePtr> op2node; @@ -1164,7 +1162,6 @@ VecMemoryDescs Graph::getOutputMemoryDescriptors() const { void Graph::InferStatic(SyncInferRequest* request, int numaId) { for (const auto& node : m_executableGraphNodes) { - auto perf1 = LinuxPerf::Profile(node->getTypeStr()); ExecuteNodeWithCatch(node, request, numaId); } } @@ -1440,15 +1437,11 @@ inline void Graph::ExecuteNodeWithCatch(const NodePtr& node, SyncInferRequest* r template void Graph::InferDynamic(SyncInferRequest* request, int numaId, UpdateStrategy&& update) { size_t inferCounter = 0; - auto perf = LinuxPerf::Profile(std::string("Graph::InferDynamic_#") + std::to_string(infer_count)); for (auto stopIndx : m_executableSyncNodesInds) { - { - auto perf1 = LinuxPerf::Profile("update"); - update(stopIndx); - } + update(stopIndx); + for (; inferCounter < stopIndx; ++inferCounter) { auto& node = m_executableGraphNodes[inferCounter]; - auto perf1 = LinuxPerf::Profile(node->getTypeStr()); // + "_" + node->getName()); ExecuteNodeWithCatch(node, request, numaId); } @@ -1494,7 +1487,8 @@ void Graph::Infer(SyncInferRequest* request) { static_cast(status)); } - infer_count++; + if (infer_count != -1) + infer_count++; } void Graph::SortTopologically() { diff --git a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp b/src/plugins/intel_cpu/src/nodes/linux_perf.hpp deleted file mode 100644 index f3c3e4304ec3da..00000000000000 --- a/src/plugins/intel_cpu/src/nodes/linux_perf.hpp +++ /dev/null @@ -1,1242 +0,0 @@ - -#include -#include -//#include -#include -#include -#include - -#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 30 -#include -#define gettid() syscall(SYS_gettid) 
-#endif - -inline int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { - return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); -} - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace LinuxPerf { - -#define _LINE_STRINGIZE(x) _LINE_STRINGIZE2(x) -#define _LINE_STRINGIZE2(x) #x -#define LINE_STRING _LINE_STRINGIZE(__LINE__) - -#define LINUX_PERF_ "\e[33m[LINUX_PERF:" LINE_STRING "]\e[0m " - -inline uint64_t get_time_ns() { - struct timespec tp0; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &tp0) != 0) { - perror(LINUX_PERF_"clock_gettime(CLOCK_MONOTONIC_RAW,...) failed!"); - abort(); - } - return (tp0.tv_sec * 1000000000) + tp0.tv_nsec; -} - -struct TscCounter { - uint64_t tsc_ticks_per_second; - uint64_t tsc_ticks_base; - double tsc_to_usec(uint64_t tsc_ticks) const { - if (tsc_ticks < tsc_ticks_base) - return 0; - return (tsc_ticks - tsc_ticks_base) * 1000000.0 / tsc_ticks_per_second; - } - double tsc_to_usec(uint64_t tsc_ticks0, uint64_t tsc_ticks1) const { - if (tsc_ticks1 < tsc_ticks0) - return 0; - return (tsc_ticks1 - tsc_ticks0) * 1000000.0 / tsc_ticks_per_second; - } - TscCounter() { - uint64_t start_ticks = __rdtsc(); - std::this_thread::sleep_for(std::chrono::seconds(1)); - tsc_ticks_per_second = (__rdtsc() - start_ticks); - std::cout << LINUX_PERF_"tsc_ticks_per_second = " << tsc_ticks_per_second << std::endl; - tsc_ticks_base = __rdtsc(); - - // use CLOCK_MONOTONIC_RAW instead of TSC - tsc_ticks_per_second = 1000000000; // ns - tsc_ticks_base = get_time_ns(); - } -}; - -class IPerfEventDumper { -public: - virtual void dump_json(std::ofstream& fw, TscCounter& tsc) = 0; -}; - -struct PerfEventJsonDumper { - std::mutex g_mutex; - std::set all_dumpers; - const char* dump_file_name = "perf_dump.json"; - bool dump_file_over = false; - bool not_finalized = 
true; - std::ofstream fw; - std::atomic_int totalProfilerManagers{0}; - TscCounter tsc; - - ~PerfEventJsonDumper() { - if (not_finalized) - finalize(); - } - - void finalize() { - if (!not_finalized) - return; - std::lock_guard guard(g_mutex); - if (dump_file_over || all_dumpers.empty()) - return; - - // start dump - fw.open(dump_file_name, std::ios::out); - fw << "{\n"; - fw << "\"schemaVersion\": 1,\n"; - fw << "\"traceEvents\": [\n"; - fw.flush(); - - for (auto& pthis : all_dumpers) { - pthis->dump_json(fw, tsc); - } - all_dumpers.clear(); - - fw << R"({ - "name": "Profiler End", - "ph": "i", - "s": "g", - "pid": "Traces", - "tid": "Trace OV Profiler", - "ts":)" - << tsc.tsc_to_usec(get_time_ns()) << "}", - fw << "]\n"; - fw << "}\n"; - auto total_size = fw.tellp(); - fw.close(); - dump_file_over = true; - not_finalized = false; - - std::cout << LINUX_PERF_"Dumpped "; - - if (total_size < 1024) std::cout << total_size << " bytes "; - else if (total_size < 1024*1024) std::cout << total_size/1024 << " KB "; - else std::cout << total_size/(1024 * 1024) << " MB "; - std::cout << " to " << dump_file_name << std::endl; - } - - int register_manager(IPerfEventDumper* pthis) { - std::lock_guard guard(g_mutex); - std::stringstream ss; - auto serial_id = totalProfilerManagers.fetch_add(1); - ss << LINUX_PERF_"#" << serial_id << "(" << pthis << ") : is registed." 
/// Splits `s` at every occurrence of `delimiter`.
///
/// Empty fields are preserved, so the result always has at least one element
/// (splitting "" yields {""}). An empty delimiter cannot split anything and
/// returns `s` unchanged as the only element.
inline std::vector<std::string> str_split(const std::string& s, std::string delimiter) {
    std::vector<std::string> ret;
    if (delimiter.empty()) {
        // find("") matches at every position; treat it as "no separator present".
        ret.push_back(s);
        return ret;
    }
    size_t last = 0;
    size_t next = 0;
    while ((next = s.find(delimiter, last)) != std::string::npos) {
        ret.push_back(s.substr(last, next - last));
        // Skip the whole delimiter, not just its first character.
        last = next + delimiter.size();
    }
    ret.push_back(s.substr(last));
    return ret;
}
early as possible - switch_cpu = true; - CPU_ZERO(&cpu_mask); - if (sched_getaffinity(getpid(), sizeof(cpu_set_t), &cpu_mask)) { - perror(LINUX_PERF_"sched_getaffinity failed:"); - abort(); - } - } - if (items[0] == "dump") - dump = std::numeric_limits::max(); // no limit to number of dumps - } - } - - for(auto& cfg : raw_configs) { - printf(LINUX_PERF_" config: %s=0x%lx\n", cfg.first.c_str(), cfg.second); - } - if (switch_cpu) { - printf(LINUX_PERF_" config: switch_cpu\n"); - } - if (dump) - printf(LINUX_PERF_" config: dump=%ld\n", dump); - if (CPU_COUNT(&cpu_mask)) { - printf(LINUX_PERF_" config: cpus="); - for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) - if(CPU_ISSET(cpu, &cpu_mask)) printf("%d,", cpu); - printf("\n"); - } - } else { - printf(LINUX_PERF_" LINUX_PERF is unset, example: LINUX_PERF=dump,switch-cpu,L2_MISS=0x10d1\n"); - } - } - - bool dump_on_cpu(int cpu) { - if (dump == 0) - return false; - if (CPU_COUNT(&cpu_mask)) - return CPU_ISSET(cpu, &cpu_mask); - return true; - } - - int64_t dump = 0; - cpu_set_t cpu_mask; - bool switch_cpu = false; - std::vector dump_cpus; - std::vector> raw_configs; - - static PerfRawConfig& get() { - static PerfRawConfig inst; - return inst; - } -}; - - -// context switch events -// this will visualize -struct PerfEventCtxSwitch : public IPerfEventDumper { - bool is_enabled; - - struct event { - int fd; - perf_event_mmap_page * meta; - int cpu; - uint64_t ctx_switch_in_time; - uint64_t ctx_switch_in_tid; - uint64_t ctx_last_time; - - event(int fd, perf_event_mmap_page * meta): fd(fd), meta(meta) {} - }; - std::vector events; - - PerfEventCtxSwitch() { - is_enabled = PerfRawConfig::get().switch_cpu; - if (is_enabled) { - // make sure TSC in PerfEventJsonDumper is the very first thing to initialize - PerfEventJsonDumper::get().register_manager(this); - - // open fd for each CPU - cpu_set_t mask = PerfRawConfig::get().cpu_mask; - - long number_of_processors = sysconf(_SC_NPROCESSORS_ONLN); - 
printf(LINUX_PERF_"sizeof(cpu_set_t):%lu: _SC_NPROCESSORS_ONLN=%ld CPU_COUNT=%d\n", sizeof(cpu_set_t), number_of_processors, CPU_COUNT(&mask)); - if (CPU_COUNT(&mask) >= number_of_processors) { - printf(LINUX_PERF_" no affinity is set, will not enable PerfEventCtxSwitch\n"); - is_enabled = false; - return; - } - - for (int cpu = 0; cpu < (int)sizeof(cpu_set_t)*8; cpu++) { - auto is_set = CPU_ISSET(cpu, &mask); - if (!is_set) continue; - - perf_event_attr pea; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = PERF_COUNT_HW_REF_CPU_CYCLES; // not the point, can be any - pea.disabled = 0; - pea.exclude_kernel = 1; - pea.exclude_hv = 1; - pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; - // pinned: It applies only to hardware counters and only to group leaders - pea.pinned = 1; - pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - - // for group master, generate PERF_RECORD_SWITCH into ring-buffer - // is helpful to visualize context switch - pea.context_switch = 1; - // then TID, TIME, ID, STREAM_ID, and CPU can additionally be included in non-PERF_RECORD_SAMPLEs - // if the corresponding sample_type is selected - pea.sample_id_all = 1; - pea.sample_type = PERF_SAMPLE_TIME | PERF_SAMPLE_TID | PERF_SAMPLE_CPU; - auto mmap_length = sysconf(_SC_PAGESIZE) * (1024 + 1); - pea.use_clockid = 1; - pea.clockid = CLOCK_MONOTONIC_RAW; - - // calling thread on any processor - pid_t pid = -1; - // measures all processes/threads on the specified CPU - int ctx_switch_fd = perf_event_open(&pea, pid, cpu, -1, 0); - if (ctx_switch_fd < 0) { - perror(LINUX_PERF_"PerfEventCtxSwitch perf_event_open failed (check /proc/sys/kernel/perf_event_paranoid please)"); - abort(); - } - - auto* ctx_switch_pmeta = reinterpret_cast(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ctx_switch_fd, 0)); - if (ctx_switch_pmeta == MAP_FAILED) { - 
perror(LINUX_PERF_"mmap perf_event_mmap_page failed:"); - close(ctx_switch_fd); - abort(); - } - printf(LINUX_PERF_"perf_event_open CPU_WIDE context_switch on cpu %d, ctx_switch_fd=%d\n", cpu, ctx_switch_fd); - events.emplace_back(ctx_switch_fd, ctx_switch_pmeta); - events.back().ctx_switch_in_time = get_time_ns(); - events.back().ctx_last_time = get_time_ns(); - events.back().cpu = cpu; - } - my_pid = getpid(); - my_tid = gettid(); - } - } - - ~PerfEventCtxSwitch() { - if (is_enabled) { - PerfEventJsonDumper::get().finalize(); - } - for(auto& ev : events) { - close(ev.fd); - } - } - - struct ProfileData { - uint64_t tsc_start; - uint64_t tsc_end; - uint32_t tid; - uint32_t cpu; - bool preempt; // preempt means current TID preempts previous thread - }; - - std::deque all_dump_data; - - void dump_json(std::ofstream& fw, TscCounter& tsc) override { - static std::atomic_uint64_t async_evid{0}; - if (!is_enabled) return; - - updateRingBuffer(); - - auto data_size = all_dump_data.size(); - if (!data_size) return; - - for (auto& ev : events) { - if (ev.ctx_switch_in_time == 0) continue; - all_dump_data.emplace_back(); - auto* pd = &all_dump_data.back(); - pd->tid = ev.ctx_switch_in_tid; - pd->cpu = ev.cpu; - pd->tsc_start = ev.ctx_switch_in_time; - pd->tsc_end = get_time_ns(); - ev.ctx_switch_in_time = 0; - } - - auto pid = 9999; // fake pid for CPU - auto cat = "TID"; - - // TID is used for CPU id instead - for (auto& d : all_dump_data) { - auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end); - auto start = tsc.tsc_to_usec(d.tsc_start); - //auto end = tsc.tsc_to_usec(d.tsc_end); - auto cpu_id = d.cpu; - - fw << "{\"ph\": \"X\", \"name\": \"" << d.tid << "\", \"cat\":\"" << cat << "\"," - << "\"pid\": " << pid << ", \"tid\": \"CPU" << cpu_id << "\"," - << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << "},\n"; - } - } - - bool ring_buffer_verbose = false; - uint32_t my_pid = 0; - uint32_t my_tid = 0; - std::atomic atom_gard{0}; - - void 
updateRingBuffer() { - // only one thread can enter - const int lock_value = atom_gard.exchange(1); - if (lock_value == 1) { - // has been locked, return; - return; - } - - // only update when any ring-buffer is half loaded - bool need_update = false; - for(auto& ev : events) { - auto& mmap_meta = *ev.meta; - auto used_size = (mmap_meta.data_tail - mmap_meta.data_head) % mmap_meta.data_size; - if (used_size > (mmap_meta.data_size >> 1)) { - need_update = true; - break; - } - } - - if (!need_update) { - // unlock - atom_gard.exchange(0); - return; - } - - for(auto& ev : events) { - auto& mmap_meta = *ev.meta; - uint64_t head0 = mmap_meta.data_tail; - uint64_t head1 = mmap_meta.data_head; - //printf("ring-buffer@end: %lu~%lu %llu %llu %llu\n", head0, head1, group_meta.data_tail, group_meta.data_offset, group_meta.data_size); - - if (head0 != head1) { - if (ring_buffer_verbose) { - printf("PERF_RECORD_SWITCH = %d\n", PERF_RECORD_SWITCH); - printf("PERF_RECORD_SWITCH_CPU_WIDE = %d\n", PERF_RECORD_SWITCH_CPU_WIDE); - printf("PERF_RECORD_MISC_SWITCH_OUT = %d\n", PERF_RECORD_MISC_SWITCH_OUT); - printf("PERF_RECORD_MISC_SWITCH_OUT_PREEMPT = %d\n", PERF_RECORD_MISC_SWITCH_OUT_PREEMPT); - } - - while(head0 < head1) { - auto h0 = head0; - auto type = read_ring_buffer<__u32>(mmap_meta, head0); - auto misc = read_ring_buffer<__u16>(mmap_meta, head0); - auto size = read_ring_buffer<__u16>(mmap_meta, head0); - uint32_t next_prev_pid = 0, next_prev_tid = 0; - if (type == PERF_RECORD_SWITCH_CPU_WIDE) { - // previous PID/TID if switching-in - // next PID/TID if switching-out - next_prev_pid = read_ring_buffer<__u32>(mmap_meta, head0); - next_prev_tid = read_ring_buffer<__u32>(mmap_meta, head0); - } - auto pid = read_ring_buffer<__u32>(mmap_meta, head0); - auto tid = read_ring_buffer<__u32>(mmap_meta, head0); - auto time = read_ring_buffer(mmap_meta, head0); - auto cpu = read_ring_buffer<__u32>(mmap_meta, head0); - auto reserved0 = read_ring_buffer<__u32>(mmap_meta, head0); - 
(void)reserved0; - (void)next_prev_pid; - (void)pid; - - // skip idle process (with TID 0) - if (tid > 0 && ring_buffer_verbose) { - printf("event: %lu/%lu\ttype,misc,size=(%u,%u,%u) cpu%u,next_prev_tid=%u,tid=%u time:(%lu), (+%lu)\n", - h0, head1, - type, misc, size, - cpu, next_prev_tid, tid, - time, - time - ev.ctx_last_time); - } - - if (type == PERF_RECORD_SWITCH_CPU_WIDE && tid > 0) { - if (misc & PERF_RECORD_MISC_SWITCH_OUT || misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT) { - // switch out - // generate a log - all_dump_data.emplace_back(); - auto* pd = &all_dump_data.back(); - pd->tid = tid; - pd->cpu = cpu; - pd->preempt = (misc & PERF_RECORD_MISC_SWITCH_OUT_PREEMPT); - //printf("ctx_switch_in_time=%lu\n", ctx_switch_in_time); - pd->tsc_start = ev.ctx_switch_in_time; - pd->tsc_end = time; - - if (ring_buffer_verbose) printf("\t cpu: %u tid: %u %lu (+%lu)\n", cpu, tid, ev.ctx_switch_in_time, time-ev.ctx_switch_in_time); - - ev.ctx_switch_in_time = 0; - } else { - // switch in - ev.ctx_switch_in_time = time; - ev.ctx_switch_in_tid = tid; - } - } - - ev.ctx_last_time = time; - head0 += size - (head0 - h0); - } - - if (head0 != head1) { - printf("head0(%lu) != head1(%lu)\n", head0, head1); - abort(); - } - - // update tail so kernel can keep generate event records - mmap_meta.data_tail = head0; - std::atomic_thread_fence(std::memory_order_seq_cst); - } - } - atom_gard.exchange(0); - } - - static PerfEventCtxSwitch& get() { - static PerfEventCtxSwitch inst; - return inst; - } -}; - -/* -RAW HARDWARE EVENT DESCRIPTOR - Even when an event is not available in a symbolic form within perf right now, it can be encoded in a per processor specific way. 
// Builds the `config` value of a PERF_TYPE_RAW event from its IA32_PERFEVTSELx fields:
// event select in bits 7:0, unit mask in bits 15:8, counter mask in bits 31:24.
// Every argument is parenthesised (via the cast) and widened to 64 bits before shifting,
// so expressions can be passed as arguments and a CMask with its top bit set does not
// overflow a signed int.
#define X86_RAW_EVENT(EventSel, UMask, CMask) \
    ((static_cast<uint64_t>(CMask) << 24) | (static_cast<uint64_t>(UMask) << 8) | static_cast<uint64_t>(EventSel))
char get_extra_type(T t) { - if (std::is_pointer::value) return 'p'; - if (std::is_floating_point::value) return 'f'; - if (std::is_integral::value) return 'i'; - return '\0'; - } - template - void set_extra_data(int i, T* t) { extra_data[i].p = t; } - void set_extra_data(int i, float t) { extra_data[i].f = t; } - void set_extra_data(int i, double t) { extra_data[i].f = t; } - template - void set_extra_data(int i, T t) { - static_assert(std::is_integral::value); - extra_data[i].i = t; - } - - template - void set_extra_data(Values... vals) { - static_assert(data_size >= sizeof...(vals)); - int j = 0; - int unused1[] = { 0, (set_extra_data(j++, vals), 0)... }; - (void)unused1; - j = 0; - int unused2[] = { 0, (extra_data_type[j++] = get_extra_type(vals), 0)... }; - (void)unused2; - extra_data_type[j] = '\0'; - } - - ProfileData(const std::string& title) : title(title) { - start(); - } - void start() { - tsc_start = get_time_ns(); - } - void stop() { - tsc_end = get_time_ns(); - } - }; - - bool enable_dump_json = false; - int64_t dump_limit = 0; - std::deque all_dump_data; - int serial; - - using CallBackEventArgsSerializer = std::function; - CallBackEventArgsSerializer fn_evt_args_serializer; - - void dump_json(std::ofstream& fw, TscCounter& tsc) override { - static std::atomic_uint64_t async_evid{0}; - if (!enable_dump_json) - return; - auto data_size = all_dump_data.size(); - if (!data_size) - return; - - for (auto& d : all_dump_data) { - auto duration = tsc.tsc_to_usec(d.tsc_start, d.tsc_end); - auto title = std::string(d.title) + "_" + std::to_string(d.id); - auto cat = d.cat; - //auto pid = serial; - auto start = tsc.tsc_to_usec(d.tsc_start); - //auto end = tsc.tsc_to_usec(d.tsc_end); - - if (d.id < 0) { - // async events - // {"cat": "foo", "name": "async_read2", "pid": 4092243, "id": 4092246, "ph": "b", "ts": 23819.718}, - fw << "{\"ph\": \"b\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\"," - << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << 
"," - << "\"ts\": " << std::setprecision (15) << start << "},"; - - fw << "{\"ph\": \"e\", \"name\": \"" << d.title << "\", \"cat\":\"" << cat << "\"," - << "\"pid\": " << my_pid << ", \"id\": " << (-d.id) << "," - << "\"ts\": " << std::setprecision (15) << tsc.tsc_to_usec(d.tsc_end) << ","; - } else { - fw << "{\"ph\": \"X\", \"name\": \"" << title << "\", \"cat\":\"" << cat << "\"," - << "\"pid\": " << my_pid << ", \"tid\": " << my_tid << "," - << "\"ts\": " << std::setprecision (15) << start << ", \"dur\": " << duration << ","; - } - - fw << "\"args\":{"; - { - std::stringstream ss; - if (fn_evt_args_serializer) - fn_evt_args_serializer(ss, duration, d.data); - if (sw_task_clock_evid >= 0) { - // PERF_COUNT_SW_TASK_CLOCK in nano-seconds - ss << "\"CPU Usage\":" << (d.data[sw_task_clock_evid] * 1e-3)/duration << ","; - } - if (hw_cpu_cycles_evid >= 0) { - if (sw_task_clock_evid >= 0 && d.data[sw_task_clock_evid] > 0) { - ss << "\"CPU Freq(GHz)\":" << static_cast(d.data[hw_cpu_cycles_evid])/d.data[sw_task_clock_evid] << ","; - } else { - ss << "\"CPU Freq(GHz)\":" << static_cast(d.data[hw_cpu_cycles_evid])*1e-3/duration << ","; - } - if (hw_instructions_evid >= 0 && d.data[hw_instructions_evid] > 0) { - ss << "\"CPI\":" << static_cast(d.data[hw_cpu_cycles_evid])/d.data[hw_instructions_evid] << ","; - } - } - auto prev_locale = ss.imbue(std::locale("")); - const char * sep = ""; - for(size_t i = 0; i < events.size() && i < d.data_size; i++) { - ss << sep << "\"" << events[i].name << "\":\"" << d.data[i] << "\""; - sep = ","; - } - ss.imbue(prev_locale); - if (d.extra_data_type[0] != 0) { - sep = ""; - ss << ",\"Extra Data\":["; - for(size_t i = 0; i < d.data_size && (d.extra_data_type[i] != 0); i++) { - if (d.extra_data_type[i] == 'f') ss << sep << d.extra_data[i].f; - else if (d.extra_data_type[i] == 'i') ss << sep << d.extra_data[i].i; - else if (d.extra_data_type[i] == 'p') ss << sep << "\"" << d.extra_data[i].p << "\""; - else ss << sep << "\"?\""; - sep = ","; 
- } - ss << "]"; - } - fw << ss.str(); - } - fw << "}},\n"; - } - all_dump_data.clear(); - std::cout << LINUX_PERF_"#" << serial << "(" << this << ") finalize: dumpped " << data_size << std::endl; - } - - uint64_t operator[](size_t i) { - if (i < events.size()) { - return values[i]; - } else { - printf(LINUX_PERF_"PerfEventGroup: operator[] with index %lu oveflow (>%lu)\n", i, events.size()); - abort(); - } - return 0; - } - - PerfEventGroup() = default; - - struct Config { - uint32_t type; - uint64_t config; - const char * name; - Config(uint32_t type, uint64_t config, const char * name = "?") : type(type), config(config), name(name) {} - }; - - uint32_t my_pid = 0; - uint32_t my_tid = 0; - - PerfEventGroup(const std::vector type_configs, CallBackEventArgsSerializer fn = {}) : fn_evt_args_serializer(fn) { - for(auto& tc : type_configs) { - if (tc.type == PERF_TYPE_SOFTWARE) { - add_sw(tc.config); - } - if (tc.type == PERF_TYPE_HARDWARE) { - add_hw(tc.config); - } - if (tc.type == PERF_TYPE_RAW) { - add_raw(tc.config); - } - events.back().name = tc.name; - snprintf(events.back().format, sizeof(events.back().format), "%%%lulu, ", strlen(tc.name)); - } - - // env var defined raw events - for (auto raw_cfg : PerfRawConfig::get().raw_configs) { - add_raw(raw_cfg.second); - events.back().name = raw_cfg.first; - } - - dump_limit = PerfRawConfig::get().dump; - enable_dump_json = PerfRawConfig::get().dump_on_cpu(sched_getcpu()); - serial = 0; - if (enable_dump_json) { - serial = PerfEventJsonDumper::get().register_manager(this); - } - my_pid = getpid(); - my_tid = gettid(); - - enable(); - } - - ~PerfEventGroup() { - if (enable_dump_json) - PerfEventJsonDumper::get().finalize(); - disable(); - for(auto & ev : events) { - close(ev.fd); - } - } - - void show_header() { - std::stringstream ss; - ss << "\e[33m"; - ss << "#" << serial << ":"; - for(auto& ev : events) { - ss << ev.name << ", "; - } - ss << "\e[0m\n"; - std::cout << ss.str(); - } - - void add_raw(uint64_t config, 
bool pinned=false) { - perf_event_attr pea; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_RAW; - pea.size = sizeof(struct perf_event_attr); - pea.config = config; - pea.disabled = 1; - pea.exclude_kernel = 1; - pea.exclude_hv = 1; - pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; - if (pinned && group_fd == -1) { - // pinned: It applies only to hardware counters and only to group leaders - pea.pinned = 1; - } - if (group_fd == -1) { - pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - } - add(&pea); - } - - void add_hw(uint64_t config, bool pinned=false) { - perf_event_attr pea; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_HARDWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = config; - pea.disabled = 1; - pea.exclude_kernel = 1; - pea.exclude_hv = 1; - pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID; - if (pinned && group_fd == -1) { - // pinned: It applies only to hardware counters and only to group leaders - pea.pinned = 1; - } - if (group_fd == -1) { - pea.read_format |= PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - } - add(&pea); - } - - void add_sw(uint64_t config) { - perf_event_attr pea; - memset(&pea, 0, sizeof(struct perf_event_attr)); - pea.type = PERF_TYPE_SOFTWARE; - pea.size = sizeof(struct perf_event_attr); - pea.config = config; - pea.disabled = 1; - pea.exclude_kernel = 0; // some SW events are counted as kernel - pea.exclude_hv = 1; - //pea.pinned = 1; //sw event cannot set pinned!!! 
- pea.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID ; - add(&pea); - } - - void add(perf_event_attr* pev_attr, pid_t pid = 0, int cpu = -1) { - event ev; - - size_t mmap_length = sysconf(_SC_PAGESIZE) * 1; - // clockid must consistent within group - pev_attr->use_clockid = 1; - // can be synched with clock_gettime(CLOCK_MONOTONIC_RAW) - pev_attr->clockid = CLOCK_MONOTONIC_RAW; - - RETRY: - ev.fd = perf_event_open(pev_attr, pid, cpu, group_fd, 0); - if (ev.fd < 0) { - if (!pev_attr->exclude_kernel) { - printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) with exclude_kernel=0 failed (due to /proc/sys/kernel/perf_event_paranoid is 2), set exclude_kernel=1 and retry...\n", - pev_attr->type, pev_attr->config); - pev_attr->exclude_kernel = 1; - goto RETRY; - } else { - printf(LINUX_PERF_"perf_event_open(type=%d,config=%lld) failed", pev_attr->type, pev_attr->config); - perror(""); - abort(); - } - } - ioctl(ev.fd, PERF_EVENT_IOC_ID, &ev.id); - - ev.pmeta = reinterpret_cast(mmap(NULL, mmap_length, PROT_READ | PROT_WRITE, MAP_SHARED, ev.fd, 0)); - if (ev.pmeta == MAP_FAILED) { - perror(LINUX_PERF_"mmap perf_event_mmap_page failed:"); - close(ev.fd); - abort(); - } - - if (group_fd == -1) { - group_fd = ev.fd; - read_format = pev_attr->read_format; - } - if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_REF_CPU_CYCLES) { - ref_cpu_cycles_evid = events.size(); - } - if (pev_attr->type == PERF_TYPE_SOFTWARE && pev_attr->config == PERF_COUNT_SW_TASK_CLOCK) { - sw_task_clock_evid = events.size(); - } - if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_CPU_CYCLES) { - hw_cpu_cycles_evid = events.size(); - } - if (pev_attr->type == PERF_TYPE_HARDWARE && pev_attr->config == PERF_COUNT_HW_INSTRUCTIONS) { - hw_instructions_evid = events.size(); - } - //printf("perf_event_open : fd=%d, id=%lu\n", ev.fd, ev.id); - - events.push_back(ev); - } - - bool event_group_enabled = false; - uint32_t num_events_no_pmc; - - void 
enable() { - if (event_group_enabled) - return; - ioctl(group_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); - ioctl(group_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); - // PMC index is only valid when being enabled - num_events_no_pmc = 0; - for(auto& ev : events) { - if (ev.pmc_index == 0 && ev.pmeta->cap_user_rdpmc) { - uint32_t seqlock; - do { - seqlock = ev.pmeta->lock; - std::atomic_thread_fence(std::memory_order_seq_cst); - ev.pmc_index = ev.pmeta->index; - pmc_width = ev.pmeta->pmc_width; - pmc_mask = 1; - pmc_mask = (pmc_mask << pmc_width) - 1; - if (ev.pmeta->cap_user_time) { - tsc_time_shift = ev.pmeta->time_shift; - tsc_time_mult = ev.pmeta->time_mult; - //printf("time: %u,%u\n", tsc_time_shift, tsc_time_mult); - } - std::atomic_thread_fence(std::memory_order_seq_cst); - } while (ev.pmeta->lock != seqlock || (seqlock & 1)); - } - // some events like PERF_TYPE_SOFTWARE cannot read using rdpmc() - if (ev.pmc_index == 0) - num_events_no_pmc ++; - } - event_group_enabled = true; - } - - uint64_t tsc2nano(uint64_t cyc) { - uint64_t quot, rem; - quot = cyc >> tsc_time_shift; - rem = cyc & (((uint64_t)1 << tsc_time_shift) - 1); - return quot * tsc_time_mult + ((rem * tsc_time_mult) >> tsc_time_shift); - } - - void disable() { - if (!event_group_enabled) - return; - - ioctl(group_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); - - for(auto& ev : events) { - ev.pmc_index = 0; - } - event_group_enabled = false; - } - - uint64_t rdpmc(int i, uint64_t base = 0) { - return (_rdpmc(events[i].pmc_index - 1) - base) & pmc_mask; - } - - template - std::vector rdpmc(FN fn, std::string name = {}, int64_t loop_cnt = 0, std::function addinfo = {}) { - int cnt = events.size(); - std::vector pmc(cnt, 0); - - bool use_pmc = (num_events_no_pmc == 0); - if (use_pmc) { - for(int i = 0; i < cnt; i++) { - if (events[i].pmc_index) - pmc[i] = _rdpmc(events[i].pmc_index - 1); - else - pmc[i] = 0; - } - } else { - read(); - for(int i = 0; i < cnt; i++) { - pmc[i] = values[i]; - 
} - } - - auto tsc0 = __rdtsc(); - fn(); - auto tsc1 = __rdtsc(); - - if (use_pmc) { - for(int i = 0; i < cnt; i++) { - if (events[i].pmc_index) - pmc[i] = (_rdpmc(events[i].pmc_index - 1) - pmc[i]) & pmc_mask; - else - pmc[i] = 0; - } - } else { - read(); - for(int i = 0; i < cnt; i++) { - pmc[i] -= values[i]; - } - } - - if (!name.empty()) { - char log_buff[1024]; - char * log = log_buff; - log += sprintf(log, "\e[33m"); - for(int i = 0; i < cnt; i++) { - log += sprintf(log, events[i].format, pmc[i]); - } - auto duration_ns = tsc2nano(tsc1 - tsc0); - - log += sprintf(log, "\e[0m [%16s] %.3f us", name.c_str(), duration_ns/1e3); - if (hw_cpu_cycles_evid >= 0) { - log += sprintf(log, " CPU:%.2f(GHz)", 1.0 * pmc[hw_cpu_cycles_evid] / duration_ns); - if (hw_instructions_evid >= 0) { - log += sprintf(log, " CPI:%.2f", 1.0 * pmc[hw_cpu_cycles_evid] / pmc[hw_instructions_evid]); - } - if (loop_cnt > 0) { - // cycles per kernel (or per-iteration) - log += sprintf(log, " CPK:%.1fx%d", 1.0 * pmc[hw_cpu_cycles_evid] / loop_cnt, loop_cnt); - } - } - if (addinfo) { - addinfo(duration_ns, &pmc[0], log); - } - log += sprintf(log, "\n"); - printf(log_buff); - } - return pmc; - } - - void read(bool verbose = false) { - for(size_t i = 0; i < events.size(); i++) values[i] = 0; - - if (::read(group_fd, read_buf, sizeof(read_buf)) == -1) { - perror(LINUX_PERF_"read perf event failed:"); - abort(); - } - - uint64_t * readv = read_buf; - auto nr = *readv++; - if (verbose) printf("number of counters:\t%lu\n", nr); - time_enabled = 0; - time_running = 0; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - time_enabled = *readv++; - if (verbose) printf("time_enabled:\t%lu\n", time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - time_running = *readv++; - if (verbose) printf("time_running:\t%lu\n", time_running); - } - - for (size_t i = 0; i < nr; i++) { - auto value = *readv++; - auto id = *readv++; - for (size_t k = 0; k < events.size(); k++) { - if (id == 
events[k].id) { - values[k] = value; - } - } - } - - if (verbose) { - for (size_t k = 0; k < events.size(); k++) { - printf("\t[%lu]: %lu\n", k, values[k]); - } - } - } - - //================================================================================ - // profiler API with json_dump capability - struct ProfileScope { - PerfEventGroup* pevg = nullptr; - ProfileData* pd = nullptr; - bool do_unlock = false; - ProfileScope() = default; - ProfileScope(PerfEventGroup* pevg, ProfileData* pd, bool do_unlock = false) : pevg(pevg), pd(pd), do_unlock(do_unlock) {} - - // Move only - ProfileScope(const ProfileScope&) = delete; - ProfileScope& operator=(const ProfileScope&) = delete; - - ProfileScope(ProfileScope&& other) { - pevg = other.pevg; - pd = other.pd; - other.pevg = nullptr; - other.pd = nullptr; - } - - ProfileScope& operator=(ProfileScope&& other) { - if (&other != this) { - pevg = other.pevg; - pd = other.pd; - other.pevg = nullptr; - other.pd = nullptr; - } - - return *this; - } - - uint64_t* finish() { - if (do_unlock) { - PerfEventGroup::get_sampling_lock() --; - } - if (!pevg || !pd) - return nullptr; - - pd->stop(); - bool use_pmc = (pevg->num_events_no_pmc == 0); - if (use_pmc) { - for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++) - if (pevg->events[i].pmc_index) - pd->data[i] = (_rdpmc(pevg->events[i].pmc_index - 1) - pd->data[i]) & pevg->pmc_mask; - else - pd->data[i] = 0; - } else { - pevg->read(); - for (size_t i =0; i < pevg->events.size() && i < pd->data_size; i++) - pd->data[i] = pevg->values[i] - pd->data[i]; - } - pevg = nullptr; - return pd->data; - } - - ~ProfileScope() { - finish(); - } - }; - - ProfileData* _profile(const std::string& title, int id = 0) { - if (get_sampling_lock().load() != 0) - return nullptr; - if (dump_limit == 0) - return nullptr; - dump_limit --; - - PerfEventCtxSwitch::get().updateRingBuffer(); - - all_dump_data.emplace_back(title); - auto* pd = &all_dump_data.back(); - pd->cat = "enable"; - pd->id = 
id; - - // use rdpmc if possible - bool use_pmc = (num_events_no_pmc == 0); - if (use_pmc) { - for (size_t i =0; i < events.size() && i < pd->data_size; i++) - if (events[i].pmc_index) - pd->data[i] = _rdpmc(events[i].pmc_index - 1); - } else { - read(); - for (size_t i =0; i < events.size() && i < pd->data_size; i++) - pd->data[i] = values[i]; - } - - return pd; - } - - static PerfEventGroup& get() { - thread_local PerfEventGroup pevg({ - {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "HW_CPU_CYCLES"}, - {PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "HW_INSTRUCTIONS"}, - {PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES, "HW_CACHE_MISSES"}, - //{PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES, "HW_REF_CPU_CYCLES"}, - {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_CONTEXT_SWITCHES, "SW_CONTEXT_SWITCHES"}, - {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_TASK_CLOCK, "SW_TASK_CLOCK"}, - {PERF_TYPE_SOFTWARE, PERF_COUNT_SW_PAGE_FAULTS, "SW_PAGE_FAULTS"}, - - // XSNP_NONE : ... were hits in L3 without snoops required (data is not owned by any other core's local cache) - // XSNP_FWD /XSNP_HITM : ... were HitM responses from shared L3 (data was exclusivly/dirty owned by another core's local cache) - // XSNP_NO_FWD/XSNP_HIT : ... were L3 and cross-core snoop hits in on-pkg core cache (data was shared/clean in another core's local cache) - - {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x01, 0x00), "XSNP_MISS"}, - {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x02, 0x00), "XSNP_NO_FWD"}, - {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x04, 0x00), "XSNP_FWD"}, - {PERF_TYPE_RAW, X86_RAW_EVENT(0xd2, 0x08, 0x00), "XSNP_NONE"}, - }); - return pevg; - } - - // this lock is global, affect all threads - static std::atomic_int& get_sampling_lock() { - static std::atomic_int sampling_lock{0}; - return sampling_lock; - } -}; - -using ProfileScope = PerfEventGroup::ProfileScope; - -#if 0 -// pwe-thread event group with default events pre-selected -template -ProfileScope Profile(const std::string& title, int id = 0, Args&&... 
args) { - auto& pevg = PerfEventGroup::get(); - auto* pd = pevg._profile(title, id); - if (pd) { - pd->set_extra_data(std::forward(args)...); - } - return {&pevg, pd}; -} - -// overload accept sampling_probability, which can be used to disable profile in scope -template -ProfileScope Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... args) { - auto& pevg = PerfEventGroup::get(); - auto* pd = pevg._profile(title, id); - if (pd) { - pd->set_extra_data(std::forward(args)...); - } - - bool disable_profile = ((std::rand() % 1000)*0.001f >= sampling_probability); - if (disable_profile) { - PerfEventGroup::get_sampling_lock() ++; - } - return {&pevg, pd, disable_profile}; -} - -inline int Init() { - // this is for capture all context switching events - PerfEventCtxSwitch::get(); - - // this is for making main threads the first process - auto dummy = Profile("start"); - return 0; -} - -#else - -template -int Profile(const std::string& title, int id = 0, Args&&... args) { - return 0; -} - -// overload accept sampling_probability, which can be used to disable profile in scope -template -int Profile(float sampling_probability, const std::string& title, int id = 0, Args&&... 
args) { - return 0; -} - -inline int Init() { - return 0; -} - -#endif - -} // namespace LinuxPerf diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index 45ea04b8fc753a..e52c2494d82c86 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -80,16 +80,16 @@ StatefulSDPAFusion::StatefulSDPAFusion() { {{"mode", "numpy"}}); auto multiply_kv = makePattern({reshape_kv | unsqueeze_kv, constant_bcst | computed_bcst}); - auto computed_bcst3 = - makePattern({unsqueeze_kv, any_input()}, - {{"mode", "bidirectional"}}); + auto computed_bcst3 = makePattern({unsqueeze_kv, any_input()}, {{"mode", "bidirectional"}}); auto result = makePattern({multiply_kv | computed_bcst3, any_input()}); return std::make_tuple(result, reshape_kv, unsqueeze_kv, computed_bcst, multiply_kv, computed_bcst3); }; - std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) = multi_query_bcst(concat_k); - std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) = multi_query_bcst(concat_v); + std::tie(mq_reshape_k, reshape_k, unsqueeze_k, computed_bcst_k, multiply_k, computed_bcst3_k) = + multi_query_bcst(concat_k); + std::tie(mq_reshape_v, reshape_v, unsqueeze_v, computed_bcst_v, multiply_v, computed_bcst3_v) = + multi_query_bcst(concat_v); auto present_k = concat_k | mq_reshape_k; auto present_v = concat_v | mq_reshape_v; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 21413addc3187a..3ebfc6f8e4658b 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -431,11 +431,7 @@ void Transformations::PreLpt(const std::vector& defaultPrecis ov::pass::KeepConstAndDecompression); CPU_REGISTER_PASS_COMMON(manager, ov::pass::AUGRUCellFusion); - auto p = std::getenv("USE_OLD"); - bool use_old = p && p[0] == '1'; - if (!use_old) { - CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); - } + CPU_REGISTER_PASS_COMMON(manager, SDPASubgraphFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::CommonOptimizations); CPU_REGISTER_PASS_X64(manager, ov::pass::KeepConstsPrecision, decompression_precisions, false, true); CPU_SET_CALLBACK_X64( @@ -659,18 +655,6 @@ void Transformations::PreLpt(const std::vector& defaultPrecis CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertNMS9ToNMSIEInternal); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMulticlassNmsToMulticlassNmsIE); CPU_SET_CALLBACK_COMMON(manager, nmsCallback, ov::pass::ConvertMatrixNmsToMatrixNmsIE); - if (use_old) { - CPU_SET_CALLBACK_COMMON( - manager, - [this](const_node_ptr& node) -> bool { - std::string errorMsg; - // Current SDPA impl is optimized only for LLM models, so we decompose it for others to avoid perf - // regression. Matching the pattern is a little complicated, so we just check if there is any state nodes. 
- return node::ScaledDotProductAttention::isSupportedOperation(node, errorMsg) && - model->get_variables().size() > 0; - }, - ov::pass::ScaledDotProductAttentionDecomposition); - } // List of enabled/disabled transformations @@ -952,13 +936,6 @@ void Transformations::PostLpt() { } #endif // OPENVINO_ARCH_X86_64 - auto p = std::getenv("USE_OLD"); - bool use_old = p && p[0] == '1'; - if (use_old) { - CPU_REGISTER_PASS_COMMON(postLPTPassManager, ov::pass::transpose_sinking::TSShapeOfForward); - CPU_REGISTER_PASS_COMMON(postLPTPassManager, StatefulSDPAFusion); - CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::SDPAFuseTransposeReshape); - } CPU_REGISTER_PASS_X64(postLPTPassManager, ov::pass::RMSFusion, false); CPU_REGISTER_PASS_X64(postLPTPassManager, ov::intel_cpu::DecomposeRMSNorm); CPU_SET_CALLBACK_X64( @@ -981,20 +958,6 @@ void Transformations::PostLpt() { symbolic_pipeline->get_manager()->register_pass(); postLPTPassManager.run_passes(model); - p = std::getenv("CHECK_SDPA"); - bool check_sdpa = p && p[0] == '1'; - if (check_sdpa) { - size_t count = 0; - for (auto&& node : model->get_ordered_ops()) { - if (node->get_type_name() == std::string("ScaledDotProductAttentionWithKVCache")) { - count++; - } - } - // char buf[128] = {0}; - // sprintf(buf, "KVCACHE=%ld", count); - // std::cout << buf << std::endl; - setenv("KVCACHE", std::to_string(count).c_str(), true); - } } void Transformations::MainSnippets(void) { From 6b2e86bae29a59a60fc35260e9da3c3a72c98eee Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Tue, 24 Dec 2024 19:56:36 +0800 Subject: [PATCH 07/13] use least amount of transformations --- .../cpu_opset/common/pass/stateful_sdpa_fusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index e52c2494d82c86..546f0ba397f54d 100644 --- 
a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -306,7 +306,7 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion"); manager.set_per_pass_validation(false); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyShapeOfSubGraph, true); + CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); From 657d6add701ce73ee520c95287f4c6a989dc3969 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Wed, 25 Dec 2024 17:04:38 +0800 Subject: [PATCH 08/13] fix ci error --- .../common/pass/stateful_sdpa_fusion.cpp | 4 +- .../x64/fuse_reshape_transpose_to_sdpa.cpp | 414 +++++++++--------- 2 files changed, 208 insertions(+), 210 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index 546f0ba397f54d..f3b1d926a3dbc7 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -302,9 +302,7 @@ StatefulSDPAFusion::StatefulSDPAFusion() { bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { RUN_ON_FUNCTION_SCOPE(SDPASubgraphFusion); - using namespace ov::pass::pattern; - ov::pass::Manager manager(get_pass_config(), "SDPASubgraphFusion"); - manager.set_per_pass_validation(false); + ov::pass::Manager manager("SDPASubgraphFusion"); CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, 
ov::pass::transpose_sinking::TSShapeOfForward); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp index a75156c0f69fcb..0da3732c295b5c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp @@ -34,212 +34,212 @@ namespace test { */ // -using InputShapeAndReshapeOrder = std::pair, std::vector>; -using FuseSDPAReshapeTransposeTestParams = std::tuple; -class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, - public testing::WithParamInterface, - public CPUTestsBase { -public: - static std::string getTestCaseName(const testing::TestParamInfo& obj) { - ElementType inType; - InputShapeAndReshapeOrder inputShapeAndOrders; - std::tie(inType, inputShapeAndOrders) = obj.param; - std::ostringstream result; - std::vector& inputShapes = inputShapeAndOrders.first; - auto& reshapeOrderHS = inputShapeAndOrders.second; - result << "IS="; - for (const auto& shape : inputShapes) { - result << ov::test::utils::partialShape2str({shape.first}) << "_"; - } - result << "TS="; - for (const auto& shape : inputShapes) { - result << "("; - if (!shape.second.empty()) { - for (const auto& itr : shape.second) { - result << ov::test::utils::vec2str(itr); - } - } - result << ")_"; - } - result << "Prc=" << inType << "_"; - result << "ReshapeOrderHS="; - result << "("; - for (const auto& itr : reshapeOrderHS) { - result << itr << ","; - } - result << ")"; - - return result.str(); - } - - void SetUp() override { - ElementType inType; - InputShapeAndReshapeOrder inputShapeAndOrders; - std::tie(inType, inputShapeAndOrders) = this->GetParam(); - std::vector& inputShapes = inputShapeAndOrders.first; - auto& reshapeOrderHS = 
inputShapeAndOrders.second; - targetDevice = ov::test::utils::DEVICE_CPU; - rel_threshold = 1e-2f; - configuration[ov::hint::inference_precision.name()] = ov::element::f32; - if (inType == ElementType::bf16) { - configuration[ov::hint::inference_precision.name()] = ov::element::bf16; - rel_threshold = 0.01f; - } - init_input_shapes(inputShapes); - - // pre SDPA reshape->transpose - ov::ParameterVector inputParams(3); - ov::SinkVector sinkNodes; - OutputVector transposes(3); - for (size_t i = 0; i < 3u; i++) { - inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); - - auto reshape_axis = - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); - - std::shared_ptr reshape_input_1 = inputParams[i]; - if (i > 0) { - auto var = std::make_shared( - ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); - auto readvalue = std::make_shared(inputParams[i], var); - auto assign = std::make_shared(readvalue, var); - sinkNodes.emplace_back(assign); - reshape_input_1 = readvalue; - } - - auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); - auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); - transposes[i] = std::make_shared(reshape, transposeOrder); - } - - auto sdpa = std::make_shared(transposes, false); - sdpa->set_friendly_name("mha"); - - // post SDPA transpose + reshape - auto postOrder = - ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS - auto transposeSDPA = std::make_shared(sdpa, postOrder); - - auto constReshape = - ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); - auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS - - function = std::make_shared(ov::OutputVector{reshapeSDPA}, - sinkNodes, - inputParams, - "FuseSDPAReshapeTranspose"); - targetDevice = ov::test::utils::DEVICE_CPU; - 
functionRefs = function->clone(); - pass::Manager manager; - // decompose ScaledDotProductAttention - manager.register_pass(); - manager.run_passes(functionRefs); - } - - template - static void strided_iota(IT first, size_t n, T value, T stride) { - for (size_t i = 0; i < n; i++) { - *first++ = value; - value += stride; - } - } - void generate(int idx, const std::vector& targetInputStaticShapes) { - inputs.clear(); - auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { - if (param->get_element_type() == ov::element::i32) { - ov::Tensor t{ov::element::i32, shape}; - auto size = ov::shape_size(shape); - auto* p = static_cast(t.data()); - auto start = static_cast(val); - for (size_t i = 0; i < size; i++) { - p[i] = (start + i) % size; - } - inputs.insert({param, t}); - } else if (param->get_element_type() == ov::element::f32) { - ov::Tensor t{ov::element::f32, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); - inputs.insert({param, t}); - } else { - ASSERT_TRUE(param->get_element_type() == ov::element::bf16); - ov::Tensor t{ov::element::bf16, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); - inputs.insert({param, t}); - } - }; - // q, k, v - create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); - create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); - create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); - } - void prepare() { - compile_model(); - inferRequest = compiledModel.create_infer_request(); - ASSERT_TRUE(inferRequest); - } - void reset() { - for (auto&& state : inferRequest.query_state()) { - state.reset(); - } - } - - std::vector run_test(std::shared_ptr model) { - function = model; - prepare(); - std::vector outputs; - int idx = 0; - for (auto&& shapes : targetStaticShapes) { - generate(idx++, shapes); - for (const auto& input : inputs) { - inferRequest.set_tensor(input.first, input.second); 
- } - inferRequest.infer(); - auto outputTensor = inferRequest.get_output_tensor(0); - ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; - outputTensor.copy_to(copy); - outputs.push_back(copy); - reset(); - } - return outputs; - } -}; - -TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED(); - bool reshape_transpose_fused = false; - auto actualOutputs = run_test(function); - CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); - CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); - CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); - for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { - if (n->get_friendly_name() == "mha/fused_reshape_transpose") { - reshape_transpose_fused = true; - } - } - ASSERT_TRUE(reshape_transpose_fused); - - auto expectedOutputs = run_test(functionRefs); - for (size_t i = 0; i < actualOutputs.size(); i++) { - ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); - } -} - -namespace { -const std::vector inputShapeAndReshapeOrders = { - // - { - {{ - // Q,K,V:[B, L, H*S] - {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, - }, - // reshapeOrderHS - {4, 16}}, - }}; - -INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, - FuseSDPAReshapeTransposeTest, - ::testing::Combine(::testing::Values(ElementType::f32), - ::testing::ValuesIn(inputShapeAndReshapeOrders)), - FuseSDPAReshapeTransposeTest::getTestCaseName); -} // namespace +// using InputShapeAndReshapeOrder = std::pair, std::vector>; +// using FuseSDPAReshapeTransposeTestParams = std::tuple; +// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, +// public testing::WithParamInterface, +// public CPUTestsBase { +// public: +// static std::string getTestCaseName(const testing::TestParamInfo& obj) { +// ElementType inType; +// InputShapeAndReshapeOrder inputShapeAndOrders; +// 
std::tie(inType, inputShapeAndOrders) = obj.param; +// std::ostringstream result; +// std::vector& inputShapes = inputShapeAndOrders.first; +// auto& reshapeOrderHS = inputShapeAndOrders.second; +// result << "IS="; +// for (const auto& shape : inputShapes) { +// result << ov::test::utils::partialShape2str({shape.first}) << "_"; +// } +// result << "TS="; +// for (const auto& shape : inputShapes) { +// result << "("; +// if (!shape.second.empty()) { +// for (const auto& itr : shape.second) { +// result << ov::test::utils::vec2str(itr); +// } +// } +// result << ")_"; +// } +// result << "Prc=" << inType << "_"; +// result << "ReshapeOrderHS="; +// result << "("; +// for (const auto& itr : reshapeOrderHS) { +// result << itr << ","; +// } +// result << ")"; + +// return result.str(); +// } + +// void SetUp() override { +// ElementType inType; +// InputShapeAndReshapeOrder inputShapeAndOrders; +// std::tie(inType, inputShapeAndOrders) = this->GetParam(); +// std::vector& inputShapes = inputShapeAndOrders.first; +// auto& reshapeOrderHS = inputShapeAndOrders.second; +// targetDevice = ov::test::utils::DEVICE_CPU; +// rel_threshold = 1e-2f; +// configuration[ov::hint::inference_precision.name()] = ov::element::f32; +// if (inType == ElementType::bf16) { +// configuration[ov::hint::inference_precision.name()] = ov::element::bf16; +// rel_threshold = 0.01f; +// } +// init_input_shapes(inputShapes); + +// // pre SDPA reshape->transpose +// ov::ParameterVector inputParams(3); +// ov::SinkVector sinkNodes; +// OutputVector transposes(3); +// for (size_t i = 0; i < 3u; i++) { +// inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); + +// auto reshape_axis = +// ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); + +// std::shared_ptr reshape_input_1 = inputParams[i]; +// if (i > 0) { +// auto var = std::make_shared( +// ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); +// auto 
readvalue = std::make_shared(inputParams[i], var); +// auto assign = std::make_shared(readvalue, var); +// sinkNodes.emplace_back(assign); +// reshape_input_1 = readvalue; +// } + +// auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); +// auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); +// transposes[i] = std::make_shared(reshape, transposeOrder); +// } + +// auto sdpa = std::make_shared(transposes, false); +// sdpa->set_friendly_name("mha"); + +// // post SDPA transpose + reshape +// auto postOrder = +// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS +// auto transposeSDPA = std::make_shared(sdpa, postOrder); + +// auto constReshape = +// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); +// auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS + +// function = std::make_shared(ov::OutputVector{reshapeSDPA}, +// sinkNodes, +// inputParams, +// "FuseSDPAReshapeTranspose"); +// targetDevice = ov::test::utils::DEVICE_CPU; +// functionRefs = function->clone(); +// pass::Manager manager; +// // decompose ScaledDotProductAttention +// manager.register_pass(); +// manager.run_passes(functionRefs); +// } + +// template +// static void strided_iota(IT first, size_t n, T value, T stride) { +// for (size_t i = 0; i < n; i++) { +// *first++ = value; +// value += stride; +// } +// } +// void generate(int idx, const std::vector& targetInputStaticShapes) { +// inputs.clear(); +// auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { +// if (param->get_element_type() == ov::element::i32) { +// ov::Tensor t{ov::element::i32, shape}; +// auto size = ov::shape_size(shape); +// auto* p = static_cast(t.data()); +// auto start = static_cast(val); +// for (size_t i = 0; i < size; i++) { +// p[i] = (start + i) % size; +// } +// inputs.insert({param, t}); +// } else if 
(param->get_element_type() == ov::element::f32) { +// ov::Tensor t{ov::element::f32, shape}; +// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); +// inputs.insert({param, t}); +// } else { +// ASSERT_TRUE(param->get_element_type() == ov::element::bf16); +// ov::Tensor t{ov::element::bf16, shape}; +// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); +// inputs.insert({param, t}); +// } +// }; +// // q, k, v +// create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); +// create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); +// create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); +// } +// void prepare() { +// compile_model(); +// inferRequest = compiledModel.create_infer_request(); +// ASSERT_TRUE(inferRequest); +// } +// void reset() { +// for (auto&& state : inferRequest.query_state()) { +// state.reset(); +// } +// } + +// std::vector run_test(std::shared_ptr model) { +// function = model; +// prepare(); +// std::vector outputs; +// int idx = 0; +// for (auto&& shapes : targetStaticShapes) { +// generate(idx++, shapes); +// for (const auto& input : inputs) { +// inferRequest.set_tensor(input.first, input.second); +// } +// inferRequest.infer(); +// auto outputTensor = inferRequest.get_output_tensor(0); +// ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; +// outputTensor.copy_to(copy); +// outputs.push_back(copy); +// reset(); +// } +// return outputs; +// } +// }; + +// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { +// SKIP_IF_CURRENT_TEST_IS_DISABLED(); +// bool reshape_transpose_fused = false; +// auto actualOutputs = run_test(function); +// CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); +// CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); +// CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); +// for (const auto& n : 
compiledModel.get_runtime_model()->get_ordered_ops()) { +// if (n->get_friendly_name() == "mha/fused_reshape_transpose") { +// reshape_transpose_fused = true; +// } +// } +// ASSERT_TRUE(reshape_transpose_fused); + +// auto expectedOutputs = run_test(functionRefs); +// for (size_t i = 0; i < actualOutputs.size(); i++) { +// ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); +// } +// } + +// namespace { +// const std::vector inputShapeAndReshapeOrders = { +// // +// { +// {{ +// // Q,K,V:[B, L, H*S] +// {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, +// }, +// // reshapeOrderHS +// {4, 16}}, +// }}; + +// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, +// FuseSDPAReshapeTransposeTest, +// ::testing::Combine(::testing::Values(ElementType::f32), +// ::testing::ValuesIn(inputShapeAndReshapeOrders)), +// FuseSDPAReshapeTransposeTest::getTestCaseName); +// } // namespace } // namespace test } // namespace ov From 2c7b9642d98818eb385449e42d17a7b8bffe5e7c Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 27 Dec 2024 05:44:51 +0100 Subject: [PATCH 09/13] add SDPAFuseTransposeReshape back --- .../common/pass/stateful_sdpa_fusion.cpp | 3 +- .../x64/fuse_reshape_transpose_to_sdpa.cpp | 414 +++++++++--------- 2 files changed, 209 insertions(+), 208 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index f3b1d926a3dbc7..fe4a71f44be958 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -23,6 +23,7 @@ #include "ov_ops/type_relaxed.hpp" #include "transformations/common_optimizations/simplify_shape_of_sub_graph.hpp" #include "transformations/cpu_opset/common/op/sdpa.hpp" +#include 
"transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.hpp" #include "transformations/defs.hpp" #include "transformations/op_conversions/convert_broadcast3.hpp" #include "transformations/transpose_sinking/ts_shape_of.hpp" @@ -307,7 +308,7 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); - CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate); + CPU_REGISTER_PASS_X64(manager, SDPAFuseTransposeReshape); manager.run_passes(f); return false; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp index 0da3732c295b5c..a75156c0f69fcb 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp @@ -34,212 +34,212 @@ namespace test { */ // -// using InputShapeAndReshapeOrder = std::pair, std::vector>; -// using FuseSDPAReshapeTransposeTestParams = std::tuple; -// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, -// public testing::WithParamInterface, -// public CPUTestsBase { -// public: -// static std::string getTestCaseName(const testing::TestParamInfo& obj) { -// ElementType inType; -// InputShapeAndReshapeOrder inputShapeAndOrders; -// std::tie(inType, inputShapeAndOrders) = obj.param; -// std::ostringstream result; -// std::vector& inputShapes = inputShapeAndOrders.first; -// auto& reshapeOrderHS = inputShapeAndOrders.second; -// result << "IS="; -// for (const auto& shape : inputShapes) { -// result << ov::test::utils::partialShape2str({shape.first}) << "_"; -// } -// result << 
"TS="; -// for (const auto& shape : inputShapes) { -// result << "("; -// if (!shape.second.empty()) { -// for (const auto& itr : shape.second) { -// result << ov::test::utils::vec2str(itr); -// } -// } -// result << ")_"; -// } -// result << "Prc=" << inType << "_"; -// result << "ReshapeOrderHS="; -// result << "("; -// for (const auto& itr : reshapeOrderHS) { -// result << itr << ","; -// } -// result << ")"; - -// return result.str(); -// } - -// void SetUp() override { -// ElementType inType; -// InputShapeAndReshapeOrder inputShapeAndOrders; -// std::tie(inType, inputShapeAndOrders) = this->GetParam(); -// std::vector& inputShapes = inputShapeAndOrders.first; -// auto& reshapeOrderHS = inputShapeAndOrders.second; -// targetDevice = ov::test::utils::DEVICE_CPU; -// rel_threshold = 1e-2f; -// configuration[ov::hint::inference_precision.name()] = ov::element::f32; -// if (inType == ElementType::bf16) { -// configuration[ov::hint::inference_precision.name()] = ov::element::bf16; -// rel_threshold = 0.01f; -// } -// init_input_shapes(inputShapes); - -// // pre SDPA reshape->transpose -// ov::ParameterVector inputParams(3); -// ov::SinkVector sinkNodes; -// OutputVector transposes(3); -// for (size_t i = 0; i < 3u; i++) { -// inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); - -// auto reshape_axis = -// ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); - -// std::shared_ptr reshape_input_1 = inputParams[i]; -// if (i > 0) { -// auto var = std::make_shared( -// ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); -// auto readvalue = std::make_shared(inputParams[i], var); -// auto assign = std::make_shared(readvalue, var); -// sinkNodes.emplace_back(assign); -// reshape_input_1 = readvalue; -// } - -// auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); -// auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); -// 
transposes[i] = std::make_shared(reshape, transposeOrder); -// } - -// auto sdpa = std::make_shared(transposes, false); -// sdpa->set_friendly_name("mha"); - -// // post SDPA transpose + reshape -// auto postOrder = -// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS -// auto transposeSDPA = std::make_shared(sdpa, postOrder); - -// auto constReshape = -// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); -// auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS - -// function = std::make_shared(ov::OutputVector{reshapeSDPA}, -// sinkNodes, -// inputParams, -// "FuseSDPAReshapeTranspose"); -// targetDevice = ov::test::utils::DEVICE_CPU; -// functionRefs = function->clone(); -// pass::Manager manager; -// // decompose ScaledDotProductAttention -// manager.register_pass(); -// manager.run_passes(functionRefs); -// } - -// template -// static void strided_iota(IT first, size_t n, T value, T stride) { -// for (size_t i = 0; i < n; i++) { -// *first++ = value; -// value += stride; -// } -// } -// void generate(int idx, const std::vector& targetInputStaticShapes) { -// inputs.clear(); -// auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { -// if (param->get_element_type() == ov::element::i32) { -// ov::Tensor t{ov::element::i32, shape}; -// auto size = ov::shape_size(shape); -// auto* p = static_cast(t.data()); -// auto start = static_cast(val); -// for (size_t i = 0; i < size; i++) { -// p[i] = (start + i) % size; -// } -// inputs.insert({param, t}); -// } else if (param->get_element_type() == ov::element::f32) { -// ov::Tensor t{ov::element::f32, shape}; -// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); -// inputs.insert({param, t}); -// } else { -// ASSERT_TRUE(param->get_element_type() == ov::element::bf16); -// ov::Tensor t{ov::element::bf16, shape}; -// strided_iota(static_cast(t.data()), 
t.get_size(), val, 0.1f); -// inputs.insert({param, t}); -// } -// }; -// // q, k, v -// create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); -// create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); -// create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); -// } -// void prepare() { -// compile_model(); -// inferRequest = compiledModel.create_infer_request(); -// ASSERT_TRUE(inferRequest); -// } -// void reset() { -// for (auto&& state : inferRequest.query_state()) { -// state.reset(); -// } -// } - -// std::vector run_test(std::shared_ptr model) { -// function = model; -// prepare(); -// std::vector outputs; -// int idx = 0; -// for (auto&& shapes : targetStaticShapes) { -// generate(idx++, shapes); -// for (const auto& input : inputs) { -// inferRequest.set_tensor(input.first, input.second); -// } -// inferRequest.infer(); -// auto outputTensor = inferRequest.get_output_tensor(0); -// ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; -// outputTensor.copy_to(copy); -// outputs.push_back(copy); -// reset(); -// } -// return outputs; -// } -// }; - -// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { -// SKIP_IF_CURRENT_TEST_IS_DISABLED(); -// bool reshape_transpose_fused = false; -// auto actualOutputs = run_test(function); -// CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); -// CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); -// CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); -// for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { -// if (n->get_friendly_name() == "mha/fused_reshape_transpose") { -// reshape_transpose_fused = true; -// } -// } -// ASSERT_TRUE(reshape_transpose_fused); - -// auto expectedOutputs = run_test(functionRefs); -// for (size_t i = 0; i < actualOutputs.size(); i++) { -// ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], 
abs_threshold, rel_threshold); -// } -// } - -// namespace { -// const std::vector inputShapeAndReshapeOrders = { -// // -// { -// {{ -// // Q,K,V:[B, L, H*S] -// {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, -// }, -// // reshapeOrderHS -// {4, 16}}, -// }}; - -// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, -// FuseSDPAReshapeTransposeTest, -// ::testing::Combine(::testing::Values(ElementType::f32), -// ::testing::ValuesIn(inputShapeAndReshapeOrders)), -// FuseSDPAReshapeTransposeTest::getTestCaseName); -// } // namespace +using InputShapeAndReshapeOrder = std::pair, std::vector>; +using FuseSDPAReshapeTransposeTestParams = std::tuple; +class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, + public testing::WithParamInterface, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ElementType inType; + InputShapeAndReshapeOrder inputShapeAndOrders; + std::tie(inType, inputShapeAndOrders) = obj.param; + std::ostringstream result; + std::vector& inputShapes = inputShapeAndOrders.first; + auto& reshapeOrderHS = inputShapeAndOrders.second; + result << "IS="; + for (const auto& shape : inputShapes) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + } + result << "TS="; + for (const auto& shape : inputShapes) { + result << "("; + if (!shape.second.empty()) { + for (const auto& itr : shape.second) { + result << ov::test::utils::vec2str(itr); + } + } + result << ")_"; + } + result << "Prc=" << inType << "_"; + result << "ReshapeOrderHS="; + result << "("; + for (const auto& itr : reshapeOrderHS) { + result << itr << ","; + } + result << ")"; + + return result.str(); + } + + void SetUp() override { + ElementType inType; + InputShapeAndReshapeOrder inputShapeAndOrders; + std::tie(inType, inputShapeAndOrders) = this->GetParam(); + std::vector& inputShapes = inputShapeAndOrders.first; + auto& reshapeOrderHS = 
inputShapeAndOrders.second; + targetDevice = ov::test::utils::DEVICE_CPU; + rel_threshold = 1e-2f; + configuration[ov::hint::inference_precision.name()] = ov::element::f32; + if (inType == ElementType::bf16) { + configuration[ov::hint::inference_precision.name()] = ov::element::bf16; + rel_threshold = 0.01f; + } + init_input_shapes(inputShapes); + + // pre SDPA reshape->transpose + ov::ParameterVector inputParams(3); + ov::SinkVector sinkNodes; + OutputVector transposes(3); + for (size_t i = 0; i < 3u; i++) { + inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); + + auto reshape_axis = + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); + + std::shared_ptr reshape_input_1 = inputParams[i]; + if (i > 0) { + auto var = std::make_shared( + ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); + auto readvalue = std::make_shared(inputParams[i], var); + auto assign = std::make_shared(readvalue, var); + sinkNodes.emplace_back(assign); + reshape_input_1 = readvalue; + } + + auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); + auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); + transposes[i] = std::make_shared(reshape, transposeOrder); + } + + auto sdpa = std::make_shared(transposes, false); + sdpa->set_friendly_name("mha"); + + // post SDPA transpose + reshape + auto postOrder = + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS + auto transposeSDPA = std::make_shared(sdpa, postOrder); + + auto constReshape = + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); + auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS + + function = std::make_shared(ov::OutputVector{reshapeSDPA}, + sinkNodes, + inputParams, + "FuseSDPAReshapeTranspose"); + targetDevice = ov::test::utils::DEVICE_CPU; + 
functionRefs = function->clone(); + pass::Manager manager; + // decompose ScaledDotProductAttention + manager.register_pass(); + manager.run_passes(functionRefs); + } + + template + static void strided_iota(IT first, size_t n, T value, T stride) { + for (size_t i = 0; i < n; i++) { + *first++ = value; + value += stride; + } + } + void generate(int idx, const std::vector& targetInputStaticShapes) { + inputs.clear(); + auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { + if (param->get_element_type() == ov::element::i32) { + ov::Tensor t{ov::element::i32, shape}; + auto size = ov::shape_size(shape); + auto* p = static_cast(t.data()); + auto start = static_cast(val); + for (size_t i = 0; i < size; i++) { + p[i] = (start + i) % size; + } + inputs.insert({param, t}); + } else if (param->get_element_type() == ov::element::f32) { + ov::Tensor t{ov::element::f32, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); + } else { + ASSERT_TRUE(param->get_element_type() == ov::element::bf16); + ov::Tensor t{ov::element::bf16, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); + } + }; + // q, k, v + create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); + create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); + create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); + } + void prepare() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + } + void reset() { + for (auto&& state : inferRequest.query_state()) { + state.reset(); + } + } + + std::vector run_test(std::shared_ptr model) { + function = model; + prepare(); + std::vector outputs; + int idx = 0; + for (auto&& shapes : targetStaticShapes) { + generate(idx++, shapes); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); 
+ } + inferRequest.infer(); + auto outputTensor = inferRequest.get_output_tensor(0); + ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; + outputTensor.copy_to(copy); + outputs.push_back(copy); + reset(); + } + return outputs; + } +}; + +TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + bool reshape_transpose_fused = false; + auto actualOutputs = run_test(function); + CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); + CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); + CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); + for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { + if (n->get_friendly_name() == "mha/fused_reshape_transpose") { + reshape_transpose_fused = true; + } + } + ASSERT_TRUE(reshape_transpose_fused); + + auto expectedOutputs = run_test(functionRefs); + for (size_t i = 0; i < actualOutputs.size(); i++) { + ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); + } +} + +namespace { +const std::vector inputShapeAndReshapeOrders = { + // + { + {{ + // Q,K,V:[B, L, H*S] + {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, + }, + // reshapeOrderHS + {4, 16}}, + }}; + +INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, + FuseSDPAReshapeTransposeTest, + ::testing::Combine(::testing::Values(ElementType::f32), + ::testing::ValuesIn(inputShapeAndReshapeOrders)), + FuseSDPAReshapeTransposeTest::getTestCaseName); +} // namespace } // namespace test } // namespace ov From e2857157c1beb84cbcbd3a20fb4432f8345ec4e1 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 27 Dec 2024 08:20:34 +0100 Subject: [PATCH 10/13] modify test to cover the change --- .../src/common/concat_multiple_query_sdp.cpp | 6 +++--- .../src/common/concat_transpose_sdp_transpose.cpp | 14 +++++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git 
a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp index d74ab99fb3d5ab..fe5ba2b7eac5e7 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_multiple_query_sdp.cpp @@ -152,9 +152,9 @@ class ConcatMultiQuerySDPTest : public testing::WithParamInterface(concatK, unsquezeAxis); auto unsqueezeV = std::make_shared(concatV, unsquezeAxis); - auto targetShape = ov::op::v0::Constant::create(qkvType, {1, 1, 1, 4, 1}, {1}); - auto broadcastK = std::make_shared(unsqueezeK, targetShape); - auto broadcastV = std::make_shared(unsqueezeV, targetShape); + auto targetShape = ov::op::v0::Constant::create(element::i32, {5}, {1, 1, 1, 4, 1}); + auto broadcastK = std::make_shared(unsqueezeK, targetShape, op::BroadcastType::BIDIRECTIONAL); + auto broadcastV = std::make_shared(unsqueezeV, targetShape, op::BroadcastType::BIDIRECTIONAL); auto target4D = ov::op::v0::Constant::create(ov::element::i32, {4}, {0, 0, 8, 64}); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp index f4166544af2bf2..8ba978e32c4b9c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/concat_transpose_sdp_transpose.cpp @@ -71,7 +71,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterfaceGetParam(); std::vector& inputShapes = inputShapeAndOrders.first; transposeOrder = inputShapeAndOrders.second; @@ -124,6 +123,10 @@ class ConcatSDPTransposeTestBase : public 
testing::WithParamInterface(inputParams[0], preOrder); + std::shared_ptr transposeQ_shapeof; + if (hasShapeOf) { + transposeQ_shapeof = std::make_shared(transposeQ); + } auto concat_axis = transposeOrder[2]; auto beam_idx = std::make_shared(ElementType::i32, ov::PartialShape{-1}); @@ -166,6 +169,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface(results, sinks, inputParams, "ConcatTranposeSDP"); @@ -237,6 +241,7 @@ class ConcatSDPTransposeTestBase : public testing::WithParamInterface transposeOrder; + bool hasShapeOf; }; class ConcatSDPTransposeTest : public ConcatSDPTransposeTestBase { @@ -287,7 +292,10 @@ TEST_P(ConcatSDPTransposeTest, CompareWithRefs) { CheckNumberOfNodesWithType(compiledModel, "Concatenation", 0); CheckNumberOfNodesWithType(compiledModel, "Reorder", 0); CheckNumberOfNodesWithType(compiledModel, "Transpose", 1); - CheckNumberOfNodesWithType(compiledModel, "Gather", 0); + // Transformation TSShapeOfForward will change: + // ?->transpose->shapeof ==> ?-->shapeof->gather + // |->transpose + CheckNumberOfNodesWithType(compiledModel, "Gather", hasShapeOf ? 
1 : 0); auto expectedOutputs = run_test(functionRefs); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 0); for (size_t i = 0; i < actualOutputs.size(); i++) { From 02c2d1938704d956275c1c1d2fad6cc7726c688a Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Mon, 30 Dec 2024 11:57:01 +0100 Subject: [PATCH 11/13] disable SDPAFuseTransposeReshape --- .../common/pass/stateful_sdpa_fusion.cpp | 1 - .../x64/fuse_reshape_transpose_to_sdpa.cpp | 414 +++++++++--------- 2 files changed, 207 insertions(+), 208 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index fe4a71f44be958..e930abf1102a8c 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -308,7 +308,6 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); - CPU_REGISTER_PASS_X64(manager, SDPAFuseTransposeReshape); manager.run_passes(f); return false; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp index a75156c0f69fcb..0da3732c295b5c 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp @@ -34,212 +34,212 @@ namespace test { */ // -using InputShapeAndReshapeOrder = std::pair, std::vector>; -using FuseSDPAReshapeTransposeTestParams = std::tuple; 
-class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, - public testing::WithParamInterface, - public CPUTestsBase { -public: - static std::string getTestCaseName(const testing::TestParamInfo& obj) { - ElementType inType; - InputShapeAndReshapeOrder inputShapeAndOrders; - std::tie(inType, inputShapeAndOrders) = obj.param; - std::ostringstream result; - std::vector& inputShapes = inputShapeAndOrders.first; - auto& reshapeOrderHS = inputShapeAndOrders.second; - result << "IS="; - for (const auto& shape : inputShapes) { - result << ov::test::utils::partialShape2str({shape.first}) << "_"; - } - result << "TS="; - for (const auto& shape : inputShapes) { - result << "("; - if (!shape.second.empty()) { - for (const auto& itr : shape.second) { - result << ov::test::utils::vec2str(itr); - } - } - result << ")_"; - } - result << "Prc=" << inType << "_"; - result << "ReshapeOrderHS="; - result << "("; - for (const auto& itr : reshapeOrderHS) { - result << itr << ","; - } - result << ")"; - - return result.str(); - } - - void SetUp() override { - ElementType inType; - InputShapeAndReshapeOrder inputShapeAndOrders; - std::tie(inType, inputShapeAndOrders) = this->GetParam(); - std::vector& inputShapes = inputShapeAndOrders.first; - auto& reshapeOrderHS = inputShapeAndOrders.second; - targetDevice = ov::test::utils::DEVICE_CPU; - rel_threshold = 1e-2f; - configuration[ov::hint::inference_precision.name()] = ov::element::f32; - if (inType == ElementType::bf16) { - configuration[ov::hint::inference_precision.name()] = ov::element::bf16; - rel_threshold = 0.01f; - } - init_input_shapes(inputShapes); - - // pre SDPA reshape->transpose - ov::ParameterVector inputParams(3); - ov::SinkVector sinkNodes; - OutputVector transposes(3); - for (size_t i = 0; i < 3u; i++) { - inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); - - auto reshape_axis = - ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); - - 
std::shared_ptr reshape_input_1 = inputParams[i]; - if (i > 0) { - auto var = std::make_shared( - ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); - auto readvalue = std::make_shared(inputParams[i], var); - auto assign = std::make_shared(readvalue, var); - sinkNodes.emplace_back(assign); - reshape_input_1 = readvalue; - } - - auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); - auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); - transposes[i] = std::make_shared(reshape, transposeOrder); - } - - auto sdpa = std::make_shared(transposes, false); - sdpa->set_friendly_name("mha"); - - // post SDPA transpose + reshape - auto postOrder = - ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS - auto transposeSDPA = std::make_shared(sdpa, postOrder); - - auto constReshape = - ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); - auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS - - function = std::make_shared(ov::OutputVector{reshapeSDPA}, - sinkNodes, - inputParams, - "FuseSDPAReshapeTranspose"); - targetDevice = ov::test::utils::DEVICE_CPU; - functionRefs = function->clone(); - pass::Manager manager; - // decompose ScaledDotProductAttention - manager.register_pass(); - manager.run_passes(functionRefs); - } - - template - static void strided_iota(IT first, size_t n, T value, T stride) { - for (size_t i = 0; i < n; i++) { - *first++ = value; - value += stride; - } - } - void generate(int idx, const std::vector& targetInputStaticShapes) { - inputs.clear(); - auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { - if (param->get_element_type() == ov::element::i32) { - ov::Tensor t{ov::element::i32, shape}; - auto size = ov::shape_size(shape); - auto* p = static_cast(t.data()); - auto start = static_cast(val); - for (size_t i = 
0; i < size; i++) { - p[i] = (start + i) % size; - } - inputs.insert({param, t}); - } else if (param->get_element_type() == ov::element::f32) { - ov::Tensor t{ov::element::f32, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); - inputs.insert({param, t}); - } else { - ASSERT_TRUE(param->get_element_type() == ov::element::bf16); - ov::Tensor t{ov::element::bf16, shape}; - strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); - inputs.insert({param, t}); - } - }; - // q, k, v - create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); - create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); - create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); - } - void prepare() { - compile_model(); - inferRequest = compiledModel.create_infer_request(); - ASSERT_TRUE(inferRequest); - } - void reset() { - for (auto&& state : inferRequest.query_state()) { - state.reset(); - } - } - - std::vector run_test(std::shared_ptr model) { - function = model; - prepare(); - std::vector outputs; - int idx = 0; - for (auto&& shapes : targetStaticShapes) { - generate(idx++, shapes); - for (const auto& input : inputs) { - inferRequest.set_tensor(input.first, input.second); - } - inferRequest.infer(); - auto outputTensor = inferRequest.get_output_tensor(0); - ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; - outputTensor.copy_to(copy); - outputs.push_back(copy); - reset(); - } - return outputs; - } -}; - -TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { - SKIP_IF_CURRENT_TEST_IS_DISABLED(); - bool reshape_transpose_fused = false; - auto actualOutputs = run_test(function); - CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); - CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); - CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); - for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { - if 
(n->get_friendly_name() == "mha/fused_reshape_transpose") { - reshape_transpose_fused = true; - } - } - ASSERT_TRUE(reshape_transpose_fused); - - auto expectedOutputs = run_test(functionRefs); - for (size_t i = 0; i < actualOutputs.size(); i++) { - ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); - } -} - -namespace { -const std::vector inputShapeAndReshapeOrders = { - // - { - {{ - // Q,K,V:[B, L, H*S] - {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, - }, - // reshapeOrderHS - {4, 16}}, - }}; - -INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, - FuseSDPAReshapeTransposeTest, - ::testing::Combine(::testing::Values(ElementType::f32), - ::testing::ValuesIn(inputShapeAndReshapeOrders)), - FuseSDPAReshapeTransposeTest::getTestCaseName); -} // namespace +// using InputShapeAndReshapeOrder = std::pair, std::vector>; +// using FuseSDPAReshapeTransposeTestParams = std::tuple; +// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, +// public testing::WithParamInterface, +// public CPUTestsBase { +// public: +// static std::string getTestCaseName(const testing::TestParamInfo& obj) { +// ElementType inType; +// InputShapeAndReshapeOrder inputShapeAndOrders; +// std::tie(inType, inputShapeAndOrders) = obj.param; +// std::ostringstream result; +// std::vector& inputShapes = inputShapeAndOrders.first; +// auto& reshapeOrderHS = inputShapeAndOrders.second; +// result << "IS="; +// for (const auto& shape : inputShapes) { +// result << ov::test::utils::partialShape2str({shape.first}) << "_"; +// } +// result << "TS="; +// for (const auto& shape : inputShapes) { +// result << "("; +// if (!shape.second.empty()) { +// for (const auto& itr : shape.second) { +// result << ov::test::utils::vec2str(itr); +// } +// } +// result << ")_"; +// } +// result << "Prc=" << inType << "_"; +// result << "ReshapeOrderHS="; +// result << "("; +// for (const auto& itr : reshapeOrderHS) { +// 
result << itr << ","; +// } +// result << ")"; + +// return result.str(); +// } + +// void SetUp() override { +// ElementType inType; +// InputShapeAndReshapeOrder inputShapeAndOrders; +// std::tie(inType, inputShapeAndOrders) = this->GetParam(); +// std::vector& inputShapes = inputShapeAndOrders.first; +// auto& reshapeOrderHS = inputShapeAndOrders.second; +// targetDevice = ov::test::utils::DEVICE_CPU; +// rel_threshold = 1e-2f; +// configuration[ov::hint::inference_precision.name()] = ov::element::f32; +// if (inType == ElementType::bf16) { +// configuration[ov::hint::inference_precision.name()] = ov::element::bf16; +// rel_threshold = 0.01f; +// } +// init_input_shapes(inputShapes); + +// // pre SDPA reshape->transpose +// ov::ParameterVector inputParams(3); +// ov::SinkVector sinkNodes; +// OutputVector transposes(3); +// for (size_t i = 0; i < 3u; i++) { +// inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); + +// auto reshape_axis = +// ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); + +// std::shared_ptr reshape_input_1 = inputParams[i]; +// if (i > 0) { +// auto var = std::make_shared( +// ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); +// auto readvalue = std::make_shared(inputParams[i], var); +// auto assign = std::make_shared(readvalue, var); +// sinkNodes.emplace_back(assign); +// reshape_input_1 = readvalue; +// } + +// auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); +// auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); +// transposes[i] = std::make_shared(reshape, transposeOrder); +// } + +// auto sdpa = std::make_shared(transposes, false); +// sdpa->set_friendly_name("mha"); + +// // post SDPA transpose + reshape +// auto postOrder = +// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS +// auto transposeSDPA = std::make_shared(sdpa, 
postOrder); + +// auto constReshape = +// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); +// auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS + +// function = std::make_shared(ov::OutputVector{reshapeSDPA}, +// sinkNodes, +// inputParams, +// "FuseSDPAReshapeTranspose"); +// targetDevice = ov::test::utils::DEVICE_CPU; +// functionRefs = function->clone(); +// pass::Manager manager; +// // decompose ScaledDotProductAttention +// manager.register_pass(); +// manager.run_passes(functionRefs); +// } + +// template +// static void strided_iota(IT first, size_t n, T value, T stride) { +// for (size_t i = 0; i < n; i++) { +// *first++ = value; +// value += stride; +// } +// } +// void generate(int idx, const std::vector& targetInputStaticShapes) { +// inputs.clear(); +// auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { +// if (param->get_element_type() == ov::element::i32) { +// ov::Tensor t{ov::element::i32, shape}; +// auto size = ov::shape_size(shape); +// auto* p = static_cast(t.data()); +// auto start = static_cast(val); +// for (size_t i = 0; i < size; i++) { +// p[i] = (start + i) % size; +// } +// inputs.insert({param, t}); +// } else if (param->get_element_type() == ov::element::f32) { +// ov::Tensor t{ov::element::f32, shape}; +// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); +// inputs.insert({param, t}); +// } else { +// ASSERT_TRUE(param->get_element_type() == ov::element::bf16); +// ov::Tensor t{ov::element::bf16, shape}; +// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); +// inputs.insert({param, t}); +// } +// }; +// // q, k, v +// create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); +// create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); +// create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); +// } +// 
void prepare() { +// compile_model(); +// inferRequest = compiledModel.create_infer_request(); +// ASSERT_TRUE(inferRequest); +// } +// void reset() { +// for (auto&& state : inferRequest.query_state()) { +// state.reset(); +// } +// } + +// std::vector run_test(std::shared_ptr model) { +// function = model; +// prepare(); +// std::vector outputs; +// int idx = 0; +// for (auto&& shapes : targetStaticShapes) { +// generate(idx++, shapes); +// for (const auto& input : inputs) { +// inferRequest.set_tensor(input.first, input.second); +// } +// inferRequest.infer(); +// auto outputTensor = inferRequest.get_output_tensor(0); +// ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; +// outputTensor.copy_to(copy); +// outputs.push_back(copy); +// reset(); +// } +// return outputs; +// } +// }; + +// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { +// SKIP_IF_CURRENT_TEST_IS_DISABLED(); +// bool reshape_transpose_fused = false; +// auto actualOutputs = run_test(function); +// CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); +// CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); +// CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); +// for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { +// if (n->get_friendly_name() == "mha/fused_reshape_transpose") { +// reshape_transpose_fused = true; +// } +// } +// ASSERT_TRUE(reshape_transpose_fused); + +// auto expectedOutputs = run_test(functionRefs); +// for (size_t i = 0; i < actualOutputs.size(); i++) { +// ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); +// } +// } + +// namespace { +// const std::vector inputShapeAndReshapeOrders = { +// // +// { +// {{ +// // Q,K,V:[B, L, H*S] +// {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, +// }, +// // reshapeOrderHS +// {4, 16}}, +// }}; + +// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, +// 
FuseSDPAReshapeTransposeTest, +// ::testing::Combine(::testing::Values(ElementType::f32), +// ::testing::ValuesIn(inputShapeAndReshapeOrders)), +// FuseSDPAReshapeTransposeTest::getTestCaseName); +// } // namespace } // namespace test } // namespace ov From 1b9357717af73003e9e3a3c53e3238b151df5bcc Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Fri, 3 Jan 2025 05:31:50 +0100 Subject: [PATCH 12/13] apply review comments --- .../common/pass/stateful_sdpa_fusion.cpp | 2 + .../x64/fuse_reshape_transpose_to_sdpa.cpp | 414 +++++++++--------- 2 files changed, 209 insertions(+), 207 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index e930abf1102a8c..adc590b41cc948 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -308,6 +308,8 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); + // TODO: SDPAFuseTransposeReshape may cause regressions in icx. 
+ // CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); manager.run_passes(f); return false; diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp index 0da3732c295b5c..a646eb03df1a31 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp @@ -34,212 +34,212 @@ namespace test { */ // -// using InputShapeAndReshapeOrder = std::pair, std::vector>; -// using FuseSDPAReshapeTransposeTestParams = std::tuple; -// class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, -// public testing::WithParamInterface, -// public CPUTestsBase { -// public: -// static std::string getTestCaseName(const testing::TestParamInfo& obj) { -// ElementType inType; -// InputShapeAndReshapeOrder inputShapeAndOrders; -// std::tie(inType, inputShapeAndOrders) = obj.param; -// std::ostringstream result; -// std::vector& inputShapes = inputShapeAndOrders.first; -// auto& reshapeOrderHS = inputShapeAndOrders.second; -// result << "IS="; -// for (const auto& shape : inputShapes) { -// result << ov::test::utils::partialShape2str({shape.first}) << "_"; -// } -// result << "TS="; -// for (const auto& shape : inputShapes) { -// result << "("; -// if (!shape.second.empty()) { -// for (const auto& itr : shape.second) { -// result << ov::test::utils::vec2str(itr); -// } -// } -// result << ")_"; -// } -// result << "Prc=" << inType << "_"; -// result << "ReshapeOrderHS="; -// result << "("; -// for (const auto& itr : reshapeOrderHS) { -// result << itr << ","; -// } -// result << ")"; - -// return result.str(); -// } - -// void SetUp() override { -// ElementType inType; -// InputShapeAndReshapeOrder inputShapeAndOrders; -// 
std::tie(inType, inputShapeAndOrders) = this->GetParam(); -// std::vector& inputShapes = inputShapeAndOrders.first; -// auto& reshapeOrderHS = inputShapeAndOrders.second; -// targetDevice = ov::test::utils::DEVICE_CPU; -// rel_threshold = 1e-2f; -// configuration[ov::hint::inference_precision.name()] = ov::element::f32; -// if (inType == ElementType::bf16) { -// configuration[ov::hint::inference_precision.name()] = ov::element::bf16; -// rel_threshold = 0.01f; -// } -// init_input_shapes(inputShapes); - -// // pre SDPA reshape->transpose -// ov::ParameterVector inputParams(3); -// ov::SinkVector sinkNodes; -// OutputVector transposes(3); -// for (size_t i = 0; i < 3u; i++) { -// inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); - -// auto reshape_axis = -// ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); - -// std::shared_ptr reshape_input_1 = inputParams[i]; -// if (i > 0) { -// auto var = std::make_shared( -// ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); -// auto readvalue = std::make_shared(inputParams[i], var); -// auto assign = std::make_shared(readvalue, var); -// sinkNodes.emplace_back(assign); -// reshape_input_1 = readvalue; -// } - -// auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); -// auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); -// transposes[i] = std::make_shared(reshape, transposeOrder); -// } - -// auto sdpa = std::make_shared(transposes, false); -// sdpa->set_friendly_name("mha"); - -// // post SDPA transpose + reshape -// auto postOrder = -// ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS -// auto transposeSDPA = std::make_shared(sdpa, postOrder); - -// auto constReshape = -// ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); -// auto reshapeSDPA = std::make_shared(transposeSDPA, 
constReshape, true); // BLHS -> B,L,HxS - -// function = std::make_shared(ov::OutputVector{reshapeSDPA}, -// sinkNodes, -// inputParams, -// "FuseSDPAReshapeTranspose"); -// targetDevice = ov::test::utils::DEVICE_CPU; -// functionRefs = function->clone(); -// pass::Manager manager; -// // decompose ScaledDotProductAttention -// manager.register_pass(); -// manager.run_passes(functionRefs); -// } - -// template -// static void strided_iota(IT first, size_t n, T value, T stride) { -// for (size_t i = 0; i < n; i++) { -// *first++ = value; -// value += stride; -// } -// } -// void generate(int idx, const std::vector& targetInputStaticShapes) { -// inputs.clear(); -// auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { -// if (param->get_element_type() == ov::element::i32) { -// ov::Tensor t{ov::element::i32, shape}; -// auto size = ov::shape_size(shape); -// auto* p = static_cast(t.data()); -// auto start = static_cast(val); -// for (size_t i = 0; i < size; i++) { -// p[i] = (start + i) % size; -// } -// inputs.insert({param, t}); -// } else if (param->get_element_type() == ov::element::f32) { -// ov::Tensor t{ov::element::f32, shape}; -// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); -// inputs.insert({param, t}); -// } else { -// ASSERT_TRUE(param->get_element_type() == ov::element::bf16); -// ov::Tensor t{ov::element::bf16, shape}; -// strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); -// inputs.insert({param, t}); -// } -// }; -// // q, k, v -// create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); -// create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); -// create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); -// } -// void prepare() { -// compile_model(); -// inferRequest = compiledModel.create_infer_request(); -// ASSERT_TRUE(inferRequest); -// } -// void reset() { -// for (auto&& state : 
inferRequest.query_state()) { -// state.reset(); -// } -// } - -// std::vector run_test(std::shared_ptr model) { -// function = model; -// prepare(); -// std::vector outputs; -// int idx = 0; -// for (auto&& shapes : targetStaticShapes) { -// generate(idx++, shapes); -// for (const auto& input : inputs) { -// inferRequest.set_tensor(input.first, input.second); -// } -// inferRequest.infer(); -// auto outputTensor = inferRequest.get_output_tensor(0); -// ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; -// outputTensor.copy_to(copy); -// outputs.push_back(copy); -// reset(); -// } -// return outputs; -// } -// }; - -// TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { -// SKIP_IF_CURRENT_TEST_IS_DISABLED(); -// bool reshape_transpose_fused = false; -// auto actualOutputs = run_test(function); -// CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); -// CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); -// CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); -// for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { -// if (n->get_friendly_name() == "mha/fused_reshape_transpose") { -// reshape_transpose_fused = true; -// } -// } -// ASSERT_TRUE(reshape_transpose_fused); - -// auto expectedOutputs = run_test(functionRefs); -// for (size_t i = 0; i < actualOutputs.size(); i++) { -// ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); -// } -// } - -// namespace { -// const std::vector inputShapeAndReshapeOrders = { -// // -// { -// {{ -// // Q,K,V:[B, L, H*S] -// {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, -// }, -// // reshapeOrderHS -// {4, 16}}, -// }}; - -// INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, -// FuseSDPAReshapeTransposeTest, -// ::testing::Combine(::testing::Values(ElementType::f32), -// ::testing::ValuesIn(inputShapeAndReshapeOrders)), -// 
FuseSDPAReshapeTransposeTest::getTestCaseName); -// } // namespace +using InputShapeAndReshapeOrder = std::pair, std::vector>; +using FuseSDPAReshapeTransposeTestParams = std::tuple; +class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, + public testing::WithParamInterface, + public CPUTestsBase { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + ElementType inType; + InputShapeAndReshapeOrder inputShapeAndOrders; + std::tie(inType, inputShapeAndOrders) = obj.param; + std::ostringstream result; + std::vector& inputShapes = inputShapeAndOrders.first; + auto& reshapeOrderHS = inputShapeAndOrders.second; + result << "IS="; + for (const auto& shape : inputShapes) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + } + result << "TS="; + for (const auto& shape : inputShapes) { + result << "("; + if (!shape.second.empty()) { + for (const auto& itr : shape.second) { + result << ov::test::utils::vec2str(itr); + } + } + result << ")_"; + } + result << "Prc=" << inType << "_"; + result << "ReshapeOrderHS="; + result << "("; + for (const auto& itr : reshapeOrderHS) { + result << itr << ","; + } + result << ")"; + + return result.str(); + } + + void SetUp() override { + ElementType inType; + InputShapeAndReshapeOrder inputShapeAndOrders; + std::tie(inType, inputShapeAndOrders) = this->GetParam(); + std::vector& inputShapes = inputShapeAndOrders.first; + auto& reshapeOrderHS = inputShapeAndOrders.second; + targetDevice = ov::test::utils::DEVICE_CPU; + rel_threshold = 1e-2f; + configuration[ov::hint::inference_precision.name()] = ov::element::f32; + if (inType == ElementType::bf16) { + configuration[ov::hint::inference_precision.name()] = ov::element::bf16; + rel_threshold = 0.01f; + } + init_input_shapes(inputShapes); + + // pre SDPA reshape->transpose + ov::ParameterVector inputParams(3); + ov::SinkVector sinkNodes; + OutputVector transposes(3); + for (size_t i = 0; i < 3u; i++) { + 
inputParams[i] = std::make_shared(inType, inputDynamicShapes[0]); + + auto reshape_axis = + ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 0, reshapeOrderHS[0], reshapeOrderHS[1]}); + + std::shared_ptr reshape_input_1 = inputParams[i]; + if (i > 0) { + auto var = std::make_shared( + ov::op::util::VariableInfo{inputDynamicShapes[0], inType, "var_" + std::to_string(i)}); + auto readvalue = std::make_shared(inputParams[i], var); + auto assign = std::make_shared(readvalue, var); + sinkNodes.emplace_back(assign); + reshape_input_1 = readvalue; + } + + auto reshape = std::make_shared(reshape_input_1, reshape_axis, true); + auto transposeOrder = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); + transposes[i] = std::make_shared(reshape, transposeOrder); + } + + auto sdpa = std::make_shared(transposes, false); + sdpa->set_friendly_name("mha"); + + // post SDPA transpose + reshape + auto postOrder = + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); // BHLS -> BLHS + auto transposeSDPA = std::make_shared(sdpa, postOrder); + + auto constReshape = + ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 0, reshapeOrderHS[0] * reshapeOrderHS[1]}); + auto reshapeSDPA = std::make_shared(transposeSDPA, constReshape, true); // BLHS -> B,L,HxS + + function = std::make_shared(ov::OutputVector{reshapeSDPA}, + sinkNodes, + inputParams, + "FuseSDPAReshapeTranspose"); + targetDevice = ov::test::utils::DEVICE_CPU; + functionRefs = function->clone(); + pass::Manager manager; + // decompose ScaledDotProductAttention + manager.register_pass(); + manager.run_passes(functionRefs); + } + + template + static void strided_iota(IT first, size_t n, T value, T stride) { + for (size_t i = 0; i < n; i++) { + *first++ = value; + value += stride; + } + } + void generate(int idx, const std::vector& targetInputStaticShapes) { + inputs.clear(); + auto create_input = [this] (std::shared_ptr param, ov::Shape shape, float val) { + if 
(param->get_element_type() == ov::element::i32) { + ov::Tensor t{ov::element::i32, shape}; + auto size = ov::shape_size(shape); + auto* p = static_cast(t.data()); + auto start = static_cast(val); + for (size_t i = 0; i < size; i++) { + p[i] = (start + i) % size; + } + inputs.insert({param, t}); + } else if (param->get_element_type() == ov::element::f32) { + ov::Tensor t{ov::element::f32, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); + } else { + ASSERT_TRUE(param->get_element_type() == ov::element::bf16); + ov::Tensor t{ov::element::bf16, shape}; + strided_iota(static_cast(t.data()), t.get_size(), val, 0.1f); + inputs.insert({param, t}); + } + }; + // q, k, v + create_input(function->get_parameters()[0], targetInputStaticShapes[0], idx + 1.0f); + create_input(function->get_parameters()[1], targetInputStaticShapes[0], idx + 2.0f); + create_input(function->get_parameters()[2], targetInputStaticShapes[0], idx + 3.0f); + } + void prepare() { + compile_model(); + inferRequest = compiledModel.create_infer_request(); + ASSERT_TRUE(inferRequest); + } + void reset() { + for (auto&& state : inferRequest.query_state()) { + state.reset(); + } + } + + std::vector run_test(std::shared_ptr model) { + function = model; + prepare(); + std::vector outputs; + int idx = 0; + for (auto&& shapes : targetStaticShapes) { + generate(idx++, shapes); + for (const auto& input : inputs) { + inferRequest.set_tensor(input.first, input.second); + } + inferRequest.infer(); + auto outputTensor = inferRequest.get_output_tensor(0); + ov::Tensor copy{outputTensor.get_element_type(), outputTensor.get_shape()}; + outputTensor.copy_to(copy); + outputs.push_back(copy); + reset(); + } + return outputs; + } +}; + +TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { + GTEST_SKIP() << "TODO: investigate perf-regression on ICX." 
<< std::endl; + bool reshape_transpose_fused = false; + auto actualOutputs = run_test(function); + CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1); + CheckNumberOfNodesWithType(compiledModel, "Reshape", 0); + CheckNumberOfNodesWithType(compiledModel, "Transpose", 0); + for (const auto& n : compiledModel.get_runtime_model()->get_ordered_ops()) { + if (n->get_friendly_name() == "mha/fused_reshape_transpose") { + reshape_transpose_fused = true; + } + } + ASSERT_TRUE(reshape_transpose_fused); + + auto expectedOutputs = run_test(functionRefs); + for (size_t i = 0; i < actualOutputs.size(); i++) { + ov::test::utils::compare(expectedOutputs[i], actualOutputs[i], abs_threshold, rel_threshold); + } +} + +namespace { +const std::vector inputShapeAndReshapeOrders = { + // + { + {{ + // Q,K,V:[B, L, H*S] + {{-1, -1, 4 * 16}, {{1, 1, 4 * 16}, {1, 2, 4 * 16}, {2, 2, 4 * 16}}}, + }, + // reshapeOrderHS + {4, 16}}, + }}; + +INSTANTIATE_TEST_SUITE_P(smoke_FuseSDPAReshapeTransposeTest, + FuseSDPAReshapeTransposeTest, + ::testing::Combine(::testing::Values(ElementType::f32), + ::testing::ValuesIn(inputShapeAndReshapeOrders)), + FuseSDPAReshapeTransposeTest::getTestCaseName); +} // namespace } // namespace test } // namespace ov From a8750e045fca7d4efc79c3f12ef9d699c0f6ae30 Mon Sep 17 00:00:00 2001 From: Luo Cheng Date: Mon, 6 Jan 2025 07:54:22 +0000 Subject: [PATCH 13/13] enable SDPAFuseTransposeReshape with stateful --- .../cpu_opset/common/pass/stateful_sdpa_fusion.cpp | 4 ++-- .../x64/pass/sdpa_fuse_transpose_reshape.cpp | 12 ++++++------ .../src/x64/fuse_reshape_transpose_to_sdpa.cpp | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp index adc590b41cc948..9b9aa4f4b34e48 100644 --- 
a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/stateful_sdpa_fusion.cpp @@ -308,8 +308,8 @@ bool SDPASubgraphFusion::run_on_model(const std::shared_ptr& f) { CPU_REGISTER_PASS_COMMON(manager, ov::pass::SimplifyGatherShapeOf); CPU_REGISTER_PASS_COMMON(manager, ov::pass::transpose_sinking::TSShapeOfForward); CPU_REGISTER_PASS_COMMON(manager, StatefulSDPAFusion); - // TODO: SDPAFuseTransposeReshape may cause regressions in icx. - // CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); + // TODO: remove the following after snippets support patterns with dynamic shapes + CPU_REGISTER_PASS_X64(manager, ov::intel_cpu::SDPAFuseTransposeReshape); manager.run_passes(f); return false; diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp index 9b48708bc8ed5a..e33b468917c51a 100644 --- a/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp +++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/sdpa_fuse_transpose_reshape.cpp @@ -18,13 +18,13 @@ * Description: SDPA fuse transpose and reshape. 
* Original pattern Fused pattern * - * input1 input2 input3 + * input1 readvalue readvalue * | | | * q_reshape k_reshape v_reshap * | | | (qkv transpose and reshape's orders) - * q_transpose k_transpose v_transpose | - * \ | / input1 input2 input3 | - * \ | / \ | / / + * q_transpose k_transpose v_transpose | + * \ | / input1 ReadValue ReadValue | + * \ | / \ | / / * ScaledDotProductAttention ---------> SDPAWithTransposeReshape * | | * out_transpose | @@ -41,8 +41,8 @@ intel_cpu::SDPAFuseTransposeReshape::SDPAFuseTransposeReshape() { MATCHER_SCOPE(SDPAFuseTransposeReshape); auto q_reshape_node = wrap_type({any_input(), any_input()}); - auto k_reshape_node = wrap_type({any_input(), any_input()}); - auto v_reshape_node = wrap_type({any_input(), any_input()}); + auto k_reshape_node = wrap_type({wrap_type(), any_input()}); + auto v_reshape_node = wrap_type({wrap_type(), any_input()}); auto q_transpose_order_node = wrap_type(); auto k_transpose_order_node = wrap_type(); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp index a646eb03df1a31..a75156c0f69fcb 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/fuse_reshape_transpose_to_sdpa.cpp @@ -204,7 +204,7 @@ class FuseSDPAReshapeTransposeTest : virtual public ov::test::SubgraphBaseTest, }; TEST_P(FuseSDPAReshapeTransposeTest, CompareWithRefs) { - GTEST_SKIP() << "TODO: investigate perf-regression on ICX." << std::endl; + SKIP_IF_CURRENT_TEST_IS_DISABLED(); bool reshape_transpose_fused = false; auto actualOutputs = run_test(function); CheckNumberOfNodesWithType(compiledModel, "ScaledDotProductAttention", 1);