From dcc06d74d47b6785ba3a9cafcc181f93d65990f0 Mon Sep 17 00:00:00 2001
From: Lzx
Date: Thu, 1 Feb 2024 21:39:46 -0800
Subject: [PATCH 01/46] add cma

---
 .../page_fault/cma/cma_monitor.bpf.c          | 80 +++++++++++++++++++
 .../mem_watcher/page_fault/cma/cma_monitor.c  | 52 ++++++++++++
 .../mem_watcher/page_fault/cma/cma_monitor.h  | 13 +++
 3 files changed, 145 insertions(+)
 create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.bpf.c
 create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.c
 create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.h

diff --git a/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.bpf.c b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.bpf.c
new file mode 100644
index 000000000..dd18dd50c
--- /dev/null
+++ b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.bpf.c
@@ -0,0 +1,80 @@
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+#include "cma_monitor.h"
+
+char LICENSE[] SEC("license") = "Dual BSD/GPL";
+
+/* Latency histogram: bucket index -> number of samples counted in it. */
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, unsigned);
+    __type(value, u64);
+} count_map SEC(".maps");
+
+/* Start timestamp (ns) of the measurement in flight, keyed by thread id. */
+struct {
+    __uint(type, BPF_MAP_TYPE_HASH);
+    __uint(max_entries, 8192);
+    __type(key, u32);
+    __type(value, u64);
+} time_map SEC(".maps");
+
+/*
+ * Entry probe: remember when the current thread entered cma_alloc().
+ *
+ * NOTE(review): the two programs were attached the other way round (this one
+ * as a kretprobe, the one below as a kprobe), so the interval ran from a
+ * function return to a later function entry. Entry -> return is assumed to
+ * be the intent; confirm the probe points against the kernel being traced.
+ */
+SEC("kprobe/cma_alloc")
+int BPF_KPROBE(cma_alloc)
+{
+    /* The helper returns tgid << 32 | pid; the cast keeps the thread id. */
+    u32 pid = bpf_get_current_pid_tgid();
+    u64 ts = bpf_ktime_get_ns();
+
+    bpf_map_update_elem(&time_map, &pid, &ts, BPF_ANY);
+
+    return 0;
+}
+
+/*
+ * Return probe: convert the time elapsed since the matching entry into a
+ * histogram bucket and count it.
+ */
+SEC("kretprobe/alloc_contig_range")
+int BPF_KRETPROBE(alloc_contig_range)
+{
+    u32 pid = bpf_get_current_pid_tgid();
+    u64 tm = bpf_ktime_get_ns();
+    u64 *tsp = bpf_map_lookup_elem(&time_map, &pid);
+
+    /* No start timestamp recorded for this thread: nothing to measure. */
+    if (!tsp)
+        return 1;
+    tm -= *tsp;
+
+    /* Clamp in 64 bits first so a huge delta cannot wrap the 32-bit key. */
+    u64 bucket = tm / CMA_BUCKET_NS;
+    if (bucket > INTERVAL_MAX - 1)
+        bucket = INTERVAL_MAX - 1;
+    unsigned key = bucket;
+
+    u64 *value = bpf_map_lookup_elem(&count_map, &key);
+    if (value) {
+        /* Several CPUs can hit the same bucket at once. */
+        __sync_fetch_and_add(value, 1);
+    } else {
+        u64 init_value = 1;
+
+        bpf_map_update_elem(&count_map, &key, &init_value, BPF_ANY);
+    }
+
+    bpf_map_delete_elem(&time_map, &pid);
+
+    return 0;
+}
diff --git a/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.c b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.c
new file mode 100644
index 000000000..cecf5bc82
--- /dev/null
+++ b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.c
@@ -0,0 +1,52 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+#include "cma_monitor.h"
+#include "cma_monitor.skel.h"
+
+/*
+ * Load and attach the CMA probes, then print the latency histogram every
+ * five seconds until the process is killed.
+ */
+int main(int argc, char **argv)
+{
+    struct cma_monitor_bpf *skel = cma_monitor_bpf__open_and_load();
+    if (!skel) {
+        fprintf(stderr, "Failed to open BPF skeleton\n");
+        return 1;
+    }
+
+    /* Loading alone does not hook the probes; they have to be attached. */
+    if (cma_monitor_bpf__attach(skel)) {
+        fprintf(stderr, "Failed to attach BPF skeleton\n");
+        cma_monitor_bpf__destroy(skel);
+        return 1;
+    }
+
+    /* The histogram lives in count_map; time_map only holds start times. */
+    int fd = bpf_map__fd(skel->maps.count_map);
+    unsigned key;
+
+    for (;;) {
+        sleep(5);
+
+        for (key = 0; key < INTERVAL_MAX; key++) {
+            unsigned long long value = 0;
+
+            /* A bucket that was never hit has no entry: report it as 0. */
+            if (bpf_map_lookup_elem(fd, &key, &value))
+                value = 0;
+
+            if (key < INTERVAL_MAX - 1)
+                printf("Range %ums - %ums\tCount:%llu\n",
+                       key * CMA_BUCKET_MS, (key + 1) * CMA_BUCKET_MS, value);
+            else
+                printf("Over %ums\t\tCount:%llu\n", key * CMA_BUCKET_MS, value);
+        }
+
+        printf("=========================================\n");
+    }
+
+    return 0;
+}
diff --git a/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.h b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.h
new file mode 100644
index 000000000..b9b3eddb1
--- /dev/null
+++ b/eBPF_Supermarket/mem_watcher/page_fault/cma/cma_monitor.h
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+/* Copyright (c) 2022 Jacky Yin */
+#ifndef __CMA_MONITOR_H
+#define __CMA_MONITOR_H
+
+/* Number of histogram buckets; the last one collects every longer sample. */
+#define INTERVAL_MAX 6U
+
+/* Width of one histogram bucket, shared by the BPF side and the loader. */
+#define CMA_BUCKET_MS 10U
+#define CMA_BUCKET_NS (CMA_BUCKET_MS * 1000000ULL)
+
+#endif /* __CMA_MONITOR_H */

From 6bc2674f40b53257ccaa7ddca20a245c5544d5a0 Mon Sep 17 00:00:00 2001
From: nanshuaibo
Date: Fri, 1 Mar 2024 13:52:13 +0800
Subject: [PATCH 02/46] update mk

---
eBPF_Supermarket/kvm_watcher/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index b330f8503..24e5bc390 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -7,7 +7,7 @@ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ | sed 's/riscv64/riscv/' \ | sed 's/loongarch64/loongarch/') APP = src/kvm_watcher -OPTIONS = -f -w -n -d -c '-e -s' +OPTIONS = -f -w -n -d -c -h -e # 共同规则1 define common_rules1 From 244a659d76fece658f0bded463235c403c2ace7a Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 13:55:25 +0800 Subject: [PATCH 03/46] update mk --- eBPF_Supermarket/kvm_watcher/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index 24e5bc390..fd2b1da4b 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -44,11 +44,11 @@ ifeq ($(MAKECMDGOALS),test) ifeq ($(shell grep -Eoc '(vmx|svm)' /proc/cpuinfo),0) $(error "The CPU in your device does not support virtualization!") endif + $(common_rules1) + $(common_rules2) ifeq ($(wildcard ./cirros-0.5.2-x86_64-disk.img),) wget https://gitee.com/nan-shuaibo/cirros/releases/download/0.5.2/cirros-0.5.2-x86_64-disk.img endif - $(common_rules1) - $(common_rules2) # 安装 qemu $(INSTALL_QEMU) # 启动虚拟机 From 3ca0f8d042e147233db47bf1a6f353ab6ed61b49 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 14:01:38 +0800 Subject: [PATCH 04/46] =?UTF-8?q?=E4=BF=AE=E6=94=B9action?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/workflows/kvm_watcher.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index 49b22a6c0..12a57d6cf 100644 --- a/.github/workflows/kvm_watcher.yml +++ 
b/.github/workflows/kvm_watcher.yml @@ -22,6 +22,7 @@ jobs: - name: Test program execution run: | + sudo modprobe kvm && sudo modprobe kvm-intel cd eBPF_Supermarket/kvm_watcher/ make test From ed2f1775b576f02e65279f56bd88c5a8e0fb55ff Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 14:09:06 +0800 Subject: [PATCH 05/46] update yml --- .github/workflows/kvm_watcher.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index 12a57d6cf..0a3f7525a 100644 --- a/.github/workflows/kvm_watcher.yml +++ b/.github/workflows/kvm_watcher.yml @@ -22,7 +22,6 @@ jobs: - name: Test program execution run: | - sudo modprobe kvm && sudo modprobe kvm-intel cd eBPF_Supermarket/kvm_watcher/ - make test + make From 2b84b8675d464cecf4f6085fbf9d390e921437f2 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 14:11:28 +0800 Subject: [PATCH 06/46] update mk --- eBPF_Supermarket/kvm_watcher/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index fd2b1da4b..66e6f6737 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -13,8 +13,6 @@ OPTIONS = -f -w -n -d -c -h -e define common_rules1 # 安装依赖 sudo apt install clang libelf1 libelf-dev zlib1g-dev libbpf-dev linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r) - # 加载KVM模块 - sudo modprobe kvm && sudo modprobe kvm-intel # 生成vmlinux.h文件 bpftool btf dump file /sys/kernel/btf/kvm format c > ./include/vmlinux.h endef @@ -44,6 +42,8 @@ ifeq ($(MAKECMDGOALS),test) ifeq ($(shell grep -Eoc '(vmx|svm)' /proc/cpuinfo),0) $(error "The CPU in your device does not support virtualization!") endif + # 加载KVM模块 + sudo modprobe kvm && sudo modprobe kvm-intel $(common_rules1) $(common_rules2) ifeq ($(wildcard ./cirros-0.5.2-x86_64-disk.img),) From 3bfa9a481b7e77d3c75f13d15b3aa623a91a907e Mon Sep 17 
00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 16:32:03 +0800 Subject: [PATCH 07/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0hypercall=EF=BC=88?= =?UTF-8?q?=E8=B6=85=E7=BA=A7=E8=B0=83=E7=94=A8=EF=BC=89=E7=BB=9F=E8=AE=A1?= =?UTF-8?q?=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/include/kvm_hypercall.h | 98 ++++++ .../kvm_watcher/include/kvm_watcher.h | 39 ++- .../kvm_watcher/src/kvm_watcher.bpf.c | 8 +- .../kvm_watcher/src/kvm_watcher.c | 329 +++++++++++++----- 4 files changed, 382 insertions(+), 92 deletions(-) create mode 100644 eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h b/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h new file mode 100644 index 000000000..dba8bf0d8 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h @@ -0,0 +1,98 @@ +// Copyright 2023 The LMP Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/linuxkerneltravel/lmp/blob/develop/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// author: nanshuaibo811@163.com +// +// Kernel space BPF program used for monitoring data for KVM HYPERCALL. 
+ +#ifndef __KVM_HYPERCALL_H +#define __KVM_HYPERCALL_H + +#include "kvm_watcher.h" +#include "vmlinux.h" +#include +#include +#include + +// 定义宏从寄存器读取超级调用信息 +// 代码来源:arch/x86/kvm/kvm_cache_regs.h +#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \ + static __always_inline unsigned long kvm_##lname##_read( \ + struct kvm_vcpu *vcpu) { \ + return vcpu->arch.regs[VCPU_REGS_##uname]; \ + } + +BUILD_KVM_GPR_ACCESSORS(rax, RAX) +BUILD_KVM_GPR_ACCESSORS(rbx, RBX) +BUILD_KVM_GPR_ACCESSORS(rcx, RCX) +BUILD_KVM_GPR_ACCESSORS(rdx, RDX) +BUILD_KVM_GPR_ACCESSORS(rsi, RSI) + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, struct hc_key); + __type(value, struct hc_value); +} hc_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1024); + __type(key, struct hc_key); + __type(value, u32); +} hc_count SEC(".maps"); + +static int entry_emulate_hypercall(struct kvm_vcpu *vcpu, void *rb, + struct common_event *e, pid_t vm_pid) { + CHECK_PID(vm_pid); + u64 nr, a0, a1, a2, a3; + nr = kvm_rax_read(vcpu); // 超级调用号 + // 超级调用参数 + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); + RESERVE_RINGBUF_ENTRY(rb, e); + e->process.pid = pid; + e->process.tid = (u32)bpf_get_current_pid_tgid(); + e->time = bpf_ktime_get_ns(); + bpf_get_current_comm(&e->process.comm, sizeof(e->process.comm)); + e->hypercall_data.a0 = a0; + e->hypercall_data.a1 = a1; + e->hypercall_data.a2 = a2; + e->hypercall_data.a3 = a3; + e->hypercall_data.vcpu_id = vcpu->vcpu_id; + e->hypercall_data.hc_nr = nr; + e->hypercall_data.hypercalls = vcpu->stat.hypercalls; + bpf_ringbuf_submit(e, 0); + struct hc_key hc_key = {.pid = pid, .nr = nr, .vcpu_id = vcpu->vcpu_id}; + struct hc_value hc_value = {.a0 = a0, + .a1 = a1, + .a2 = a2, + .a3 = a3, + .counts = 1, + .hypercalls = vcpu->stat.hypercalls}; + u32 *count; + count = bpf_map_lookup_elem(&hc_count, &hc_key); + if (count) { + __sync_fetch_and_add(count, 
1); + hc_value.counts = *count; + } else { + bpf_map_update_elem(&hc_count, &hc_key, &hc_value.counts, BPF_NOEXIST); + } + bpf_map_update_elem(&hc_map, &hc_key, &hc_value, BPF_ANY); + return 0; +} + +#endif /* __KVM_HYPERCALL_H */ \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h index 55d55343b..f2ac6b8b9 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h @@ -29,11 +29,11 @@ #define NS_TO_MS_WITH_DECIMAL(ns) ((double)(ns) / NS_TO_MS_FACTOR) #define MICROSECONDS_IN_SECOND 1000000 -#define OUTPUT_INTERVAL_SECONDS 0.5 +#define OUTPUT_INTERVAL_SECONDS 2 #define OUTPUT_INTERVAL(us) usleep((__u32)(us * MICROSECONDS_IN_SECOND)) -#define OPTIONS_LIST "-w, -p, -d, -f, -c, -i, or -e" +#define OPTIONS_LIST "-w, -p, -d, -f, -c, -i, -h or -e" #define PFERR_PRESENT_BIT 0 #define PFERR_WRITE_BIT 1 @@ -70,7 +70,7 @@ } while (0) // 定义清屏宏 -#define CLEAR_SCREEN() printf("\033[2J\033[H") +#define CLEAR_SCREEN() printf("\033[2J\033[H\n") #define RING_BUFFER_TIMEOUT_MS 100 @@ -88,11 +88,6 @@ return 0; \ } -struct ExitReason { - __u32 number; - const char *name; -}; - struct reason_info { __u64 time; __u64 reason; @@ -107,6 +102,22 @@ struct dirty_page_info { __u32 pid; }; +struct hc_value { + __u64 a0; + __u64 a1; + __u64 a2; + __u64 a3; + __u64 hypercalls; // value of the vcpu's hypercall counter + __u32 counts; // occurrences of this particular hypercall + __u32 pad; +}; + +struct hc_key { + __u64 nr; + pid_t pid; + __u32 vcpu_id; +}; + struct process { __u32 pid; __u32 tid; @@ -122,6 +133,7 @@ enum EventType { PAGE_FAULT, IRQCHIP, IRQ_INJECT, + HYPERCALL, } event_type; struct common_event { @@ -200,6 +212,17 @@ struct common_event { __u64 injections; // IRQ_INJECT 特有成员 } irq_inject_data; + + struct { + __u64 hc_nr; + __u64 a0; + __u64 a1; + __u64 a2; + __u64 a3; + __u64 hypercalls; + __u32 vcpu_id; + // HYPERCALL-specific members + } hypercall_data; }; }; diff --git
a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c index 0564e4704..4118983fc 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c @@ -24,6 +24,7 @@ #include "../include/kvm_vcpu.h" #include "../include/kvm_mmu.h" #include "../include/kvm_irq.h" +#include "../include/kvm_hypercall.h" #include "../include/kvm_watcher.h" char LICENSE[] SEC("license") = "Dual BSD/GPL"; @@ -138,4 +139,9 @@ int BPF_PROG(fentry_vmx_inject_irq, struct kvm_vcpu *vcpu, bool reinjected) { SEC("fexit/vmx_inject_irq") int BPF_PROG(fexit_vmx_inject_irq, struct kvm_vcpu *vcpu, bool reinjected) { return exit_vmx_inject_irq(vcpu, &rb, e); -} \ No newline at end of file +} + +SEC("fentry/kvm_emulate_hypercall") +int BPF_PROG(fentry_emulate_hypercall, struct kvm_vcpu *vcpu) { + return entry_emulate_hypercall(vcpu, &rb, e, vm_pid); +} diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 127b5c69c..300540f64 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -30,71 +30,119 @@ #include "../include/kvm_watcher.h" #include "kvm_watcher.skel.h" -// 定义具体的退出原因 arch/x86/include/uapi/asm/vmx.h -struct ExitReason exitReasons[] = {{0, "EXCEPTION_NMI"}, - {1, "EXTERNAL_INTERRUPT"}, - {2, "TRIPLE_FAULT"}, - {3, "INIT_SIGNAL"}, - {4, "SIPI_SIGNAL"}, - {7, "INTERRUPT_WINDOW"}, - {8, "NMI_WINDOW"}, - {9, "TASK_SWITCH"}, - {10, "CPUID"}, - {12, "HLT"}, - {13, "INVD"}, - {14, "INVLPG"}, - {15, "RDPMC"}, - {16, "RDTSC"}, - {18, "VMCALL"}, - {19, "VMCLEAR"}, - {20, "VMLAUNCH"}, - {21, "VMPTRLD"}, - {22, "VMPTRST"}, - {23, "VMREAD"}, - {24, "VMRESUME"}, - {25, "VMWRITE"}, - {26, "VMOFF"}, - {27, "VMON"}, - {28, "CR_ACCESS"}, - {29, "DR_ACCESS"}, - {30, "IO_INSTRUCTION"}, - {31, "MSR_READ"}, - {32, "MSR_WRITE"}, - {33, "INVALID_STATE"}, - {34, "MSR_LOAD_FAIL"}, - {36, 
"MWAIT_INSTRUCTION"}, - {37, "MONITOR_TRAP_FLAG"}, - {39, "MONITOR_INSTRUCTION"}, - {40, "PAUSE_INSTRUCTION"}, - {41, "MCE_DURING_VMENTRY"}, - {43, "TPR_BELOW_THRESHOLD"}, - {44, "APIC_ACCESS"}, - {45, "EOI_INDUCED"}, - {46, "GDTR_IDTR"}, - {47, "LDTR_TR"}, - {48, "EPT_VIOLATION"}, - {49, "EPT_MISCONFIG"}, - {50, "INVEPT"}, - {51, "RDTSCP"}, - {52, "PREEMPTION_TIMER"}, - {53, "INVVPID"}, - {54, "WBINVD"}, - {55, "XSETBV"}, - {56, "APIC_WRITE"}, - {57, "RDRAND"}, - {58, "INVPCID"}, - {59, "VMFUNC"}, - {60, "ENCLS"}, - {61, "RDSEED"}, - {62, "PML_FULL"}, - {63, "XSAVES"}, - {64, "XRSTORS"}, - {67, "UMWAIT"}, - {68, "TPAUSE"}, - {74, "BUS_LOCK"}, - {75, "NOTIFY"}}; +// 创建并打开临时文件 +FILE *create_temp_file(const char *filename) { + const char *directory = "./temp"; + char filepath[256]; + + // 构建文件的完整路径 + snprintf(filepath, sizeof(filepath), "%s/%s", directory, filename); + + // 创建目录,如果不存在 + if (mkdir(directory, 0777) == -1 && errno != EEXIST) { + perror("Failed to create directory"); + return NULL; + } + + // 尝试打开文件 + FILE *output = fopen(filepath, "w"); + if (!output) { + perror("Failed to open output file"); + return NULL; + } + + return output; +} + +const char *getHypercallName(int number) { + struct Hypercall { + int number; + const char *name; + }; + + // 定义超级调用 include\uapi\linux\kvm_para.h + struct Hypercall hypercalls[] = { + {1, "VAPIC_POLL_IRQ"}, {5, "KICK_CPU"}, {9, "CLOCK_PAIRING"}, + {10, "SEND_IPI"}, {11, "SCHED_YIELD"}, {12, "MAP_GPA_RANGE"}}; + + for (int i = 0; i < sizeof(hypercalls) / sizeof(hypercalls[0]); i++) { + if (hypercalls[i].number == number) { + return hypercalls[i].name; + } + } + return "Unknown"; // 如果找不到对应的超级调用号,返回一个默认值 +} const char *getExitReasonName(int number) { + struct ExitReason { + int number; + const char *name; + }; + + // 定义具体的退出原因 arch/x86/include/uapi/asm/vmx.h + struct ExitReason exitReasons[] = {{0, "EXCEPTION_NMI"}, + {1, "EXTERNAL_INTERRUPT"}, + {2, "TRIPLE_FAULT"}, + {3, "INIT_SIGNAL"}, + {4, "SIPI_SIGNAL"}, + {7, 
"INTERRUPT_WINDOW"}, + {8, "NMI_WINDOW"}, + {9, "TASK_SWITCH"}, + {10, "CPUID"}, + {12, "HLT"}, + {13, "INVD"}, + {14, "INVLPG"}, + {15, "RDPMC"}, + {16, "RDTSC"}, + {18, "VMCALL"}, + {19, "VMCLEAR"}, + {20, "VMLAUNCH"}, + {21, "VMPTRLD"}, + {22, "VMPTRST"}, + {23, "VMREAD"}, + {24, "VMRESUME"}, + {25, "VMWRITE"}, + {26, "VMOFF"}, + {27, "VMON"}, + {28, "CR_ACCESS"}, + {29, "DR_ACCESS"}, + {30, "IO_INSTRUCTION"}, + {31, "MSR_READ"}, + {32, "MSR_WRITE"}, + {33, "INVALID_STATE"}, + {34, "MSR_LOAD_FAIL"}, + {36, "MWAIT_INSTRUCTION"}, + {37, "MONITOR_TRAP_FLAG"}, + {39, "MONITOR_INSTRUCTION"}, + {40, "PAUSE_INSTRUCTION"}, + {41, "MCE_DURING_VMENTRY"}, + {43, "TPR_BELOW_THRESHOLD"}, + {44, "APIC_ACCESS"}, + {45, "EOI_INDUCED"}, + {46, "GDTR_IDTR"}, + {47, "LDTR_TR"}, + {48, "EPT_VIOLATION"}, + {49, "EPT_MISCONFIG"}, + {50, "INVEPT"}, + {51, "RDTSCP"}, + {52, "PREEMPTION_TIMER"}, + {53, "INVVPID"}, + {54, "WBINVD"}, + {55, "XSETBV"}, + {56, "APIC_WRITE"}, + {57, "RDRAND"}, + {58, "INVPCID"}, + {59, "VMFUNC"}, + {60, "ENCLS"}, + {61, "RDSEED"}, + {62, "PML_FULL"}, + {63, "XSAVES"}, + {64, "XRSTORS"}, + {67, "UMWAIT"}, + {68, "TPAUSE"}, + {74, "BUS_LOCK"}, + {75, "NOTIFY"}}; + for (int i = 0; i < sizeof(exitReasons) / sizeof(exitReasons[0]); i++) { if (exitReasons[i].number == number) { return exitReasons[i].name; @@ -228,25 +276,12 @@ int compare(const void *a, const void *b) { // 保存脏页信息到文件 int save_count_dirtypagemap_to_file(struct bpf_map *map) { - const char *directory = "./temp"; - const char *filename = "./temp/dirty_temp"; - - // 创建目录,如果不存在 - if (mkdir(directory, 0777) == -1) { - // 如果目录已经存在,这里的错误是预期的,可以忽略 - // 否则,打印错误信息并返回 - if (errno != EEXIST) { - perror("Failed to create directory"); - return -1; - } - } - - FILE *output = fopen(filename, "w"); + const char *filename = "dirty_temp"; + FILE *output = create_temp_file(filename); if (!output) { - perror("Failed to open output file"); - return -1; + fprintf(stderr, "Failed to create file in directory\n"); + return 
1; } - int count_dirty_fd = bpf_map__fd(map); struct dirty_page_info lookup_key = {}; struct dirty_page_info next_key = {}; @@ -311,6 +346,7 @@ static struct env { bool mmio_page_fault; bool execute_irqchip; bool execute_irq_inject; + bool execute_hypercall; int monitoring_time; pid_t vm_pid; enum EventType event_type; @@ -324,6 +360,7 @@ static struct env { .execute_irqchip = false, .execute_irq_inject = false, .mmio_page_fault = false, + .execute_hypercall = false, .monitoring_time = 0, .vm_pid = -1, .event_type = NONE_TYPE, @@ -347,6 +384,7 @@ static const struct argp_option opts[] = { "Monitor the irqchip setting information in KVM VM."}, {"irq_inject(x86)", 'i', NULL, 0, "Monitor the virq injection information in KVM VM "}, + {"hypercall", 'h', NULL, 0, "Monitor the hypercall information in KVM VM "}, {"stat", 's', NULL, 0, "Display statistical data.(The -e option must be specified.)"}, {"mmio", 'm', NULL, 0, @@ -354,13 +392,13 @@ static const struct argp_option opts[] = { "specified.)"}, {"vm_pid", 'p', "PID", 0, "Specify the virtual machine pid to monitor."}, {"monitoring_time", 't', "SEC", 0, "Time for monitoring."}, - {NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help"}, + {NULL, 'H', NULL, OPTION_HIDDEN, "Show the full help"}, {}, }; // 解析命令行参数 static error_t parse_arg(int key, char *arg, struct argp_state *state) { switch (key) { - case 'h': + case 'H': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; case 'w': @@ -387,6 +425,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { case 'i': SET_OPTION_AND_CHECK_USAGE(option_selected, env.execute_irq_inject); break; + case 'h': + SET_OPTION_AND_CHECK_USAGE(option_selected, env.execute_hypercall); + break; case 's': if (env.execute_exit) { env.ShowStats = true; @@ -467,6 +508,8 @@ static int determineEventType(struct env *env) { env->event_type = IRQCHIP; } else if (env->execute_irq_inject) { env->event_type = IRQ_INJECT; + } else if (env->execute_hypercall) { + 
env->event_type = HYPERCALL; } else { env->event_type = NONE_TYPE; // 或者根据需要设置一个默认的事件类型 } @@ -641,6 +684,44 @@ static int handle_event(void *ctx, void *data, size_t data_sz) { e->irq_inject_data.soft ? "Soft/INTn" : "IRQ"); break; } + case HYPERCALL: { + const char *filename = "./temp/hc_temp"; + FILE *output = fopen(filename, "a"); + if (!output) { + perror("Failed to open output file"); + return -1; + } + fprintf(output, "%-18.6f %-15s %-10d %-10d %-10s %-11llu", + timestamp_ms, e->process.comm, e->process.pid, + e->hypercall_data.vcpu_id, + getHypercallName(e->hypercall_data.hc_nr), + e->hypercall_data.hypercalls); + if (e->hypercall_data.hc_nr == 5) { + fprintf(output, "apic_id:%llu\n", e->hypercall_data.a1); + } else if (e->hypercall_data.hc_nr == 9) { + fprintf( + output, "GPA:%#llx CLOCK_TYPE:%s\n", e->hypercall_data.a0, + e->hypercall_data.a1 == 0 ? "KVM_CLOCK_PAIRING_WALLCLOCK" + : ""); + } else if (e->hypercall_data.hc_nr == 10) { + fprintf(output, + "ipi_bitmap_low:%#llx,ipi_bitmap_high:%#llx,min(apic_" + "id):%llu,icr:%#llx\n", + e->hypercall_data.a0, e->hypercall_data.a1, + e->hypercall_data.a2, e->hypercall_data.a3); + } else if (e->hypercall_data.hc_nr == 11) { + fprintf(output, "dest apic_id:%llu\n", e->hypercall_data.a0); + } else if (e->hypercall_data.hc_nr == 12) { + fprintf(output, + "GPA start:%#llx,PAGE_NR(4KB):%llu,Attributes:%#llx\n", + e->hypercall_data.a0, e->hypercall_data.a1, + e->hypercall_data.a2); + } else { + fprintf(output, "\n"); + } + fclose(output); + break; + } default: // 处理未知事件类型 break; @@ -687,6 +768,19 @@ static int print_event_head(struct env *env) { "TIME(ms)", "COMM", "PID", "DELAY", "IRQ_NR", "VCPU_ID", "INJECTIONS", "TYPE"); break; + case HYPERCALL: { + const char *filename = "hc_temp"; + FILE *output = create_temp_file(filename); + if (!output) { + fprintf(stderr, "Failed to create file in directory\n"); + return 1; + } + fprintf(output, "%-18s %-15s %-10s %-10s %-10s %-10s %-10s\n", + "TIME(ms)", "COMM", "PID", 
"VCPU_ID", "NAME", "HYPERCALLS", + "ARGS"); + fclose(output); + break; + } default: // Handle default case or display an error message break; @@ -732,6 +826,67 @@ static void set_disable_load(struct kvm_watcher_bpf *skel) { env.execute_irq_inject ? true : false); bpf_program__set_autoload(skel->progs.fexit_vmx_inject_irq, env.execute_irq_inject ? true : false); + bpf_program__set_autoload(skel->progs.fentry_emulate_hypercall, + env.execute_hypercall ? true : false); +} + +int print_hc_map(struct kvm_watcher_bpf *skel) { + int fd = bpf_map__fd(skel->maps.hc_map); + int count_fd = bpf_map__fd(skel->maps.hc_count); + int err; + struct hc_key lookup_key = {}; + struct hc_key next_key = {}; + struct hc_value hc_value = {}; + struct tm *tm; + char ts[32]; + time_t t; + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + int first_run = 1; + // Iterate over the map + while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { + if (first_run) { + first_run = 0; + printf( + "--------------------------------------------------------------" + "----------" + "\n"); + printf("TIME:%s\n", ts); + printf("%-12s %-12s %-12s %-12s %-12s\n", "PID", "VCPU_ID", "NAME", + "COUNTS", "HYPERCALLS"); + } + // Print the current entry + err = bpf_map_lookup_elem(fd, &next_key, &hc_value); + if (err < 0) { + fprintf(stderr, "failed to lookup hc_value: %d\n", err); + return -1; + } + printf("%-12d %-12d %-12s %-12d %-12lld\n", next_key.pid, + next_key.vcpu_id, getHypercallName(next_key.nr), hc_value.counts, + hc_value.hypercalls); + // // Move to the next key + lookup_key = next_key; + } + memset(&lookup_key, 0, sizeof(struct hc_key)); + while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { + err = bpf_map_delete_elem(fd, &next_key); + if (err < 0) { + fprintf(stderr, "failed to cleanup hc_map: %d\n", err); + return -1; + } + lookup_key = next_key; + } + memset(&lookup_key, 0, sizeof(struct hc_key)); + while (!bpf_map_get_next_key(count_fd, &lookup_key, 
&next_key)) { + err = bpf_map_delete_elem(count_fd, &next_key); + if (err < 0) { + fprintf(stderr, "failed to cleanup hc_count: %d\n", err); + return -1; + } + lookup_key = next_key; + } + return 0; } int main(int argc, char **argv) { @@ -808,6 +963,14 @@ int main(int argc, char **argv) { while (!exiting) { // OUTPUT_INTERVAL(OUTPUT_INTERVAL_SECONDS); // 输出间隔 err = ring_buffer__poll(rb, RING_BUFFER_TIMEOUT_MS /* timeout, ms */); + if (env.execute_hypercall && err >= 0) { /* a failed poll is left to the checks below */ + OUTPUT_INTERVAL(OUTPUT_INTERVAL_SECONDS); + err = print_hc_map(skel); /* keep the result so the check tests the print, not the poll */ + if (err < 0) { + printf("Error print map: %d\n", err); + break; + } + } /* Ctrl-C will cause -EINTR */ if (err == -EINTR) { err = 0; @@ -826,7 +989,7 @@ int main(int argc, char **argv) { if (err < 0) { printf("Save count dirty page map to file fail: %d\n", err); goto cleanup; - }else{ + } else { printf("\nSave count dirty page map to file success!\n"); goto cleanup; } From 4c7521716bb29eaffd01b497e1b35a84b92e0dcc Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 16:32:26 +0800 Subject: [PATCH 08/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0hypercall=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E5=AE=9E=E7=8E=B0=E8=BF=87=E7=A8=8B=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/docs/Hypercall.md | 369 ++++++++++++++++++ 1 file changed, 369 insertions(+) create mode 100644 eBPF_Supermarket/kvm_watcher/docs/Hypercall.md diff --git a/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md new file mode 100644 index 000000000..a855470d2 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md @@ -0,0 +1,369 @@ +> 在Linux中,大家应该对syscall非常的了解和熟悉,其是用户态进入内核态的一种途径或者说是一种方式,完成了两个模式之间的切换;而在虚拟环境中,有没有一种类似于syscall这种方式,能够从no root模式切换到root模式呢?答案是肯定的,KVM提供了Hypercall机制,x86体系架构也有相关的指令支持。 +> +> hypercall:当虚拟机的Guest OS需要执行一些更高权限的操作(如:页表的更新、对物理资源的访问等)时,由于自身在非特权域无法完成这些操作,于是便通过调用Hypercall交给Hypervisor来完成这些操作。 + +## Hypercall的发起 +
+KVM代码中提供了五种形式的Hypercall接口: + +``` +file: arch/x86/include/asm/kvm_para.h, line: 34 +static inline long kvm_hypercall0(unsigned int nr); +static inline long kvm_hypercall1(unsigned int nr, unsigned long p1); +static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2); +static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3); +static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4); +``` + +这几个接口的区别在于参数个数的不同,本质是一样的。挑个参数最多的看下: + +``` +static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3, + unsigned long p4) +{ + long ret; + asm volatile(KVM_HYPERCALL + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) + : "memory"); + return ret; +} +``` + +Hypercall内部实现是标准的内嵌汇编,稍作分析: + +### KVM_HYPERCALL + +``` +#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" +``` + +对于KVM hypercall来说,KVM_HYPERCALL是一个三字节的指令序列,x86体系架构下即是vmcall指令,官方手册解释: + +``` +vmcall: + op code:0F 01 C1 -- VMCALL Call to VM + monitor +by causing VM exit +``` + +言简意赅,vmcall会导致VM exit到VMM。 + +### 返回值 + +: "=a"(ret),表示返回值放在eax寄存器中输出。 + +### 输入 + +: "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4),表示输入参数放在对应的eax,ebx,ecx,edx,esi中,而nr可以认为是类似于系统调用号的超级调用号。 + +## hypercall的处理 + +当Guest发起一次hypercall后,VMM会接管到该call导致的VM Exit。 + +``` +static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { + ...... + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, + ......
+} +``` + +进入kvm_emulate_hypercall()处理,过程非常简单: + +``` +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + + // 检查是否启用了Xen超级调用,如果是,则调用Xen超级调用处理函数 + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + // 检查是否启用了Hyper-V超级调用,如果是,则调用Hyper-V超级调用处理函数 + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + // 从寄存器中读取超级调用号及参数 + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); + + // 记录超级调用的追踪信息 + trace_kvm_hypercall(nr, a0, a1, a2, a3); + + // 检查是否为64位超级调用 + op_64_bit = is_64_bit_hypercall(vcpu); + if (!op_64_bit) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + // 检查当前CPU的特权级是否为0 + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + + ret = -KVM_ENOSYS; + + // 根据超级调用号执行相应的操作 + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_KICK_CPU: + // 处理CPU唤醒的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + + kvm_pv_kick_cpu_op(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); + ret = 0; + break; +#ifdef CONFIG_X86_64 + case KVM_HC_CLOCK_PAIRING: + // 处理时钟配对的超级调用 + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; +#endif + case KVM_HC_SEND_IPI: + // 处理发送IPI(处理器间中断)的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; + case KVM_HC_SCHED_YIELD: + // 处理调度让出的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + + kvm_sched_yield(vcpu, a0); + ret = 0; + break; + case KVM_HC_MAP_GPA_RANGE: + // 处理GPA范围映射的超级调用 + ret = -KVM_ENOSYS; + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + // 设置KVM_EXIT_HYPERCALL退出类型,并填充相关信息 + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; +
vcpu->run->hypercall.args[0] = a0; + vcpu->run->hypercall.args[1] = a1; + vcpu->run->hypercall.args[2] = a2; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + default: + ret = -KVM_ENOSYS; + break; + } + +out: + // 如果不是64位超级调用,则返回值需要截断为32位 + if (!op_64_bit) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + + // 更新超级调用统计信息,并跳过被模拟的指令 + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); +} +``` + +### Conclusion + +整个过程非常简洁和简单,hypercall机制给了Guest能够主动进入VMM的一种方式。 + +## 调用号 + +``` +#define KVM_HC_VAPIC_POLL_IRQ 1 +#define KVM_HC_MMU_OP 2 +#define KVM_HC_FEATURES 3 +#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 +#define KVM_HC_KICK_CPU 5 +#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 +#define KVM_HC_MIPS_EXIT_VM 7 +#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 +#define KVM_HC_CLOCK_PAIRING 9 +#define KVM_HC_SEND_IPI 10 +#define KVM_HC_SCHED_YIELD 11 +#define KVM_HC_MAP_GPA_RANGE 12 +``` + + +1. ##### KVM_HC_VAPIC_POLL_IRQ + +------------------------ + +Architecture: x86 +Status: active +Purpose: 触发客户机退出,以便在重新进入时主机可以检查待处理的中断。 + +2. ##### KVM_HC_MMU_OP + +---------------- + +Architecture: x86 +Status: deprecated. +Purpose: 支持内存管理单元(MMU)操作,例如写入页表项(PTE)、刷新转换后备缓冲(TLB)以及释放页表(PT)。 + +3. ##### KVM_HC_FEATURES + +------------------ + +Architecture: PPC +Status: active +Purpose: 向客户机公开超级调用的可用性。在 x86 平台上,使用 cpuid 来列举可用的超级调用。在 PPC(PowerPC)上,可以使用基于设备树的查找(也是 EPAPR 规定的方式)或 KVM 特定的列举机制(即这个超级调用)。 + +4. ##### KVM_HC_PPC_MAP_MAGIC_PAGE + +---------------------------- + +Architecture: PPC +Status: active +Purpose:为了实现超级监视器与客户机之间的通信,存在一个共享页面,其中包含了监视器可见寄存器状态的部分。客户机可以通过使用此超级调用将这个共享页面映射,以通过内存访问其监视器寄存器。 + +5. 
##### KVM_HC_KICK_CPU + +------------------ + +Architecture: x86 +Status: active +Purpose: 用于唤醒处于 HLT(Halt)状态的 vCPU。 +Usage example: +一个使用了半虚拟化的客户机的虚拟 CPU,在内核模式下忙等待某个事件的发生(例如,自旋锁变为可用)时,如果其忙等待时间超过了一个阈值时间间隔,就可以执行 HLT 指令。执行 HLT 指令将导致 hypervisor 将虚拟 CPU 置于休眠状态,直到发生适当的事件。同一客户机的另一个虚拟 CPU 可以通过发出 KVM_HC_KICK_CPU 超级调用来唤醒正在睡眠的虚拟 CPU,指定要唤醒的虚拟 CPU 的 APIC ID(a1)。另外一个参数(a0)在这个超级调用中保留供将来使用。 + + +6. ##### KVM_HC_CLOCK_PAIRING + +----------------------- + +Architecture: x86 +Status: active +Purpose: 用于同步主机和客户机时钟。 + +Usage: +a0:客户机物理地址,主机会将 "struct kvm_clock_pairing" 结构复制到该地址。 + +a1:时钟类型,目前只支持 KVM_CLOCK_PAIRING_WALLCLOCK(0)(对应主机的 CLOCK_REALTIME 时钟)。 + +```c +struct kvm_clock_pairing { + __s64 sec; // 从 clock_type 时钟起的秒数。 + __s64 nsec; // 从 clock_type 时钟起的纳秒数。 + __u64 tsc; // 用于计算 sec/nsec 对的客户机 TSC(时间戳计数)值。 + __u32 flags; // 标志,目前未使用(为 0)。 + __u32 pad[9]; // 填充字段,目前未使用。 +}; +``` + +这个超级调用允许客户机在主机和客户机之间计算精确的时间戳。客户机可以使用返回的 TSC(时间戳计数)值,计算同一时刻其时钟对应的 CLOCK_REALTIME。 + +如果主机不使用 TSC 时钟源,或者时钟类型不同于 KVM_CLOCK_PAIRING_WALLCLOCK,则返回 KVM_EOPNOTSUPP。 + +7. ##### KVM_HC_SEND_IPI + +------------------ + +Architecture: x86 +Status: active +Purpose: 向多个 vCPU 发送 IPI。 + +- `a0`: 目标 APIC ID 位图的低位部分。 +- `a1`: 目标 APIC ID 位图的高位部分。 +- `a2`: 位图中最低的 APIC ID。 +- `a3`: 中断命令寄存器。 + +这个超级调用允许客户机发送组播处理器间中断(IPIs),每次调用最多可以有 128 个目标(在 64 位模式下)或者 64 个虚拟中央处理单元(vCPU)(在 32 位模式下)。目标由位图表示,位图包含在前两个参数中(a0 和 a1)。a0 的第 0 位对应于第三个参数 a2 中的 APIC ID,a0 的第 1 位对应于 a2+1 的 APIC ID,以此类推。 + +返回成功传递 IPIs 的 CPU 数量。 + +8. ##### KVM_HC_SCHED_YIELD + +--------------------- + +Architecture: x86 +Status: active +Purpose: 用于在目标vCPU被抢占时进行让步。 + +a0: destination APIC ID + +Usage example: 当向多个vCPU发送调用函数中断(call-function IPI)时,如果任何目标 vCPU 被抢占,进行让步。 + +9. ##### KVM_HC_MAP_GPA_RANGE + +------------------------- + +Architecture: x86 +Status: active +Purpose: 请求 KVM 映射一个具有指定属性的 GPA 范围。 + +`a0`: 起始页面的客户机物理地址 +`a1`: (4KB)页面的数量(在 GPA 空间中必须是连续的) +`a2`: 属性 + + 属性: + 位 3:0 - 首选页大小编码,0 = 4KB,1 = 2MB,2 = 1GB,以此类推...
+ 位 4 - 明文 = 0,加密 = 1 + 位 63:5 - 保留(必须为零) + +**实现注意事项** + +此超级调用通过 KVM_CAP_EXIT_HYPERCALL 能力在用户空间中实现。在向客户机 CPUID 中添加 KVM_FEATURE_HC_MAP_GPA_RANGE 之前,用户空间必须启用该能力。此外,如果客户机支持 KVM_FEATURE_MIGRATION_CONTROL,用户空间还必须设置一个 MSR 过滤器来处理对 MSR_KVM_MIGRATION_CONTROL 的写入。 + +可以通过如下查看发生的hypercall信息: + +``` +root@nans:/sys/kernel/debug/tracing/events/kvm# echo 0 > ../../tracing_on +root@nans:/sys/kernel/debug/tracing/events/kvm# echo 1 > kvm_hypercall/enable +root@nans:/sys/kernel/debug/tracing/events/kvm# echo 1 > ../../tracing_on +root@nans:/sys/kernel/debug/tracing/events/kvm# cat ../../trace_pipe +``` + +输出如下: + +![image-20240110125350965](https://gitee.com/nan-shuaibo/image/raw/master/202401101258714.png) + +使用ebpf技术统计hypercall信息: + +统计两秒内的每个hypercall发生的次数,和自客户机启动以来每个vcpu上发生的hypercall的次数 + +``` +------------------------------------------------------------------------ +TIME:16:22:05 +PID VCPU_ID NAME COUNTS HYPERCALLS +68453 4 KICK_CPU 1 0 +68453 2 KICK_CPU 1 0 +68453 1 SEND_IPI 6 5 +68453 0 SEND_IPI 7 7 +68453 7 KICK_CPU 1 0 +68453 0 KICK_CPU 1 0 +------------------------------------------------------------------------ +TIME:16:22:07 +PID VCPU_ID NAME COUNTS HYPERCALLS +68082 4 KICK_CPU 2 45 +68453 5 SEND_IPI 3 2 +68453 6 SCHED_YIELD 2 66 +68453 6 SEND_IPI 79 80 +68453 3 SEND_IPI 45 44 +68453 1 SEND_IPI 23 28 +68453 0 SEND_IPI 7 14 +68453 4 SEND_IPI 145 145 +``` + +并将详细信息输出至临时文件 + +![image-20240301162527679](https://gitee.com/nan-shuaibo/image/raw/master/202403011629545.png) + From 69b6efc3237eb454ca813a50f3f36fe456edd321 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 19:26:55 +0800 Subject: [PATCH 09/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=B3=A8=E9=87=8A?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 
300540f64..576f1ca23 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -380,9 +380,9 @@ static const struct argp_option opts[] = { "Monitor virtual machine dirty page information."}, {"kvmmmu_page_fault", 'f', NULL, 0, "Monitoring the data of kvmmmu page fault."}, - {"kvm_irqchip", 'c', NULL, 0, + {"kvm_irqchip(software)", 'c', NULL, 0, "Monitor the irqchip setting information in KVM VM."}, - {"irq_inject(x86)", 'i', NULL, 0, + {"irq_inject(hardware)", 'i', NULL, 0, "Monitor the virq injection information in KVM VM "}, {"hypercall", 'h', NULL, 0, "Monitor the hypercall information in KVM VM "}, {"stat", 's', NULL, 0, From 2db53ea03d51be0ad88d6f149f510743d5834fb0 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 1 Mar 2024 21:41:47 +0800 Subject: [PATCH 10/46] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=86=B2=E7=AA=81=20?= =?UTF-8?q?=E5=88=A0=E9=99=A4=E5=86=97=E4=BD=99=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/src/kvm_watcher.c | 99 +------------------ 1 file changed, 4 insertions(+), 95 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index cafd36491..089c045f5 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -151,89 +151,6 @@ const char *getExitReasonName(int number) { return "Unknown"; // 如果找不到对应的退出原因,返回一个默认值 } -typedef struct { - int exit_reason; - char info[256]; - unsigned long long total_dur; - unsigned long long avg_dur; -} ExitInfo; - -// 链表节点 -typedef struct Node { - ExitInfo data; - struct Node *next; -} Node; - -Node *exitInfoBuffer = NULL; - -void addExitInfo(Node **head, int exit_reason, const char *info, - unsigned long long dur, int count) { - Node *newNode = (Node *)malloc(sizeof(Node)); - newNode->data.exit_reason = exit_reason; - strncpy(newNode->data.info, info, 
sizeof(newNode->data.info)); - newNode->next = NULL; - newNode->data.total_dur = dur; - newNode->data.avg_dur = dur / count; - - // 检查是否已经存在相同 exit reason 的信息 - Node *current = *head; - Node *previous = NULL; - while (current != NULL) { - if (current->data.exit_reason == exit_reason) { - // 更新已存在的信息 - strncpy(current->data.info, info, sizeof(current->data.info)); - current->data.total_dur = dur + current->data.total_dur; - current->data.avg_dur = current->data.total_dur / count; - free(newNode); // 释放新节点,因为信息已经更新 - return; - } - previous = current; - current = current->next; - } - // 没有找到相同的 exit reason,将新节点添加到链表 - if (previous != NULL) { - previous->next = newNode; - } else { - *head = newNode; - } -} - -// 查找指定退出原因的信息 -const char *findExitInfo(Node *head, int exit_reason) { - Node *current = head; - while (current != NULL) { - if (current->data.exit_reason == exit_reason) { - return current->data.info; - } - current = current->next; - } - return NULL; -} - -// 释放链表 -void freeExitInfoList(Node *head) { - while (head != NULL) { - Node *temp = head; - head = head->next; - free(temp); - } -} -// 打印退出的信息 -void printExitInfo(Node *head) { - Node *current = head; - CLEAR_SCREEN(); - printf( - "-----------------------------------------------------------------" - "----------\n"); - printf("%-21s %-18s %-8s %-8s %-13s \n", "EXIT_REASON", "COMM", "PID", - "COUNT", "AVG_DURATION(us)"); - while (current != NULL) { - printf("%-2d/%-18s %-33s %-13.4f \n", current->data.exit_reason, - getExitReasonName(current->data.exit_reason), current->data.info, - NS_TO_US_WITH_DECIMAL(current->data.avg_dur)); - current = current->next; - } -} // 检查具有给定 PID 的进程是否存在 int doesVmProcessExist(pid_t pid) { char proc_name[256]; @@ -385,8 +302,6 @@ static const struct argp_option opts[] = { {"irq_inject(hardware)", 'i', NULL, 0, "Monitor the virq injection information in KVM VM "}, {"hypercall", 'h', NULL, 0, "Monitor the hypercall information in KVM VM "}, - {"stat", 's', NULL, 0, - "Display 
statistical data.(The -e option must be specified.)"}, {"mmio", 'm', NULL, 0, "Monitoring the data of mmio page fault.(The -f option must be " "specified.)"}, @@ -726,9 +641,6 @@ static int print_event_head(struct env *env) { "VAILD?"); break; case EXIT: - // printf("%-18s %-21s %-18s %-15s %-8s %-13s \n", "TIME(ms)", - // "EXIT_REASON", "COMM", "PID/TID", "COUNT", - // "DURATION(us)"); break; case HALT_POLL: printf("%-18s %-15s %-15s %-10s %-7s %-11s %-10s\n", "TIME(ms)", @@ -926,7 +838,7 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { return 0; } -void print_map_and_check_error(int (*print_func)(struct skel *skel), const char *map_name, int err) { +void print_map_and_check_error(int (*print_func)(struct kvm_watcher_bpf *), struct kvm_watcher_bpf *skel, const char *map_name, int err) { OUTPUT_INTERVAL(OUTPUT_INTERVAL_SECONDS); print_func(skel); if (err < 0) { @@ -1009,10 +921,10 @@ int main(int argc, char **argv) { err = ring_buffer__poll(rb, RING_BUFFER_TIMEOUT_MS /* timeout, ms */); if (env.execute_hypercall) { - print_map_and_check_error(print_hc_map, "hypercall", err); + print_map_and_check_error(print_hc_map, skel, "hypercall", err); } if (env.execute_exit) { - print_map_and_check_error(print_exit_map, "exit", err); + print_map_and_check_error(print_exit_map, skel, "exit", err); } /* Ctrl-C will cause -EINTR */ if (err == -EINTR) { @@ -1024,10 +936,7 @@ int main(int argc, char **argv) { break; } } - if (env.ShowStats) { - printExitInfo(exitInfoBuffer); - freeExitInfoList(exitInfoBuffer); - } else if (env.execute_mark_page_dirty) { + if (env.execute_mark_page_dirty) { err = save_count_dirtypagemap_to_file(skel->maps.count_dirty_map); if (err < 0) { printf("Save count dirty page map to file fail: %d\n", err); From 390e4ca39de90378c516e4bc5f080355100fa3a1 Mon Sep 17 00:00:00 2001 From: zmx Date: Fri, 8 Mar 2024 14:56:07 +0800 Subject: [PATCH 11/46] fix bug and trace drop packet --- .../net_watcher/netwatcher.bpf.c | 254 +++++++++++++++++- 
.../net_watcher/netwatcher.c | 113 +++++++- .../net_watcher/netwatcher.h | 22 ++ 3 files changed, 376 insertions(+), 13 deletions(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c index 3ae3a6591..383a0e81b 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c @@ -48,7 +48,17 @@ struct packet_tuple { unsigned int tran_flag; // 1:tcp 2:udp unsigned int len; }; +struct filtertime { + unsigned long long ip_rcv_time; + unsigned long long ip_local_deliver_time; + unsigned long long ip_local_deliver_finish_time; + unsigned long long ip__forward_time; + unsigned long long ip_local_out_time; + unsigned long long ip_output_time; + unsigned long long ip_finish_output_time; + unsigned long long ipv6_rcv_time; +}; // 操作BPF映射的一个辅助函数 static __always_inline void * //__always_inline强制内联 bpf_map_lookup_or_try_init(void *map, const void *key, const void *init) { @@ -89,6 +99,17 @@ struct { __uint(type, BPF_MAP_TYPE_RINGBUF); __uint(max_entries, 256 * 1024); } udp_rb SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} netfilter_rb SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024); +} kfree_rb SEC(".maps"); + // 存储每个tcp连接所对应的conn_t struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); @@ -113,10 +134,31 @@ struct { __type(value, struct packet_tuple); } pid_UDP SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, MAX_CONN *MAX_PACKET); + __type(key, int); + __type(value, struct packet_tuple); +} pid_filter SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, MAX_CONN *MAX_PACKET); + __type(key, u32); + __type(value, struct filtertime); +} netfilter_time SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_LRU_HASH); + __uint(max_entries, MAX_CONN 
*MAX_PACKET); + __type(key, int); + __type(value, struct packet_tuple); +} kfree SEC(".maps"); + const volatile int filter_dport = 0; const volatile int filter_sport = 0; const volatile int all_conn = 0, err_packet = 0, extra_conn_info = 0, - layer_time = 0, http_info = 0, retrans_info = 0, udp_info; + layer_time = 0, http_info = 0, retrans_info = 0, udp_info =0,net_filter = 0,kfree_info = 0; /* help macro */ @@ -1154,10 +1196,7 @@ int BPF_KPROBE(dev_hard_start_xmit, struct sk_buff *skb) { if (layer_time) { packet->tran_time = tinfo->ip_time - tinfo->tran_time; packet->ip_time = tinfo->mac_time - tinfo->ip_time; - packet->mac_time = - tinfo->qdisc_time - - tinfo - ->mac_time; // 队列纪律层,处于网络协议栈最底层,负责实际数据传输与接收 + packet->mac_time =tinfo->qdisc_time -tinfo->mac_time; // 队列纪律层,处于网络协议栈最底层,负责实际数据传输与接收 } packet->rx = 0; // 发送一个数据包 @@ -1327,6 +1366,9 @@ int BPF_KPROBE(ip_send_skb, struct net *net,struct sk_buff *skb) { if (tinfo == NULL) { return 0; } + + bpf_map_update_elem(&pid_filter, &pid, &pt, BPF_ANY); + struct udp_message *message; struct udp_message *udp_message = bpf_map_lookup_elem(×tamps, pt); @@ -1343,4 +1385,204 @@ int BPF_KPROBE(ip_send_skb, struct net *net,struct sk_buff *skb) { message->len=__bpf_ntohs(BPF_CORE_READ(udp,len)); bpf_ringbuf_submit(message, 0); return 0; -} \ No newline at end of file +} + +//netfilter +SEC("kprobe/ip_rcv") +int BPF_KPROBE(ip_rcv, struct sk_buff *skb) { + if (skb == NULL) // 判断是否为空 + return 0; + struct iphdr *ip = skb_to_iphdr(skb); + struct tcphdr *tcp = skb_to_tcphdr(skb); + struct packet_tuple pkt_tuple = {0}; + get_pkt_tuple(&pkt_tuple, ip, tcp); + unsigned int pid = bpf_get_current_pid_tgid(); + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime *)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + bpf_map_update_elem(&pid_filter, &pid, &pkt_tuple, BPF_ANY); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_rcv_time = bpf_ktime_get_ns() / 1000; + return 0; +} + +SEC("kprobe/ip_local_deliver") +int 
BPF_KPROBE(ip_local_deliver) { + unsigned int pid = bpf_get_current_pid_tgid(); + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime *)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_local_deliver_time = bpf_ktime_get_ns() / 1000; + return 0; +} + +SEC("kprobe/ip_local_deliver_finish") +int BPF_KPROBE(ip_local_deliver_finish) { + unsigned int pid = bpf_get_current_pid_tgid(); + struct packet_tuple *pkt_tuple = bpf_map_lookup_elem(&pid_filter, &pid); + if (!pkt_tuple) { + return 0; + } + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime *)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_local_deliver_finish_time = bpf_ktime_get_ns() / 1000; + + struct netfilter *message; + struct netfilter *netfilter =bpf_map_lookup_elem(&netfilter_time, pkt_tuple); + message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); + if (!message) { + return 0; + } + message->saddr = pkt_tuple->saddr; + message->daddr =pkt_tuple->daddr; + message->sport =pkt_tuple->sport; + message->dport = pkt_tuple->dport; + //message->local_input_time = tinfo->ip_local_deliver_finish_time - tinfo->ip_local_deliver_time; + // message->pre_routing_time = tinfo->ip_local_deliver_time - tinfo->ip_rcv_time; + message->flag=1;//收包 + bpf_ringbuf_submit(message, 0); + return 0; +} + +SEC("kprobe/ip_local_out") +int BPF_KPROBE(ip_local_out, struct sk_buff *skb) { + unsigned int pid = bpf_get_current_pid_tgid(); + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime *)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_local_out_time = bpf_ktime_get_ns() / 1000; + return 0; +} + +SEC("kprobe/ip_output") +int BPF_KPROBE(ip_output) { + unsigned int pid = bpf_get_current_pid_tgid(); + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime 
*)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_output_time = bpf_ktime_get_ns() / 1000; + return 0; +} + +SEC("kprobe/ip_finish_output") +int BPF_KPROBE(ip_finish_output) { + unsigned int pid = bpf_get_current_pid_tgid(); + struct packet_tuple *pkt_tuple = bpf_map_lookup_elem(&pid_filter, &pid); + if (!pkt_tuple) { + return 0; + } + struct filtertime *tinfo, zero = {0}; + tinfo = (struct filtertime *)bpf_map_lookup_or_try_init(&netfilter_time, + &pid, &zero); + if (tinfo == NULL) { + return 0; + } + tinfo->ip_finish_output_time = bpf_ktime_get_ns() / 1000; + struct netfilter *message; + //struct netfilter *netfilter =bpf_map_lookup_elem(&netfilter_time, pkt_tuple); + message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); + if(!message){ + return 0; + } + message->saddr = pkt_tuple->saddr; + message->daddr =pkt_tuple->daddr; + message->sport =pkt_tuple->sport; + message->dport = pkt_tuple->dport; + message->local_out_time=tinfo->ip_output_time-tinfo->ip_local_out_time; + message->post_routing_time=tinfo->ip_finish_output_time-tinfo->ip_output_time; + message->flag=2; + bpf_ringbuf_submit(message,0); + return 0; +} + +//drop +SEC("kprobe/kfree_skb_reason") +int BPF_KPROBE(kfree_skb_reason,struct sk_buff *skb, enum skb_drop_reason reason) { + if(!kfree_info) + return 0; + if (skb == NULL) // 判断是否为空 + return 0; + struct iphdr *ip = skb_to_iphdr(skb); + struct udphdr *udp = skb_to_udphdr(skb); + struct packet_tuple pkt_tuple = {0}; + get_udp_pkt_tuple(&pkt_tuple, ip, udp); + struct reasonissue *message; + message = bpf_ringbuf_reserve(&kfree_rb, sizeof(*message), 0); + if(!message){ + return 0; + } + message->saddr = pkt_tuple.saddr; + message->daddr = pkt_tuple.daddr; + message->sport = pkt_tuple.sport; + message->dport = pkt_tuple.dport; + message->drop_reason = reason; + bpf_ringbuf_submit(message,0); + return 0; +} +//icmp +// SEC("kprobe/__icmp_send") +// int BPF_KPROBE(__icmp_send,struct 
sk_buff *skb_in){ +// bpf_printk("111111"); +// if (skb_in== NULL) // 判断是否为空 +// return 0; +// struct iphdr *ip = skb_to_iphdr(skb_in); +// struct udphdr *udp = skb_to_udphdr(skb_in); +// struct packet_tuple pkt_tuple = {0}; +// get_udp_pkt_tuple(&pkt_tuple, ip, udp); +// bpf_printk("%d %d",pkt_tuple.saddr,pkt_tuple.daddr); +// struct time_icmp *tinfo; +// //tinfo = (struct time_icmp *)bpf_map_lookup_elem(&icmp_time,&pkt_tuple); +// //tinfo->icmp_start_time = bpf_ktime_get_ns() / 1000; +// return 0; +// } + +// SEC("kprobe/icmp_rcv") +// int BPF_KPROBE(icmp_rcv,struct sk_buff *skb){ +// bpf_printk("2222222 pid:%d ",bpf_get_current_pid_tgid()); +// if (skb== NULL) // 判断是否为空 +// return 0; +// struct iphdr *ip = skb_to_iphdr(skb); +// struct udphdr *udp = skb_to_udphdr(skb); +// struct packet_tuple pkt_tuple = {0}; +// get_udp_pkt_tuple(&pkt_tuple, ip, udp); +// //bpf_printk("%s %s",inet_ntop(AF_INET, &saddr, s_str, sizeof(s_str)),inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str))); + +// //struct time_icmp *tinfo; +// // tinfo = (struct time_icmp *)bpf_map_lookup_elem(&icmp_time,&pkt_tuple); +// // if (tinfo == NULL) { +// // return 0; +// // } +// // tinfo->icmp_end_time = bpf_ktime_get_ns() / 1000; +// // struct icmptime *message; +// // message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); +// // if(!message){ +// // return 0; +// // } +// // message->saddr = pkt_tuple.saddr; +// // message->daddr =pkt_tuple.daddr; +// // message->sport =pkt_tuple.sport; +// // message->dport = pkt_tuple.dport; +// // message->icmp_tran_time = tinfo->icmp_end_time-tinfo->icmp_start_time; +// // bpf_ringbuf_submit(message,0); +// return 0; +// } +// SEC("kretprobe/icmp_rcv") +// int BPF_KPROBE(icmp_rcv_ret){ +// bpf_printk("33333 pid:%d ",bpf_get_current_pid_tgid()); +// return 0; +// } \ No newline at end of file diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c index 
e19881110..812a02897 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c @@ -39,7 +39,7 @@ static char udp_file_path[1024]; static int sport = 0, dport = 0; // for filter static int all_conn = 0, err_packet = 0, extra_conn_info = 0, layer_time = 0, - http_info = 0, retrans_info = 0, udp_info; // flag + http_info = 0, retrans_info = 0, udp_info = 0,net_filter = 0,kfree_info = 0; // flag static const char argp_program_doc[] = "Watch tcp/ip in network subsystem \n"; @@ -53,6 +53,8 @@ static const struct argp_option opts[] = { {"sport", 's', "SPORT", 0, "trace this source port only"}, {"dport", 'd', "DPORT", 0, "trace this destination port only"}, {"udp", 'u', 0, 0, "trace the udp message"}, + {"net_filter",'n',0,0,"trace ipv4 packget filter "}, + {"kfree_info",'k',0,0,"trace kfree "}, {}}; static error_t parse_arg(int key, char *arg, struct argp_state *state) { @@ -85,6 +87,12 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { case 'u': udp_info = 1; break; + case 'n': + net_filter = 1; + break; + case 'k': + kfree_info = 1; + break; default: return ARGP_ERR_UNKNOWN; } @@ -200,7 +208,7 @@ static int print_conns(struct netwatcher_bpf *skel) { } static int print_packet(void *ctx, void *packet_info, size_t size) { - if (udp_info) + if (udp_info || net_filter || kfree_info) return 0; const struct pack_t *pack_info = packet_info; if (pack_info->err) { @@ -299,6 +307,71 @@ static int print_udp(void *ctx, void *packet_info, size_t size) { fclose(file); return 0; } +static int print_netfilter(void *ctx, void *packet_info, size_t size) { + if(!net_filter) + return 0; + char d_str[INET_ADDRSTRLEN]; + char s_str[INET_ADDRSTRLEN]; + const struct netfilter *pack_info = packet_info; + unsigned int saddr = pack_info->saddr; + unsigned int daddr = pack_info->daddr; + printf("%-20s %-20s %-20u %-20u %-20llu %-20llu %-20llu %-20llu %-20d\n", + inet_ntop(AF_INET, &saddr, 
s_str, sizeof(s_str)), + inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str)), + pack_info->sport,pack_info->dport,pack_info->local_input_time,pack_info->pre_routing_time,pack_info->local_out_time, + pack_info->post_routing_time,pack_info->flag); + return 0; +} + +static int print_kfree(void *ctx, void *packet_info, size_t size) { + if(!kfree_info) + return 0; + char d_str[INET_ADDRSTRLEN]; + char s_str[INET_ADDRSTRLEN]; + const struct reasonissue *pack_info = packet_info; + unsigned int saddr = pack_info->saddr; + unsigned int daddr = pack_info->daddr; + if(saddr == 0 && daddr ==0 ) + { + return 0; + } + printf("%-25s %-25s %-25u %-25u", + inet_ntop(AF_INET, &saddr, s_str, sizeof(s_str)), + inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str)), pack_info->sport,pack_info->dport); + switch (pack_info->drop_reason) { + case 0: + printf("SKB_NOT_DROPPED_YET"); + break; + case 1: + printf("SKB_CONSUMED"); + break; + case 2: + printf("SKB_DROP_REASON_NOT_SPECIFIED"); + break; + case 3: + printf("SKB_DROP_REASON_NO_SOCKET"); + break; + case 4: + printf("SKB_DROP_REASON_PKT_TOO_SMALL"); + break; + case 5: + printf("SKB_DROP_REASON_TCP_CSUM"); + break; + case 6: + printf("SKB_DROP_REASON_SOCKET_FILTER"); + break; + case 7: + printf("SKB_DROP_REASON_UDP_CSUM"); + break; + case 8: + printf("SKB_DROP_REASON_NETFILTER_DROP"); + break; + default: + printf("Unknown SKB Drop Reason"); + } + printf("\n"); + return 0; +} int main(int argc, char **argv) { char *last_slash = strrchr(argv[0], '/'); if (last_slash) { @@ -314,6 +387,8 @@ int main(int argc, char **argv) { strcat(udp_file_path,"data/udp.log"); struct ring_buffer *rb = NULL; struct ring_buffer *udp_rb = NULL; + struct ring_buffer *netfilter_rb = NULL; + struct ring_buffer *kfree_rb = NULL; struct netwatcher_bpf *skel; int err; /* Parse command line arguments */ @@ -344,6 +419,8 @@ int main(int argc, char **argv) { skel->rodata->http_info = http_info; skel->rodata->retrans_info = retrans_info; skel->rodata->udp_info = udp_info; + 
skel->rodata->net_filter = net_filter; + skel->rodata->kfree_info = kfree_info; err = netwatcher_bpf__load(skel); if (err) { @@ -357,20 +434,40 @@ int main(int argc, char **argv) { fprintf(stderr, "Failed to attach BPF skeleton\n"); goto cleanup; } - if (!udp_info) { - printf("%-22s %-10s %-10s %-10s %-10s %-10s %-5s %s\n", "SOCK", "SEQ", - "ACK", "MAC_TIME", "IP_TIME", "TRAN_TIME", "RX", "HTTP"); - } if (udp_info) { printf("%-20s %-20s %-20s %-20s %-20s %-20s %-20s\n", "saddr", "daddr", "sprot", "dprot", "udp_time","rx","len"); } + else if(net_filter) + { + printf("%-20s %-20s %-20s %-20s %-20s %-20s %-20s %-20s %-20s\n", "saddr", "daddr","dprot", "sprot","local_input","pre_routing","local_out","post_routing","flag"); + } + else if(kfree_info) + { + printf("%-25s %-25s %-25s %-25s %-25s\n", "saddr", "daddr","sprot", "dprot","reason"); + } + else{ + printf("%-22s %-10s %-10s %-10s %-10s %-10s %-5s %s\n", "SOCK", "SEQ", + "ACK", "MAC_TIME", "IP_TIME", "TRAN_TIME", "RX", "HTTP"); + } udp_rb =ring_buffer__new(bpf_map__fd(skel->maps.udp_rb), print_udp, NULL, NULL); if (!udp_rb) { err = -1; fprintf(stderr, "Failed to create ring buffer\n"); goto cleanup; } + netfilter_rb =ring_buffer__new(bpf_map__fd(skel->maps.netfilter_rb), print_netfilter, NULL, NULL); + if (!netfilter_rb) { + err = -1; + fprintf(stderr, "Failed to create ring buffer\n"); + goto cleanup; + } + kfree_rb =ring_buffer__new(bpf_map__fd(skel->maps.kfree_rb), print_kfree, NULL, NULL); + if (!kfree_rb) { + err = -1; + fprintf(stderr, "Failed to create ring buffer\n"); + goto cleanup; + } /* Set up ring buffer polling */ rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), print_packet, NULL, NULL); if (!rb) { @@ -401,6 +498,8 @@ int main(int argc, char **argv) { while (!exiting) { err = ring_buffer__poll(rb, 100 /* timeout, ms */); err = ring_buffer__poll(udp_rb, 100 /* timeout, ms */); + err = ring_buffer__poll(netfilter_rb, 100 /* timeout, ms */); + err = ring_buffer__poll(kfree_rb, 100 /* timeout, ms */); 
print_conns(skel); sleep(1); /* Ctrl-C will cause -EINTR */ @@ -417,4 +516,4 @@ int main(int argc, char **argv) { cleanup: netwatcher_bpf__destroy(skel); return err < 0 ? -err : 0; -} +} \ No newline at end of file diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h index a0902679f..353cf94e3 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h @@ -94,4 +94,26 @@ struct udp_message { int rx; int len; }; +struct netfilter +{ + unsigned int saddr; + unsigned int daddr; + unsigned short sport; + unsigned short dport; + unsigned long long local_input_time; + unsigned long long pre_routing_time; + unsigned long long forward_time; + unsigned long long local_out_time; + unsigned long long post_routing_time; + unsigned int flag; +}; +struct reasonissue +{ + unsigned int saddr; + unsigned int daddr; + unsigned short sport; + unsigned short dport; + int drop_reason; +}; + #endif /* __NETWATCHER_H */ \ No newline at end of file From 3fd90e61d54303c19ffee2abee432a217815f858 Mon Sep 17 00:00:00 2001 From: zmx Date: Fri, 8 Mar 2024 15:31:16 +0800 Subject: [PATCH 12/46] 111 --- .../Network_Subsystem/net_watcher/netwatcher.bpf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c index 383a0e81b..1a13bb87f 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c @@ -1437,7 +1437,6 @@ int BPF_KPROBE(ip_local_deliver_finish) { tinfo->ip_local_deliver_finish_time = bpf_ktime_get_ns() / 1000; struct netfilter *message; - struct netfilter *netfilter =bpf_map_lookup_elem(&netfilter_time, pkt_tuple); message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); if (!message) { 
return 0; @@ -1446,8 +1445,8 @@ int BPF_KPROBE(ip_local_deliver_finish) { message->daddr =pkt_tuple->daddr; message->sport =pkt_tuple->sport; message->dport = pkt_tuple->dport; - //message->local_input_time = tinfo->ip_local_deliver_finish_time - tinfo->ip_local_deliver_time; - // message->pre_routing_time = tinfo->ip_local_deliver_time - tinfo->ip_rcv_time; + message->local_input_time = tinfo->ip_local_deliver_finish_time - tinfo->ip_local_deliver_time; + message->pre_routing_time = tinfo->ip_local_deliver_time - tinfo->ip_rcv_time; message->flag=1;//收包 bpf_ringbuf_submit(message, 0); return 0; @@ -1494,7 +1493,6 @@ int BPF_KPROBE(ip_finish_output) { } tinfo->ip_finish_output_time = bpf_ktime_get_ns() / 1000; struct netfilter *message; - //struct netfilter *netfilter =bpf_map_lookup_elem(&netfilter_time, pkt_tuple); message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); if(!message){ return 0; From 490ba262a6847ae01e0f854de8c61db233ca269b Mon Sep 17 00:00:00 2001 From: Lzx Date: Fri, 8 Mar 2024 00:23:48 -0800 Subject: [PATCH 13/46] add ion --- .../mem_watcher/page_fault/ion/Makefile | 114 ++++++++++++++++++ .../page_fault/ion/ion_monitor.bpf.c | 63 ++++++++++ .../mem_watcher/page_fault/ion/ion_monitor.c | 57 +++++++++ .../mem_watcher/page_fault/ion/ion_monitor.h | 10 ++ 4 files changed, 244 insertions(+) create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/ion/Makefile create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.bpf.c create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.c create mode 100644 eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.h diff --git a/eBPF_Supermarket/mem_watcher/page_fault/ion/Makefile b/eBPF_Supermarket/mem_watcher/page_fault/ion/Makefile new file mode 100644 index 000000000..e9e8ef726 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/page_fault/ion/Makefile @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang 
+LIBBPF_SRC := $(abspath ../libbpf-bootstrap/libbpf/src) +BPFTOOL_SRC := $(abspath ../libbpf-bootstrap/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool + +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../libbpf-bootstrap/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = ion_monitor + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - &1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +# Build application binary 
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: + diff --git a/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.bpf.c b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.bpf.c new file mode 100644 index 000000000..e91936618 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.bpf.c @@ -0,0 +1,63 @@ +#include "vmlinux.h" +#include +#include +#include +#include "ion_monitor.h" + +#define INTERVAL_MAX 6U + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, unsigned); + __type(value, u64); +} count_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, u32); + __type(value, u64); +} time_map SEC(".maps"); + +SEC("kprobe/ion_alloc") +int bpf_prog1(void *ctx) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 time = bpf_ktime_get_ns(); + u64 ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&time_map, &pid, &ts, BPF_ANY); + + return 0; +} + +SEC("kprobe/ion_ioctl") +int bpf_prog2(void *ctx) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 tm = bpf_ktime_get_ns(); + + u64 *tsp = bpf_map_lookup_elem(&time_map, &pid); + if (tsp) + tm -= *tsp; + else + return -1; + + unsigned key = tm / 10000000;//10ms为区间单位 + if (key > INTERVAL_MAX - 1) + key = INTERVAL_MAX - 1; + u64 *value = bpf_map_lookup_elem(&count_map,&key); + if (value) { + *value += 1; + } else { + u64 init_value = 1; + bpf_map_update_elem(&count_map, &key, &init_value, BPF_ANY); + } + + bpf_map_delete_elem(&time_map, &pid); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.c b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.c new file mode 100644 index 
000000000..d22a7e004 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include +//#include +#include "ion_monitor.h" +#include "ion_monitor.skel.h" +#include + + +/* Number of 10ms latency buckets; keep in sync with INTERVAL_MAX in ion_monitor.bpf.c. */ +#define INTERVAL_MAX 6U +int main(int argc, char **argv) +{ + /* + * Load and attach the BPF programs, then print every 10 seconds the + * per-bucket latency counts the kernel side accumulates in count_map. + */ + struct ion_monitor_bpf *skel = ion_monitor_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + if (ion_monitor_bpf__attach(skel)) { + fprintf(stderr, "Failed to attach BPF skeleton\n"); + ion_monitor_bpf__destroy(skel); + return 1; + } + + int fd = bpf_map__fd(skel->maps.count_map); /* time_map only holds per-pid start timestamps */ + int key; + + for(;;) { + sleep(10); + + for (key = 0; key < INTERVAL_MAX; key++) { + unsigned long long value = 0; + bpf_map_lookup_elem(fd, &key, &value); + if (key < INTERVAL_MAX - 1) + printf("Range %dms - %dms\tCount:%llu\n", + key * 10, (key + 1) * 10, value); + else + printf("Over 50ms\t\tCount:%llu\n", value); + } + + printf("==========================================\n"); + } + + return 0; +} + + + + diff --git a/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.h b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.h new file mode 100644 index 000000000..e6712713e --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/page_fault/ion/ion_monitor.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2022 Jacky Yin */ +#ifndef __ION_MONITOR_H +#define __ION_MONITOR_H + + + + + +#endif /* __ION_MONITOR_H */ From 599d808a08bfb285d3f50af9aad3782c9d6caa1a Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 16:42:30 +0800 Subject: [PATCH 14/46] =?UTF-8?q?=E6=A2=B3=E7=90=86kvm=20ept=20page=20faul?= =?UTF-8?q?t=E5=A4=84=E7=90=86=E6=B5=81=E7=A8=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md | 116 ++++++++++++++++++ 
.../kvm_watcher/include/kvm_mmu.h | 36 +++--- 2 files changed, 138 insertions(+), 14 deletions(-) create mode 100644 eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md new file mode 100644 index 000000000..9c23ca138 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md @@ -0,0 +1,116 @@ + + +vm exit(EPT_VIOLATION)处理流程: + +``` +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +... + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, +... +}; + +``` + +```c +vmx_handle_exit() { + # 处理 VMX(虚拟机扩展)退出的主要函数 + __vmx_handle_exit() { + handle_ept_violation() { + # 处理 EPT(扩展页表)违规的函数 + kvm_mmu_page_fault() { + # 处理 KVM MMU(内存管理单元)页错误 + kvm_tdp_page_fault() { + # 处理 TDP(二维分页,Two-Dimensional Paging)页错误 + kvm_arch_has_noncoherent_dma(); + direct_page_fault() { + # 处理直接页错误 + kvm_vcpu_gfn_to_memslot(); + page_fault_handle_page_track(); + fast_page_fault(); + mmu_topup_memory_caches() { + # 增加内存缓存 + kvm_mmu_topup_memory_cache() { + __kvm_mmu_topup_memory_cache(); + } + kvm_mmu_topup_memory_cache() { + __kvm_mmu_topup_memory_cache(); + } + kvm_mmu_topup_memory_cache() { + __kvm_mmu_topup_memory_cache(); + } + } + kvm_faultin_pfn() { + # 处理 KVM PFN(物理帧号)故障 + __gfn_to_pfn_memslot() { + # 将 GFN(客户机页帧号,Guest Frame Number)转换为 PFN(物理帧号)并获取内存插槽 + hva_to_pfn() { + # 将 HVA(主机虚拟地址)转换为 PFN(物理帧号) + get_user_pages_fast_only() { + # 快速获取用户页 + internal_get_user_pages_fast() { + # 内部快速获取用户页 + lockless_pages_from_mm() { + # 从内存管理结构中获取无锁页 + gup_pgd_range() { + # 获取页表项(PGD)范围 + pud_huge(); + gup_pmd_range.constprop.0() { + # 获取中间页表项(PMD)范围 + gup_huge_pmd() { + # 获取巨大页面的中间页表项 + try_grab_folio(); + } + } + } + } + } + } + } + } + } + handle_abnormal_pfn(); + _raw_read_lock(); + is_page_fault_stale(); + kvm_tdp_mmu_map() { + # TDP MMU 映射 + kvm_mmu_hugepage_adjust() { + # 调整 KVM MMU 巨大页面 + kvm_mmu_max_mapping_level() { + # 获取 KVM MMU 最大映射级别 + host_pfn_mapping_level(); + } + } + __rcu_read_lock(); + tdp_iter_start() { + # TDP 
迭代器开始 + tdp_iter_restart() { + # TDP 迭代器重新启动 + tdp_iter_refresh_sptep(); + } + } + disallowed_hugepage_adjust(); + tdp_iter_next() { + # TDP 迭代器下一个 + tdp_iter_refresh_sptep(); + } + disallowed_hugepage_adjust(); + tdp_iter_next() { + # TDP 迭代器下一个 + tdp_iter_refresh_sptep(); + } + } + tdp_iter_next() { + # TDP 迭代器下一个 + tdp_iter_refresh_sptep(); + } + } + disallowed_hugepage_adjust(); + } + } + } + } +} + +``` + +![kvm-init-mmu](https://gitee.com/nan-shuaibo/image/raw/master/202403081421893.png) \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h index 580508f45..52d5b4fa0 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h @@ -25,6 +25,8 @@ #include #include +#define PAGE_SHIFT 12 + struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 8192); @@ -56,9 +58,9 @@ static int trace_page_fault(struct page_fault *ctx, pid_t vm_pid) { return 0; } -static int trace_direct_page_fault(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault, void *rb, - struct common_event *e) { +static int trace_tdp_page_fault(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, void *rb, + struct common_event *e) { u64 addr; bpf_probe_read_kernel(&addr, sizeof(u64), &fault->addr); u64 *ts; @@ -81,7 +83,6 @@ static int trace_direct_page_fault(struct kvm_vcpu *vcpu, if (count) { (*count)++; e->page_fault_data.count = *count; - bpf_map_update_elem(&pf_count, &addr, count, BPF_ANY); } else { e->page_fault_data.count = 1; bpf_map_update_elem(&pf_count, &addr, &new_count, BPF_ANY); @@ -104,36 +105,43 @@ static int trace_kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, CHECK_PID(vm_pid); if (error_code & PFERR_RSVD_MASK) { u64 ts = bpf_ktime_get_ns(); - u64 addr = cr2_or_gpa; - bpf_map_update_elem(&pf_delay, &addr, &ts, BPF_ANY); + u64 gfn = cr2_or_gpa >> PAGE_SHIFT; + bpf_map_update_elem(&pf_delay, &gfn, &ts, BPF_ANY); } return 0; } 
-static int trace_handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - bool direct, void *rb, +struct mmio_page_fault { + u64 pad; + u64 addr; + gfn_t gfn; + unsigned access; +}; + +static int trace_handle_mmio_page_fault(struct mmio_page_fault *ctx, void *rb, struct common_event *e) { u64 *ts; - ts = bpf_map_lookup_elem(&pf_delay, &addr); + u64 gfn; + bpf_probe_read_kernel(&gfn, sizeof(u64), &ctx->gfn); + ts = bpf_map_lookup_elem(&pf_delay, &gfn); if (!ts) { return 0; } u32 *count; u32 new_count = 1; u64 delay = bpf_ktime_get_ns() - *ts; - bpf_map_delete_elem(&pf_delay, &addr); + bpf_map_delete_elem(&pf_delay, &gfn); RESERVE_RINGBUF_ENTRY(rb, e); - count = bpf_map_lookup_elem(&pf_count, &addr); + count = bpf_map_lookup_elem(&pf_count, &gfn); if (count) { (*count)++; e->page_fault_data.count = *count; - bpf_map_update_elem(&pf_count, &addr, count, BPF_ANY); } else { e->page_fault_data.count = 1; - bpf_map_update_elem(&pf_count, &addr, &new_count, BPF_ANY); + bpf_map_update_elem(&pf_count, &gfn, &new_count, BPF_ANY); } e->page_fault_data.delay = delay; - e->page_fault_data.addr = addr; + e->page_fault_data.addr = gfn; e->page_fault_data.error_code = PFERR_RSVD_MASK; e->process.pid = bpf_get_current_pid_tgid() >> 32; bpf_get_current_comm(&e->process.comm, sizeof(e->process.comm)); From 9fb6314b9165c346061a5b161c8179278a10c4fb Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 16:47:32 +0800 Subject: [PATCH 15/46] =?UTF-8?q?=E6=9B=B4=E6=94=B9tdp=20page=20fault=20?= =?UTF-8?q?=EF=BC=8C=E5=92=8Cmmio=20page=20fault=20=EF=BC=8C=E7=A1=AE?= =?UTF-8?q?=E4=BF=9D=E6=89=80=E7=BB=9F=E8=AE=A1=E7=9A=84=E6=8C=87=E6=A0=87?= =?UTF-8?q?=E5=AE=8C=E6=95=B4=E6=9C=89=E6=95=88=EF=BC=8C=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?ioctl=E7=9A=84tp=EF=BC=8C=E5=90=8E=E6=9C=9F=E6=96=B9=E4=BE=BF?= =?UTF-8?q?=E6=8F=90=E5=8F=96kvm=20ioctl=E7=9B=B8=E5=85=B3=E4=BF=A1?= =?UTF-8?q?=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../kvm_watcher/src/kvm_watcher.bpf.c | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c index 9c32befae..406e5d273 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c @@ -20,12 +20,13 @@ #include #include #include +#include "../include/kvm_watcher.h" #include "../include/kvm_exits.h" #include "../include/kvm_vcpu.h" #include "../include/kvm_mmu.h" #include "../include/kvm_irq.h" #include "../include/kvm_hypercall.h" -#include "../include/kvm_watcher.h" +#include "../include/kvm_ioctl.h" char LICENSE[] SEC("license") = "Dual BSD/GPL"; @@ -75,10 +76,10 @@ int tp_page_fault(struct page_fault *ctx) { return trace_page_fault(ctx, vm_pid); } -SEC("fexit/direct_page_fault") -int BPF_PROG(fexit_direct_page_fault, struct kvm_vcpu *vcpu, +SEC("fexit/kvm_tdp_page_fault") +int BPF_PROG(fexit_tdp_page_fault, struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - return trace_direct_page_fault(vcpu, fault, &rb, e); + return trace_tdp_page_fault(vcpu, fault, &rb, e); } SEC("fentry/kvm_mmu_page_fault") @@ -87,10 +88,9 @@ int BPF_PROG(fentry_kvm_mmu_page_fault, struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, return trace_kvm_mmu_page_fault(vcpu, cr2_or_gpa, error_code, vm_pid); } -SEC("fexit/handle_mmio_page_fault") -int BPF_PROG(fexit_handle_mmio_page_fault, struct kvm_vcpu *vcpu, u64 addr, - bool direct) { - return trace_handle_mmio_page_fault(vcpu, addr, direct, &rb, e); +SEC("tp/kvmmmu/handle_mmio_page_fault") +int tp_handle_mmio_page_fault(struct mmio_page_fault *ctx) { + return trace_handle_mmio_page_fault(ctx, &rb, e); } SEC("fentry/kvm_pic_set_irq") @@ -145,3 +145,8 @@ SEC("fentry/kvm_emulate_hypercall") int BPF_PROG(fentry_emulate_hypercall, struct kvm_vcpu *vcpu) { return entry_emulate_hypercall(vcpu, &rb, e, vm_pid); } + +SEC("tracepoint/syscalls/sys_enter_ioctl") +int 
tp_ioctl(struct trace_event_raw_sys_enter *args) { + return trace_kvm_ioctl(args); +} \ No newline at end of file From b17bd776a23c2a19d9c154197352fe7027b31d50 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 16:50:46 +0800 Subject: [PATCH 16/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=86=85=E6=A0=B8?= =?UTF-8?q?=E6=80=81kvm=20ioctl=E6=8F=90=E5=8F=96=E6=A1=86=E6=9E=B6?= =?UTF-8?q?=EF=BC=8C=E7=9B=AE=E5=89=8D=E5=8F=AA=E7=BB=9F=E8=AE=A1=E9=83=A8?= =?UTF-8?q?=E5=88=86ioctl=E4=BF=A1=E6=81=AF=EF=BC=8C=E5=90=8E=E6=9C=9F?= =?UTF-8?q?=E8=AE=A1=E5=88=92=E5=AE=8C=E5=96=84=E7=A8=8B=E5=BA=8F=E5=B9=B6?= =?UTF-8?q?=E5=AF=B9=E6=95=B0=E6=8D=AE=E8=BF=9B=E8=A1=8C=E5=A4=84=E7=90=86?= =?UTF-8?q?=E8=BE=93=E5=87=BA=E5=88=B0=E7=94=A8=E6=88=B7=E6=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/include/kvm_ioctl.h | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h new file mode 100644 index 000000000..8c4b73243 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -0,0 +1,110 @@ +// Copyright 2023 The LMP Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/linuxkerneltravel/lmp/blob/develop/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// author: nanshuaibo811@163.com +// +// Kernel space BPF program used for KVM ioctl + +#ifndef __KVM_IOCTL_H +#define __KVM_IOCTL_H + +#include "kvm_watcher.h" +#include "vmlinux.h" +#include +#include +#include +#include + +#define KVMIO 0xAE +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +#define KVM_SET_USER_MEMORY_REGION \ + _IOW(KVMIO, 0x46, struct kvm_userspace_memory_region) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) + +static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { + int fd = (int)args->args[0]; + unsigned int cmd = (unsigned int)args->args[1]; + unsigned long arg = (unsigned long)args->args[2]; + switch (cmd) { + case KVM_CREATE_VM: + bpf_printk("KVM_CREATE_VM: fd=%d\n", fd); + break; + case KVM_CREATE_VCPU: { + int vcpu_id; + bpf_probe_read(&vcpu_id, sizeof(vcpu_id), (void *)arg); + bpf_printk("KVM_CREATE_VCPU: fd=%d, vcpu_id=%d\n", fd, vcpu_id); + break; + } + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region region; + bpf_probe_read(&region, sizeof(region), (void *)arg); + // 打印或处理 region 数据 + bpf_printk( + "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, flags=%u, " + "guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", + fd, region.slot, region.flags, region.guest_phys_addr, + region.memory_size / 1024,region.userspace_addr); + break; + } + case KVM_GET_VCPU_EVENTS: + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + bpf_probe_read(&events, sizeof(events), (void *)arg); + // 打印或处理 events 数据 + bpf_printk( + "KVM_SET/GET_VCPU_EVENTS: fd=%d, exception=%u, interrupt=%u\n", + fd, 
events.exception.nr, events.interrupt.nr); + break; + } + case KVM_GET_REGS: + case KVM_SET_REGS: { + struct kvm_regs regs; + bpf_probe_read(&regs, sizeof(regs), (void *)arg); + // 此处仅展示部分寄存器值的打印 + bpf_printk( + "KVM_GET/SET_REGS: fd=%d, rax=%llx, rbx=%llx, rcx=%llx, " + "rdx=%llx, rsi=%llx\n", + fd, regs.rax, regs.rbx, regs.rcx, regs.rdx, regs.rsi); + + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + bpf_probe_read(&tr, sizeof(tr), (void *)arg); + bpf_printk( + "KVM_TRANSLATE: fd=%d,linear_address=%llx, " + "physical_address=%llx\n", + fd, tr.linear_address, tr.physical_address); + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + bpf_probe_read(&irq, sizeof(irq), (void *)arg); + bpf_printk("KVM_INTERRUPT:fd=%d,interrupt vector:%d\n", fd, + irq.irq); + break; + } + default: + break; + } + return 0; +} + +#endif /* __KVM_IOCTL_H */ \ No newline at end of file From 7b49dc49457962d0aa91ae484933be72b3c1ca12 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 16:51:02 +0800 Subject: [PATCH 17/46] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/include/kvm_exits.h | 2 +- eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h b/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h index 4171c89e8..214bf96cf 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h @@ -58,7 +58,7 @@ static int trace_kvm_exit(struct exit *ctx, pid_t vm_pid) { CHECK_PID(vm_pid); u32 reason; reason = (u32)ctx->exit_reason; - //如果是节能停止退出,就不采集数据 + // 如果是节能停止退出,就不采集数据 if (reason == EXIT_REASON_HLT) { return 0; } diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h index 3a8a8df61..5eace1a93 100644 
--- a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h @@ -28,10 +28,7 @@ #define NS_TO_US_WITH_DECIMAL(ns) ((double)(ns) / NS_TO_US_FACTOR) #define NS_TO_MS_WITH_DECIMAL(ns) ((double)(ns) / NS_TO_MS_FACTOR) -#define MICROSECONDS_IN_SECOND 1000000 -#define OUTPUT_INTERVAL_SECONDS 2 - -#define OUTPUT_INTERVAL(us) usleep((__u32)(us * MICROSECONDS_IN_SECOND)) +#define OUTPUT_INTERVAL(SECONDS) sleep(SECONDS) #define OPTIONS_LIST "-w, -p, -d, -f, -c, -i, ,-h or -e" @@ -147,6 +144,7 @@ enum EventType { IRQCHIP, IRQ_INJECT, HYPERCALL, + IOCTL, } event_type; struct common_event { From 218b21fd6b67a227a634fe8f5c0a591872fb60e2 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 17:14:26 +0800 Subject: [PATCH 18/46] =?UTF-8?q?Revert=20"=E6=B7=BB=E5=8A=A0=E5=86=85?= =?UTF-8?q?=E6=A0=B8=E6=80=81kvm=20ioctl=E6=8F=90=E5=8F=96=E6=A1=86?= =?UTF-8?q?=E6=9E=B6=EF=BC=8C=E7=9B=AE=E5=89=8D=E5=8F=AA=E7=BB=9F=E8=AE=A1?= =?UTF-8?q?=E9=83=A8=E5=88=86ioctl=E4=BF=A1=E6=81=AF=EF=BC=8C=E5=90=8E?= =?UTF-8?q?=E6=9C=9F=E8=AE=A1=E5=88=92=E5=AE=8C=E5=96=84=E7=A8=8B=E5=BA=8F?= =?UTF-8?q?=E5=B9=B6=E5=AF=B9=E6=95=B0=E6=8D=AE=E8=BF=9B=E8=A1=8C=E5=A4=84?= =?UTF-8?q?=E7=90=86=E8=BE=93=E5=87=BA=E5=88=B0=E7=94=A8=E6=88=B7=E6=80=81?= =?UTF-8?q?"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b17bd776a23c2a19d9c154197352fe7027b31d50. --- .../kvm_watcher/include/kvm_ioctl.h | 110 ------------------ 1 file changed, 110 deletions(-) delete mode 100644 eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h deleted file mode 100644 index 8c4b73243..000000000 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2023 The LMP Authors. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://github.com/linuxkerneltravel/lmp/blob/develop/LICENSE -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// author: nanshuaibo811@163.com -// -// Kernel space BPF program used for KVM ioctl - -#ifndef __KVM_IOCTL_H -#define __KVM_IOCTL_H - -#include "kvm_watcher.h" -#include "vmlinux.h" -#include -#include -#include -#include - -#define KVMIO 0xAE -#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ -#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) -#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) -#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) -#define KVM_SET_USER_MEMORY_REGION \ - _IOW(KVMIO, 0x46, struct kvm_userspace_memory_region) -#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) -#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) -#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) -#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) - -static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { - int fd = (int)args->args[0]; - unsigned int cmd = (unsigned int)args->args[1]; - unsigned long arg = (unsigned long)args->args[2]; - switch (cmd) { - case KVM_CREATE_VM: - bpf_printk("KVM_CREATE_VM: fd=%d\n", fd); - break; - case KVM_CREATE_VCPU: { - int vcpu_id; - bpf_probe_read(&vcpu_id, sizeof(vcpu_id), (void *)arg); - bpf_printk("KVM_CREATE_VCPU: fd=%d, vcpu_id=%d\n", fd, vcpu_id); - break; - } - case KVM_SET_USER_MEMORY_REGION: { - struct kvm_userspace_memory_region region; - bpf_probe_read(&region, 
sizeof(region), (void *)arg); - // 打印或处理 region 数据 - bpf_printk( - "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, flags=%u, " - "guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", - fd, region.slot, region.flags, region.guest_phys_addr, - region.memory_size / 1024,region.userspace_addr); - break; - } - case KVM_GET_VCPU_EVENTS: - case KVM_SET_VCPU_EVENTS: { - struct kvm_vcpu_events events; - bpf_probe_read(&events, sizeof(events), (void *)arg); - // 打印或处理 events 数据 - bpf_printk( - "KVM_SET/GET_VCPU_EVENTS: fd=%d, exception=%u, interrupt=%u\n", - fd, events.exception.nr, events.interrupt.nr); - break; - } - case KVM_GET_REGS: - case KVM_SET_REGS: { - struct kvm_regs regs; - bpf_probe_read(&regs, sizeof(regs), (void *)arg); - // 此处仅展示部分寄存器值的打印 - bpf_printk( - "KVM_GET/SET_REGS: fd=%d, rax=%llx, rbx=%llx, rcx=%llx, " - "rdx=%llx, rsi=%llx\n", - fd, regs.rax, regs.rbx, regs.rcx, regs.rdx, regs.rsi); - - break; - } - case KVM_TRANSLATE: { - struct kvm_translation tr; - bpf_probe_read(&tr, sizeof(tr), (void *)arg); - bpf_printk( - "KVM_TRANSLATE: fd=%d,linear_address=%llx, " - "physical_address=%llx\n", - fd, tr.linear_address, tr.physical_address); - break; - } - case KVM_INTERRUPT: { - struct kvm_interrupt irq; - bpf_probe_read(&irq, sizeof(irq), (void *)arg); - bpf_printk("KVM_INTERRUPT:fd=%d,interrupt vector:%d\n", fd, - irq.irq); - break; - } - default: - break; - } - return 0; -} - -#endif /* __KVM_IOCTL_H */ \ No newline at end of file From 4bdcdfc645ebd62ea9f0b1578d7b4b48c4f038bd Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 17:17:59 +0800 Subject: [PATCH 19/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=86=85=E6=A0=B8?= =?UTF-8?q?=E6=80=81kvm=20ioctl=E5=A4=84=E7=90=86=E5=87=BD=E6=95=B0?= =?UTF-8?q?=EF=BC=8C=E5=90=8E=E7=BB=AD=E8=AE=A1=E5=88=92=E7=BB=9F=E8=AE=A1?= =?UTF-8?q?=E7=9B=B8=E5=85=B3ioctl=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/include/kvm_ioctl.h | 32 
+++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h new file mode 100644 index 000000000..97d13ba03 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -0,0 +1,32 @@ +// Copyright 2023 The LMP Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://github.com/linuxkerneltravel/lmp/blob/develop/LICENSE +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// author: nanshuaibo811@163.com +// +// Kernel space BPF program used for KVM ioctl + +#ifndef __KVM_IOCTL_H +#define __KVM_IOCTL_H + +#include "kvm_watcher.h" +#include "vmlinux.h" +#include +#include +#include + +static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { + return 0; +} + +#endif /* __KVM_IOCTL_H */ \ No newline at end of file From 05788a0157c7c0143e85ae31f445d4f6d927cbf4 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 8 Mar 2024 17:23:30 +0800 Subject: [PATCH 20/46] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=94=A8=E6=88=B7?= =?UTF-8?q?=E6=80=81kvm=20=20ioctl=E4=BB=A3=E7=A0=81=E6=A1=86=E6=9E=B6=20?= =?UTF-8?q?=EF=BC=8C=E5=87=8F=E5=B0=91=E9=87=8D=E5=A4=8D=E6=80=A7=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=EF=BC=8C=E5=A2=9E=E5=8A=A0=E4=BB=A3=E7=A0=81=E5=8F=AF?= =?UTF-8?q?=E8=AF=BB=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/src/kvm_watcher.c | 258 +++++++++--------- 1 file changed, 135 insertions(+), 123 
deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 089c045f5..86e216af6 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -54,101 +54,102 @@ FILE *create_temp_file(const char *filename) { return output; } -const char *getHypercallName(int number) { - struct Hypercall { +const char *getName(int number, enum EventType type) { + struct NameMapping { int number; const char *name; }; - + // 定义具体的退出原因 arch/x86/include/uapi/asm/vmx.h + struct NameMapping exitReasons[] = {{0, "EXCEPTION_NMI"}, + {1, "EXTERNAL_INTERRUPT"}, + {2, "TRIPLE_FAULT"}, + {3, "INIT_SIGNAL"}, + {4, "SIPI_SIGNAL"}, + {7, "INTERRUPT_WINDOW"}, + {8, "NMI_WINDOW"}, + {9, "TASK_SWITCH"}, + {10, "CPUID"}, + {12, "HLT"}, + {13, "INVD"}, + {14, "INVLPG"}, + {15, "RDPMC"}, + {16, "RDTSC"}, + {18, "VMCALL"}, + {19, "VMCLEAR"}, + {20, "VMLAUNCH"}, + {21, "VMPTRLD"}, + {22, "VMPTRST"}, + {23, "VMREAD"}, + {24, "VMRESUME"}, + {25, "VMWRITE"}, + {26, "VMOFF"}, + {27, "VMON"}, + {28, "CR_ACCESS"}, + {29, "DR_ACCESS"}, + {30, "IO_INSTRUCTION"}, + {31, "MSR_READ"}, + {32, "MSR_WRITE"}, + {33, "INVALID_STATE"}, + {34, "MSR_LOAD_FAIL"}, + {36, "MWAIT_INSTRUCTION"}, + {37, "MONITOR_TRAP_FLAG"}, + {39, "MONITOR_INSTRUCTION"}, + {40, "PAUSE_INSTRUCTION"}, + {41, "MCE_DURING_VMENTRY"}, + {43, "TPR_BELOW_THRESHOLD"}, + {44, "APIC_ACCESS"}, + {45, "EOI_INDUCED"}, + {46, "GDTR_IDTR"}, + {47, "LDTR_TR"}, + {48, "EPT_VIOLATION"}, + {49, "EPT_MISCONFIG"}, + {50, "INVEPT"}, + {51, "RDTSCP"}, + {52, "PREEMPTION_TIMER"}, + {53, "INVVPID"}, + {54, "WBINVD"}, + {55, "XSETBV"}, + {56, "APIC_WRITE"}, + {57, "RDRAND"}, + {58, "INVPCID"}, + {59, "VMFUNC"}, + {60, "ENCLS"}, + {61, "RDSEED"}, + {62, "PML_FULL"}, + {63, "XSAVES"}, + {64, "XRSTORS"}, + {67, "UMWAIT"}, + {68, "TPAUSE"}, + {74, "BUS_LOCK"}, + {75, "NOTIFY"}}; // 定义超级调用 include\uapi\linux\kvm_para.h - struct Hypercall hypercalls[] = { + 
struct NameMapping hypercalls[] = { {1, "VAPIC_POLL_IRQ"}, {5, "KICK_CPU"}, {9, "CLOCK_PAIRING"}, {10, "SEND_IPI"}, {11, "SCHED_YIELD"}, {12, "MAP_GPA_RANGE"}}; - - for (int i = 0; i < sizeof(hypercalls) / sizeof(hypercalls[0]); i++) { - if (hypercalls[i].number == number) { - return hypercalls[i].name; - } + // 根据枚举类型选择使用哪个结构体数组进行转换 + struct NameMapping *mappings; + int count; + switch (type) { + case EXIT: + mappings = exitReasons; + count = sizeof(exitReasons) / sizeof(exitReasons[0]); + break; + case HYPERCALL: + mappings = hypercalls; + count = sizeof(hypercalls) / sizeof(hypercalls[0]); + break; + default: + return "Unknown"; } - return "Unknown"; // 如果找不到对应的超级调用号,返回一个默认值 -} - -const char *getExitReasonName(int number) { - struct ExitReason { - int number; - const char *name; - }; - // 定义具体的退出原因 arch/x86/include/uapi/asm/vmx.h - struct ExitReason exitReasons[] = {{0, "EXCEPTION_NMI"}, - {1, "EXTERNAL_INTERRUPT"}, - {2, "TRIPLE_FAULT"}, - {3, "INIT_SIGNAL"}, - {4, "SIPI_SIGNAL"}, - {7, "INTERRUPT_WINDOW"}, - {8, "NMI_WINDOW"}, - {9, "TASK_SWITCH"}, - {10, "CPUID"}, - {12, "HLT"}, - {13, "INVD"}, - {14, "INVLPG"}, - {15, "RDPMC"}, - {16, "RDTSC"}, - {18, "VMCALL"}, - {19, "VMCLEAR"}, - {20, "VMLAUNCH"}, - {21, "VMPTRLD"}, - {22, "VMPTRST"}, - {23, "VMREAD"}, - {24, "VMRESUME"}, - {25, "VMWRITE"}, - {26, "VMOFF"}, - {27, "VMON"}, - {28, "CR_ACCESS"}, - {29, "DR_ACCESS"}, - {30, "IO_INSTRUCTION"}, - {31, "MSR_READ"}, - {32, "MSR_WRITE"}, - {33, "INVALID_STATE"}, - {34, "MSR_LOAD_FAIL"}, - {36, "MWAIT_INSTRUCTION"}, - {37, "MONITOR_TRAP_FLAG"}, - {39, "MONITOR_INSTRUCTION"}, - {40, "PAUSE_INSTRUCTION"}, - {41, "MCE_DURING_VMENTRY"}, - {43, "TPR_BELOW_THRESHOLD"}, - {44, "APIC_ACCESS"}, - {45, "EOI_INDUCED"}, - {46, "GDTR_IDTR"}, - {47, "LDTR_TR"}, - {48, "EPT_VIOLATION"}, - {49, "EPT_MISCONFIG"}, - {50, "INVEPT"}, - {51, "RDTSCP"}, - {52, "PREEMPTION_TIMER"}, - {53, "INVVPID"}, - {54, "WBINVD"}, - {55, "XSETBV"}, - {56, "APIC_WRITE"}, - {57, "RDRAND"}, - {58, 
"INVPCID"}, - {59, "VMFUNC"}, - {60, "ENCLS"}, - {61, "RDSEED"}, - {62, "PML_FULL"}, - {63, "XSAVES"}, - {64, "XRSTORS"}, - {67, "UMWAIT"}, - {68, "TPAUSE"}, - {74, "BUS_LOCK"}, - {75, "NOTIFY"}}; - - for (int i = 0; i < sizeof(exitReasons) / sizeof(exitReasons[0]); i++) { - if (exitReasons[i].number == number) { - return exitReasons[i].name; + // 根据给定的编号在选择的结构体数组中搜索对应的名称 + for (int i = 0; i < count; i++) { + if (mappings[i].number == number) { + return mappings[i].name; } } - return "Unknown"; // 如果找不到对应的退出原因,返回一个默认值 + + return "Unknown"; // 如果找不到对应的条目,返回一个默认值 } // 检查具有给定 PID 的进程是否存在 @@ -256,7 +257,6 @@ int save_count_dirtypagemap_to_file(struct bpf_map *map) { static struct env { bool execute_vcpu_wakeup; bool execute_exit; - bool ShowStats; bool execute_halt_poll_ns; bool execute_mark_page_dirty; bool execute_page_fault; @@ -264,13 +264,13 @@ static struct env { bool execute_irqchip; bool execute_irq_inject; bool execute_hypercall; + bool execute_ioctl; int monitoring_time; pid_t vm_pid; enum EventType event_type; } env = { .execute_vcpu_wakeup = false, .execute_exit = false, - .ShowStats = false, .execute_halt_poll_ns = false, .execute_mark_page_dirty = false, .execute_page_fault = false, @@ -278,6 +278,7 @@ static struct env { .execute_irq_inject = false, .mmio_page_fault = false, .execute_hypercall = false, + .execute_ioctl = false, .monitoring_time = 0, .vm_pid = -1, .event_type = NONE_TYPE, @@ -297,9 +298,9 @@ static const struct argp_option opts[] = { "Monitor virtual machine dirty page information."}, {"kvmmmu_page_fault", 'f', NULL, 0, "Monitoring the data of kvmmmu page fault."}, - {"kvm_irqchip(software)", 'c', NULL, 0, + {"kvm_irqchip", 'c', NULL, 0, "Monitor the irqchip setting information in KVM VM."}, - {"irq_inject(hardware)", 'i', NULL, 0, + {"irq_inject", 'i', NULL, 0, "Monitor the virq injection information in KVM VM "}, {"hypercall", 'h', NULL, 0, "Monitor the hypercall information in KVM VM "}, {"mmio", 'm', NULL, 0, @@ -307,6 +308,7 @@ 
static const struct argp_option opts[] = { "specified.)"}, {"vm_pid", 'p', "PID", 0, "Specify the virtual machine pid to monitor."}, {"monitoring_time", 't', "SEC", 0, "Time for monitoring."}, + {"kvm_ioctl", 'l', NULL, 0, "Monitoring the KVM IOCTL."}, {NULL, 'H', NULL, OPTION_HIDDEN, "Show the full help"}, {}, }; @@ -343,13 +345,8 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { case 'h': SET_OPTION_AND_CHECK_USAGE(option_selected, env.execute_hypercall); break; - case 's': - if (env.execute_exit) { - env.ShowStats = true; - } else { - fprintf(stderr, "The -e option must be specified.\n"); - argp_state_help(state, stdout, ARGP_HELP_STD_HELP); - } + case 'l': + SET_OPTION_AND_CHECK_USAGE(option_selected, env.execute_ioctl); break; case 'm': if (env.execute_page_fault) { @@ -425,6 +422,8 @@ static int determineEventType(struct env *env) { env->event_type = IRQ_INJECT; } else if (env->execute_hypercall) { env->event_type = HYPERCALL; + } else if (env->execute_ioctl) { + env->event_type = IOCTL; } else { env->event_type = NONE_TYPE; // 或者根据需要设置一个默认的事件类型 } @@ -472,7 +471,7 @@ static int handle_event(void *ctx, void *data, size_t data_sz) { } case PAGE_FAULT: { // 使用 e->page_fault_data 访问 PAGE_FAULT 特有成员 - printf("%-18.6f %-15s %-10u %-12llx %-6u %-10.4f ", timestamp_ms, + printf("%-18.6f %-15s %-10u %-14llx %-6u %-10.4f ", timestamp_ms, e->process.comm, e->process.pid, e->page_fault_data.addr, e->page_fault_data.count, NS_TO_US_WITH_DECIMAL(e->page_fault_data.delay)); @@ -593,7 +592,7 @@ static int handle_event(void *ctx, void *data, size_t data_sz) { fprintf(output, "%-18.6f %-15s %-10d %-10d %-10s %-11llu", timestamp_ms, e->process.comm, e->process.pid, e->hypercall_data.vcpu_id, - getHypercallName(e->hypercall_data.hc_nr), + getName(e->hypercall_data.hc_nr, HYPERCALL), e->hypercall_data.hypercalls); if (e->hypercall_data.hc_nr == 5) { fprintf(output, "apic_id:%llu\n", e->hypercall_data.a1); @@ -621,6 +620,9 @@ static int handle_event(void 
*ctx, void *data, size_t data_sz) { fclose(output); break; } + case IOCTL: { + break; + } default: // 处理未知事件类型 break; @@ -653,8 +655,8 @@ static int print_event_head(struct env *env) { break; case PAGE_FAULT: printf("%-18s %-15s %-10s %-12s %-6s %-10s %-20s %-17s %-10s %s\n", - "TIME(ms)", "COMM", "PID", "ADDRESS", "COUNT", "DELAY(us)", - "HVA", "PFN", "MEM_SLOTID", "ERROR_TYPE"); + "TIME(ms)", "COMM", "PID", "(f(GPA)m(GFN))", "COUNT", + "DELAY(us)", "HVA", "PFN", "MEM_SLOTID", "ERROR_TYPE"); break; case IRQCHIP: printf("%-18s %-15s %-10s %-10s %-14s %-10s %-10s\n", "TIME(ms)", @@ -678,6 +680,10 @@ static int print_event_head(struct env *env) { fclose(output); break; } + case IOCTL: { + printf("wait....\n"); + break; + } default: // Handle default case or display an error message break; @@ -701,11 +707,11 @@ static void set_disable_load(struct kvm_watcher_bpf *skel) { env.execute_mark_page_dirty ? true : false); bpf_program__set_autoload(skel->progs.tp_page_fault, env.execute_page_fault ? true : false); - bpf_program__set_autoload(skel->progs.fexit_direct_page_fault, + bpf_program__set_autoload(skel->progs.fexit_tdp_page_fault, env.execute_page_fault ? true : false); bpf_program__set_autoload(skel->progs.fentry_kvm_mmu_page_fault, env.mmio_page_fault ? true : false); - bpf_program__set_autoload(skel->progs.fexit_handle_mmio_page_fault, + bpf_program__set_autoload(skel->progs.tp_handle_mmio_page_fault, env.mmio_page_fault ? true : false); bpf_program__set_autoload(skel->progs.fentry_kvm_pic_set_irq, env.execute_irqchip ? true : false); @@ -725,6 +731,23 @@ static void set_disable_load(struct kvm_watcher_bpf *skel) { env.execute_irq_inject ? true : false); bpf_program__set_autoload(skel->progs.fentry_emulate_hypercall, env.execute_hypercall ? true : false); + bpf_program__set_autoload(skel->progs.tp_ioctl, + env.execute_ioctl ? 
true : false); +} + +// 函数不接受参数,返回一个静态分配的字符串 +const char *getCurrentTimeFormatted() { + static char ts[32]; // 静态分配,每次调用都会覆盖 + time_t t; + struct tm *tm; + + time(&t); + tm = localtime(&t); + + // 格式化时间到静态分配的字符串中 + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + + return ts; // 返回指向静态字符串的指针 } int print_hc_map(struct kvm_watcher_bpf *skel) { @@ -734,12 +757,6 @@ int print_hc_map(struct kvm_watcher_bpf *skel) { struct hc_key lookup_key = {}; struct hc_key next_key = {}; struct hc_value hc_value = {}; - struct tm *tm; - char ts[32]; - time_t t; - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); int first_run = 1; // Iterate over the map while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { @@ -749,7 +766,7 @@ int print_hc_map(struct kvm_watcher_bpf *skel) { "--------------------------------------------------------------" "----------" "\n"); - printf("TIME:%s\n", ts); + printf("TIME:%s\n", getCurrentTimeFormatted()); printf("%-12s %-12s %-12s %-12s %-12s\n", "PID", "VCPU_ID", "NAME", "COUNTS", "HYPERCALLS"); } @@ -760,8 +777,8 @@ int print_hc_map(struct kvm_watcher_bpf *skel) { return -1; } printf("%-12d %-12d %-12s %-12d %-12lld\n", next_key.pid, - next_key.vcpu_id, getHypercallName(next_key.nr), hc_value.counts, - hc_value.hypercalls); + next_key.vcpu_id, getName(next_key.nr, HYPERCALL), + hc_value.counts, hc_value.hypercalls); // // Move to the next key lookup_key = next_key; } @@ -792,18 +809,12 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { struct exit_key lookup_key = {}; struct exit_key next_key = {}; struct exit_value exit_value; - struct tm *tm; - char ts[32]; - time_t t; - time(&t); - tm = localtime(&t); - strftime(ts, sizeof(ts), "%H:%M:%S", tm); int first_run = 1; // Iterate over the map while (!bpf_map_get_next_key(fd, &lookup_key, &next_key)) { if (first_run) { first_run = 0; - printf("\nTIME:%s\n", ts); + printf("\nTIME:%s\n", getCurrentTimeFormatted()); printf("%-12s %-12s %-12s %-12s %-12s %-12s\n", "pid", 
"total_time", "max_time", "min_time", "counts", "reason"); printf( @@ -821,8 +832,7 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { NS_TO_MS_WITH_DECIMAL(exit_value.total_time), NS_TO_MS_WITH_DECIMAL(exit_value.max_time), NS_TO_MS_WITH_DECIMAL(exit_value.min_time), exit_value.count, - getExitReasonName(next_key.reason)); - + getName(next_key.reason, EXIT)); // Move to the next key lookup_key = next_key; } @@ -838,8 +848,10 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { return 0; } -void print_map_and_check_error(int (*print_func)(struct kvm_watcher_bpf *), struct kvm_watcher_bpf *skel, const char *map_name, int err) { - OUTPUT_INTERVAL(OUTPUT_INTERVAL_SECONDS); +void print_map_and_check_error(int (*print_func)(struct kvm_watcher_bpf *), + struct kvm_watcher_bpf *skel, + const char *map_name, int err) { + OUTPUT_INTERVAL(2); print_func(skel); if (err < 0) { printf("Error printing %s map: %d\n", map_name, err); @@ -919,7 +931,7 @@ int main(int argc, char **argv) { } while (!exiting) { err = ring_buffer__poll(rb, RING_BUFFER_TIMEOUT_MS /* timeout, ms */); - + if (env.execute_hypercall) { print_map_and_check_error(print_hc_map, skel, "hypercall", err); } From 660a1c3f166b3b91cab191cdf8e0cd963c16b4ff Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Sun, 10 Mar 2024 14:57:55 +0800 Subject: [PATCH 21/46] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=86=B2=E7=AA=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 82e410e68..2746149fe 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -884,13 +884,13 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { NS_TO_MS_WITH_DECIMAL(values[i].total_time), NS_TO_MS_WITH_DECIMAL(values[i].max_time), 
NS_TO_MS_WITH_DECIMAL(values[i].min_time), values[i].count, - getName(next_key.reason, EXIT)); + getName(keys[i].reason, EXIT)); } else if (tid == keys[i].tid) { printf("%25s %-12.4f %-12.4f %-12.4f %-12u %-12s\n", "", NS_TO_MS_WITH_DECIMAL(values[i].total_time), NS_TO_MS_WITH_DECIMAL(values[i].max_time), NS_TO_MS_WITH_DECIMAL(values[i].min_time), values[i].count, - getName(next_key.reason, EXIT)); + getName(keys[i].reason, EXIT)); } } // clear the maps From 2c79267b8ac79a751ee119cb89efa25f00e7be27 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Tue, 12 Mar 2024 20:08:11 +0800 Subject: [PATCH 22/46] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E5=AD=90=E5=8A=9F?= =?UTF-8?q?=E8=83=BD=E6=A8=A1=E5=9D=97=E8=AF=B4=E6=98=8E=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/docs/Hypercall.md | 369 ------------------ eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md | 92 +++++ .../kvm_watcher/docs/kvm_hypercall.md | 225 +++++++++++ eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md | 142 +++++++ eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md | 105 ++++- eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md | 103 +++++ 6 files changed, 661 insertions(+), 375 deletions(-) delete mode 100644 eBPF_Supermarket/kvm_watcher/docs/Hypercall.md create mode 100644 eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md create mode 100644 eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md create mode 100644 eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md create mode 100644 eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md diff --git a/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md deleted file mode 100644 index a855470d2..000000000 --- a/eBPF_Supermarket/kvm_watcher/docs/Hypercall.md +++ /dev/null @@ -1,369 +0,0 @@ -> 在Linux中,大家应该对syscall非常的了解和熟悉,其是用户态进入内核态的一种途径或者说是一种方式,完成了两个模式之间的切换;而在虚拟环境中,有没有一种类似于syscall这种方式,能够从no root模式切换到root模式呢?答案是肯定的,KVM提供了Hypercall机制,x86体系架构也有相关的指令支持。 -> -> hypercall:当虚拟机的Guest 
OS需要执行一些更高权限的操作(如:页表的更新、对物理资源的访问等)时,由于自身在非特权域无法完成这些操作,于是便通过调用Hypercall交给Hypervisor来完成这些操作。 - -## Hypercall的发起 - -KVM代码中提供了五种形式的Hypercall接口: - -``` -file: arch/x86/include/asm/kvm_para.h, line: 34 -static inline long kvm_hypercall0(unsigned int nr); -static inline long kvm_hypercall1(unsigned int nr, unsigned long p1); -static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2); -static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) -static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4) -``` - -这几个接口的区别在于参数个数的不用,本质是一样的。挑个参数最多的看下: - -``` -static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, - unsigned long p2, unsigned long p3, - unsigned long p4) -{ - long ret; - asm volatile(KVM_HYPERCALL - : "=a"(ret) - : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4) - : "memory"); - return ret; -} -``` - -Hypercall内部实现是标准的内嵌汇编,稍作分析: - -### KVM_HYPERCALL - -``` -#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1" -``` - -对于KVM hypercall来说,KVM_HYPERCALL是一个三字节的指令序列,x86体系架构下即是vmcall指令,官方手册解释: - -``` -vmcall: - op code:0F 01 C1 -- VMCALL Call to VM - monitor -by causing VM exit -``` - -言简意赅,vmcall会导致VM exit到VMM。 - -### 返回值 - -: “=a”(ret),表示返回值放在eax寄存器中输出。 - -### 输入 - -: “a”(nr), “b”(p1), “c”(p2), “d”(p3), “S”(p4),表示输入参数放在对应的eax,ebx,ecx,edx,esi中,而nr其实就是可以认为是系统调用号。 - -## hypercall的处理 - -当Guest发起一次hypercall后,VMM会接管到该call导致的VM Exit。 - -``` -static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { - ...... - [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, - ...... 
-} -``` - -进入kvm_emulate_hypercall()处理,过程非常简单: - -``` -int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) -{ - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; - - // 检查是否启用了Xen超级调用,如果是,则调用Xen超级调用处理函数 - if (kvm_xen_hypercall_enabled(vcpu->kvm)) - return kvm_xen_hypercall(vcpu); - - // 检查是否启用了Hypervisor超级调用,如果是,则调用Hypervisor超级调用处理函数 - if (kvm_hv_hypercall_enabled(vcpu)) - return kvm_hv_hypercall(vcpu); - - // 从寄存器中读取超级调用号及参数 - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); - - // 记录超级调用的追踪信息 - trace_kvm_hypercall(nr, a0, a1, a2, a3); - - // 检查是否为64位超级调用 - op_64_bit = is_64_bit_hypercall(vcpu); - if (!op_64_bit) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - // 检查当前CPU的特权级是否为0 - if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { - ret = -KVM_EPERM; - goto out; - } - - ret = -KVM_ENOSYS; - - // 根据超级调用号执行相应的操作 - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_KICK_CPU: - // 处理CPU唤醒的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) - break; - - kvm_pv_kick_cpu_op(vcpu->kvm, a1); - kvm_sched_yield(vcpu, a1); - ret = 0; - break; -#ifdef CONFIG_X86_64 - case KVM_HC_CLOCK_PAIRING: - // 处理时钟配对的超级调用 - ret = kvm_pv_clock_pairing(vcpu, a0, a1); - break; -#endif - case KVM_HC_SEND_IPI: - // 处理发送中断请求的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) - break; - - ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); - break; - case KVM_HC_SCHED_YIELD: - // 处理调度让出的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) - break; - - kvm_sched_yield(vcpu, a0); - ret = 0; - break; - case KVM_HC_MAP_GPA_RANGE: - // 处理GPA范围映射的超级调用 - ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) - break; - - // 设置KVM_EXIT_HYPERCALL退出类型,并填充相关信息 - vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; - vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; - 
vcpu->run->hypercall.args[0] = a0; - vcpu->run->hypercall.args[1] = a1; - vcpu->run->hypercall.args[2] = a2; - vcpu->run->hypercall.longmode = op_64_bit; - vcpu->arch.complete_userspace_io = complete_hypercall_exit; - return 0; - default: - ret = -KVM_ENOSYS; - break; - } - -out: - // 如果不是64位超级调用,则返回值需要截断为32位 - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); - - // 更新超级调用统计信息,并跳过被模拟的指令 - ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); -} -``` - -### Conclusion - -整个过程非常简洁和简单,hypercall机制给了Guest能够主动进入VMM的一种方式。 - -## 调用号 - -``` -#define KVM_HC_VAPIC_POLL_IRQ 1 -#define KVM_HC_MMU_OP 2 -#define KVM_HC_FEATURES 3 -#define KVM_HC_PPC_MAP_MAGIC_PAGE 4 -#define KVM_HC_KICK_CPU 5 -#define KVM_HC_MIPS_GET_CLOCK_FREQ 6 -#define KVM_HC_MIPS_EXIT_VM 7 -#define KVM_HC_MIPS_CONSOLE_OUTPUT 8 -#define KVM_HC_CLOCK_PAIRING 9 -#define KVM_HC_SEND_IPI 10 -#define KVM_HC_SCHED_YIELD 11 -#define KVM_HC_MAP_GPA_RANGE 12 -``` - - -1. ##### KVM_HC_VAPIC_POLL_IRQ - ------------------------- - -Architecture: x86 -Status: active -Purpose: 触发客户机退出,以便在重新进入时主机可以检查待处理的中断。 - -2. ##### KVM_HC_MMU_OP - ----------------- - -Architecture: x86 -Status: deprecated. -Purpose: 支持内存管理单元(MMU)操作,例如写入页表项(PTE)、刷新转换后备缓冲(TLB)以及释放页表(PT)。 - -3. ##### KVM_HC_FEATURES - ------------------- - -Architecture: PPC -Status: active -Purpose: 向客户机公开超级调用的可用性。在 x86 平台上,使用 cpuid 来列举可用的超级调用。在 PPC(PowerPC)上,可以使用基于设备树的查找(也是 EPAPR 规定的方式)或 KVM 特定的列举机制(即这个超级调用)。 - -4. ##### KVM_HC_PPC_MAP_MAGIC_PAGE - ----------------------------- - -Architecture: PPC -Status: active -Purpose:为了实现超级监视器与客户机之间的通信,存在一个共享页面,其中包含了监视器可见寄存器状态的部分。客户机可以通过使用此超级调用将这个共享页面映射,以通过内存访问其监视器寄存器。 - -5. 
##### KVM_HC_KICK_CPU - ------------------- - -Architecture: x86 -Status: active -Purpose: 用于唤醒处于 HLT(Halt)状态的vCPU 。 -Usage example: -一个使用了半虚拟化的客户机的虚拟 CPU,在内核模式下忙等待某个事件的发生(例如,自旋锁变为可用)时,如果其忙等待时间超过了一个阈值时间间隔,就可以执行 HLT 指令。执行 HLT 指令将导致 hypervisor 将虚拟 CPU 置于休眠状态,直到发生适当的事件。同一客户机的另一个虚拟 CPU 可以通过发出 KVM_HC_KICK_CPU 超级调用来唤醒正在睡眠的虚拟 CPU,指定要唤醒的虚拟 CPU 的 APIC ID(a1)。另外一个参数(a0)在这个超级调用中用于将来的用途。 - - -6. ##### KVM_HC_CLOCK_PAIRING - ------------------------ - -Architecture: x86 -Status: active -Purpose: 用于同步主机和客户机时钟。 - -Usage: -a0:客户机物理地址,用于存储主机复制的 "struct kvm_clock_offset" 结构。 - -a1:时钟类型,目前只支持 KVM_CLOCK_PAIRING_WALLCLOCK(0)(对应主机的 CLOCK_REALTIME 时钟)。 - -```c -struct kvm_clock_pairing { - __s64 sec; // 从 clock_type 时钟起的秒数。 - __s64 nsec; // 从 clock_type 时钟起的纳秒数。 - __u64 tsc; // 用于计算 sec/nsec 对的客户机 TSC(时间戳计数)值。 - __u32 flags; // 标志,目前未使用(为 0)。 - __u32 pad[9]; // 填充字段,目前未使用。 -}; -``` - -这个超级调用允许客户机在主机和客户机之间计算精确的时间戳。客户机可以使用返回的 TSC(时间戳计数)值来计算其时钟的 CLOCK_REALTIME,即在同一时刻。 - -如果主机不使用 TSC 时钟源,或者时钟类型不同于 KVM_CLOCK_PAIRING_WALLCLOCK,则返回 KVM_EOPNOTSUPP。 - -7. ##### KVM_HC_SEND_IPI - ------------------- - -Architecture: x86 -Status: active -Purpose: 向多个vcpu发生ipi。 - -- `a0`: 目标 APIC ID 位图的低位部分。 -- `a1`: 目标 APIC ID 位图的高位部分。 -- `a2`: 位图中最低的 。 -- `a3`: 中断命令寄存器。 - -这个超级调用允许客户机发送组播中断处理请求(IPIs),每次调用最多可以有 128 个目标(在 64 位模式下)或者 64 个虚拟中央处理单元(vCPU)(在 32 位模式下)。目标由位图表示,位图包含在前两个参数中(a0 和 a1)。a0 的第 0 位对应于第三个参数 a2 中的 APIC ID,a0 的第 1 位对应于 a2+1 的 APIC ID,以此类推。 - -返回成功传递 IPIs 的 CPU 数量。 - -8. ##### KVM_HC_SCHED_YIELD - ---------------------- - -Architecture: x86 -Status: active -Purpose: 用于在目标vCPU被抢占时进行让步。 - -a0: destination APIC ID - -Usage example: 当向多个vCPU发送调用函数中断(call-function IPI)时,如果任何目标 vCPU 被抢占,进行让步。 - -9. ##### KVM_HC_MAP_GPA_RANGE - -------------------------- - -Architecture: x86 -Status: active -Purpose: 请求 KVM 映射一个具有指定属性的 GPA 范围。 - -`a0`: 起始页面的客户机物理地址 -`a1`: (4KB)页面的数量(在 GPA 空间中必须是连续的) -`a2`: 属性 - - 属性: - 位 3:0 - 首选页大小编码,0 = 4KB,1 = 2MB,2 = 1GB,以此类推... 
- 位 4 - 明文 = 0,加密 = 1 - 位 63:5 - 保留(必须为零) - -**实现注意事项** - -此超级调用通过 KVM_CAP_EXIT_HYPERCALL 能力在用户空间中实现。在向客户机 CPUID 中添加 KVM_FEATURE_HC_MAP_GPA_RANGE 之前,用户空间必须启用该能力。此外,如果客户机支持 KVM_FEATURE_MIGRATION_CONTROL,用户空间还必须设置一个 MSR 过滤器来处理对 MSR_KVM_MIGRATION_CONTROL 的写入。 - -可以通过如下查看发生的hypercall信息: - -``` -root@nans:/sys/kernel/debug/tracing/events/kvm# echo 0 > ../../tracing_on -root@nans:/sys/kernel/debug/tracing/events/kvm# echo 1 > kvm_hypercall/enable -root@nans:/sys/kernel/debug/tracing/events/kvm# echo 1 > ../../tracing_on -root@nans:/sys/kernel/debug/tracing/events/kvm# cat ../../trace_pipe -``` - -输出如下: - -![image-20240110125350965](https://gitee.com/nan-shuaibo/image/raw/master/202401101258714.png) - -使用ebpf技术统计hypercall信息: - -统计两秒内的每个hypercall发生的次数,和自客户机启动以来每个vcpu上发生的hypercall的次数 - -``` ------------------------------------------------------------------------- -TIME:16:22:05 -PID VCPU_ID NAME COUNTS HYPERCALLS -68453 4 KICK_CPU 1 0 -68453 2 KICK_CPU 1 0 -68453 1 SEND_IPI 6 5 -68453 0 SEND_IPI 7 7 -68453 7 KICK_CPU 1 0 -68453 0 KICK_CPU 1 0 ------------------------------------------------------------------------- -TIME:16:22:07 -PID VCPU_ID NAME COUNTS HYPERCALLS -68082 4 KICK_CPU 2 45 -68453 5 SEND_IPI 3 2 -68453 6 SCHED_YIELD 2 66 -68453 6 SEND_IPI 79 80 -68453 3 SEND_IPI 45 44 -68453 1 SEND_IPI 23 28 -68453 0 SEND_IPI 7 14 -68453 4 SEND_IPI 145 145 -``` - -并将详细信息输出至临时文件 - -![image-20240301162527679](https://gitee.com/nan-shuaibo/image/raw/master/202403011629545.png) - diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md new file mode 100644 index 000000000..91adc8020 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md @@ -0,0 +1,92 @@ +# kvm_exit + +考虑到频繁的虚拟机退出事件可能会导致性能问题,kvm_watcher中的kvm_exit子功能通过显示详细的退出原因和在一台主机上运行的所有vm的每个虚拟机的vcpu上的退出计数及处理时延,可以捕获和分析vm exit事件,该工具旨在定位频繁退出的原因(如EPT_VIOLATION、EPT_MISCONFIG、PML_FULL等)。 + +## 原理介绍 + +### VMX 操作模式 + +作为传统的 IA32 架构的扩展,VMX 操作模式在默认下是关闭的,只有当 VMM 
需要使用硬件辅助虚拟化功能时才会使用 Intel 提供的两条新指令来开关 VMX 操作模式: + +- `VMXON`:开启 VMX 操作模式。 +- `VMXOFF`:关闭 VMX 操作模式。 + +在 Intel SDM 中描述的 VMX 生命周期如下: + +- 软件通过 `VMXON` 指令进入 VMX 操作模式。 +- VMM 可以通过 `VM entries` 进入 Guest VM(单次只能执行一个 VM),VMM 通过 `VMLAUNCH` (第一次进入 VM)与 `VMRESUME` (从 VMM 中恢复到 VM)指令来使能 `VM entry`,通过 `VM exits` 重获控制权。 +- `VM exits` 通过 VMM 指定的入口点移交控制权,VMM 对 VM 的退出原因进行响应后通过 `VM entry` 返回到 VM 中。 +- 当 VMM 想要停止自身运行并退出 VMX 操作模式时,其通过 `VMXOFF` 指令来完成。 + +![img](https://ctf-wiki.org/pwn/virtualization/basic-knowledge/figure/interaction-of-vmm-and-guest.png) + +### VM exit和VM entry + +**VM exit**:VM-Exit是指CPU从非根模式切换到根模式,从客户机切换到VMM的操作。引发VM-Exit的原因很多,例如在非根模式执行了敏感指令、发生了中断等。处理VM-Exit事件是VMM模拟指令、虚拟特权资源的一大任务。 + +**VM entry**:VM-Entry是指CPU由根模式切换到非根模式,从软件角度看,是指CPU从VMM切换到客户机执行。这个操作通常由VMM主动发起。在发起之前,VMM会设置好VMCS相关域的内容,例如客户机状态域、宿主机状态域等,然后执行VM-Entry指令。 + +以下是VM exit到VM entry的流程: + +![VM entry 与 VM exit](https://ctf-wiki.org/pwn/virtualization/basic-knowledge/figure/vm-entry-and-exit.png) + +## 示例输出 + +4391为主机上的虚拟机进程,4508、4509、4510...分别是虚拟机中的vcpu子进程,每隔两秒输出虚拟机中产生的exit事件及其处理延时等信息。 + +``` +ubuntu@rd350x:~/nans/lmp/eBPF_Supermarket/kvm_watcher$ sudo ./kvm_watcher -e + +TIME:16:33:47 +pid tid total_time max_time min_time counts reason +------------ ------------ ------------ ------------ ------------ ------------ ------------ +4391 4508 0.0067 0.0067 0.0067 1 MSR_READ + 4509 0.0074 0.0038 0.0036 2 MSR_READ + 0.1354 0.0173 0.0006 48 MSR_WRITE + 0.6816 0.0639 0.0036 44 IO_INSTRUCTION + 0.0030 0.0030 0.0030 1 EOI_INDUCED + 4510 0.0043 0.0043 0.0043 1 MSR_READ + 0.0076 0.0049 0.0011 3 MSR_WRITE + 4511 0.0053 0.0053 0.0053 1 MSR_READ + 0.0053 0.0053 0.0053 1 MSR_READ + 0.0288 0.0054 0.0012 9 MSR_WRITE + 4512 0.0101 0.0061 0.0040 2 MSR_READ + 0.0317 0.0053 0.0011 11 MSR_WRITE + 4513 0.0070 0.0036 0.0034 2 MSR_READ + 0.0493 0.0062 0.0010 17 MSR_WRITE + 4514 0.0074 0.0074 0.0074 1 MSR_READ + 0.0254 0.0045 0.0008 10 MSR_WRITE + 4515 0.0620 0.0051 0.0011 25 MSR_WRITE + 0.0079 0.0042 0.0038 2 MSR_READ + +TIME:16:33:49 
+pid tid total_time max_time min_time counts reason +------------ ------------ ------------ ------------ ------------ ------------ ------------ +4391 4508 0.0041 0.0041 0.0041 1 MSR_READ + 0.0199 0.0051 0.0012 8 MSR_WRITE + 4509 0.0069 0.0039 0.0030 2 MSR_READ + 0.0063 0.0063 0.0063 1 PAUSE_INSTRUCTION + 0.1592 0.0063 0.0006 68 MSR_WRITE + 0.4385 0.0545 0.0362 10 IO_INSTRUCTION + 4510 0.0035 0.0035 0.0035 1 MSR_READ + 0.0475 0.0063 0.0011 18 MSR_WRITE + 4511 0.0073 0.0037 0.0036 2 MSR_READ + 0.0073 0.0037 0.0036 2 MSR_READ + 0.0179 0.0179 0.0179 1 EPT_VIOLATION + 0.0437 0.0061 0.0011 17 MSR_WRITE + 4512 0.0032 0.0032 0.0032 1 MSR_READ + 0.0699 0.0065 0.0011 30 MSR_WRITE + 4513 0.0085 0.0044 0.0041 2 MSR_READ + 0.0476 0.0068 0.0012 16 MSR_WRITE + 4514 0.0078 0.0045 0.0033 2 MSR_READ + 0.0320 0.0049 0.0011 12 MSR_WRITE + 4515 0.0741 0.0051 0.0005 33 MSR_WRITE + 0.0083 0.0042 0.0041 2 MSR_READ +``` + +## 参数解释 + +- **VM Exit 原因统计**:记录并展示触发 VM Exit 的具体原因,帮助用户理解 VM Exit 发生的上下文和背景。 +- **VM Exit 延时分析**:统计每次 VM Exit 处理的最大、最小和总共延时,为性能分析提供量化数据。 +- **VM Exit 次数计数**:计算每种类型的 VM Exit 发生的次数,帮助识别最频繁的性能瓶颈。 +- **PID、TID:**其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu的进程号 \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md new file mode 100644 index 000000000..988478ad5 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md @@ -0,0 +1,225 @@ +# kvm hypercall + +## 概述 + +kvm watcher 的 kvm hypercall 子模块是一个专为 KVM 虚拟化环境设计的监控工具,它能够详细记录虚拟机进行 hypercall 时的相关信息。Hypercall 允许 Guest OS 以高效的方式直接与 Hypervisor 通信,从而优化虚拟机的性能,特别是在内存管理、设备 I/O 等方面。 + +## 原理介绍 + +在虚拟化环境中,Hypercall 机制是虚拟机(VM)从非特权模式(no root mode)切换到特权模式(root mode)的一种方式,类似于传统操作系统中从用户态切换到内核态的系统调用(syscall)。KVM(Kernel-based Virtual Machine)通过支持 Hypercall 机制,提供了一种高效的方式让虚拟机的 Guest OS 执行一些需要更高权限的操作,比如更新页表或访问物理资源等,这些操作由于虚拟机的非特权域无法完成,因此通过 Hypercall 交由 Hypervisor 来执行。 + 
+![wKiom1afF9-BZbZuAAAotW_0zjg092.png](http://s2.51cto.com/wyfs02/M01/79/F3/wKiom1afF9-BZbZuAAAotW_0zjg092.png) + +hypercall的发起需求触发vm exit原因为EXIT_REASON_VMCALL,其对应的处理函数为: + +``` +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +... + [EXIT_REASON_VMCALL] = kvm_emulate_hypercall, +... +}; +``` + +进入kvm_emulate_hypercall()处理,过程非常简单: + +``` +int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) +{ + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + + // 检查是否启用了Xen超级调用,如果是,则调用Xen超级调用处理函数 + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + // 检查是否启用了Hypervisor超级调用,如果是,则调用Hypervisor超级调用处理函数 + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + // 从寄存器中读取超级调用号及参数 + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); + + // 记录超级调用的追踪信息 + trace_kvm_hypercall(nr, a0, a1, a2, a3); + + // 检查是否为64位超级调用 + op_64_bit = is_64_bit_hypercall(vcpu); + if (!op_64_bit) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + a3 &= 0xFFFFFFFF; + } + + // 检查当前CPU的特权级是否为0 + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + + ret = -KVM_ENOSYS; + + // 根据超级调用号执行相应的操作 + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_KICK_CPU: + // 处理CPU唤醒的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + + kvm_pv_kick_cpu_op(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); + ret = 0; + break; +#ifdef CONFIG_X86_64 + case KVM_HC_CLOCK_PAIRING: + // 处理时钟配对的超级调用 + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; +#endif + case KVM_HC_SEND_IPI: + // 处理发送中断请求的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; + case KVM_HC_SCHED_YIELD: + // 处理调度让出的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + + kvm_sched_yield(vcpu, a0); + ret = 0; + break; + case 
KVM_HC_MAP_GPA_RANGE: + // 处理GPA范围映射的超级调用 + ret = -KVM_ENOSYS; + if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + // 设置KVM_EXIT_HYPERCALL退出类型,并填充相关信息 + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = a0; + vcpu->run->hypercall.args[1] = a1; + vcpu->run->hypercall.args[2] = a2; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + default: + ret = -KVM_ENOSYS; + break; + } + +out: + // 如果不是64位超级调用,则返回值需要截断为32位 + if (!op_64_bit) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + + // 更新超级调用统计信息,并跳过被模拟的指令 + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); +} +``` + +## 示例输出 + +``` +#sudo ./kvm_watcher -h +TIME:19:49:29 +PID VCPU_ID NAME COUNTS HYPERCALLS +269394 3 KICK_CPU 1 3599 +------------------------------------------------------------------------ +TIME:19:49:46 +PID VCPU_ID NAME COUNTS HYPERCALLS +426070 0 SEND_IPI 1 503 +------------------------------------------------------------------------ +TIME:19:49:50 +PID VCPU_ID NAME COUNTS HYPERCALLS +269394 1 SEND_IPI 9 2962 +426070 0 SEND_IPI 7 510 +426070 6 KICK_CPU 1 259 +269394 2 KICK_CPU 13 4375 +426070 0 KICK_CPU 1 511 +269394 3 SEND_IPI 9 3611 +426070 2 KICK_CPU 1 135 +269394 0 KICK_CPU 3 2178 +269394 3 KICK_CPU 4 3612 +269394 4 KICK_CPU 10 2409 +426070 4 KICK_CPU 1 216 +269394 2 SEND_IPI 2 4366 +269394 1 KICK_CPU 4 2953 +269394 4 SEND_IPI 10 2410 +269394 0 SEND_IPI 1 2176 +426070 1 KICK_CPU 1 234 +426070 3 KICK_CPU 1 223 +269394 5 KICK_CPU 3 2564 +------------------------------------------------------------------------ +TIME:19:49:52 +PID VCPU_ID NAME COUNTS HYPERCALLS +426070 3 SEND_IPI 1 225 +426070 4 SEND_IPI 6 222 +426070 7 SEND_IPI 13 214 +269394 3 KICK_CPU 2 3614 +269394 2 SEND_IPI 3 4378 +426070 1 KICK_CPU 1 235 +426070 3 KICK_CPU 1 224 +269394 5 KICK_CPU 1 2565 
+------------------------------------------------------------------------ +TIME:19:49:54 +PID VCPU_ID NAME COUNTS HYPERCALLS +426070 4 SCHED_YIELD 3 385 +269394 1 SEND_IPI 1 2963 +269394 5 SEND_IPI 3 2568 +426070 1 SEND_IPI 18 253 +426070 3 SEND_IPI 95 321 +426070 0 SEND_IPI 1 512 +426070 4 SEND_IPI 162 387 +426070 7 SEND_IPI 10 224 +269394 3 SEND_IPI 2 3616 +269394 4 SEND_IPI 2 2412 +269394 0 SEND_IPI 1 2179 +426070 3 KICK_CPU 1 262 +``` + +其中详细的参数信息会输出到临时文件: + +``` +TIME(ms) COMM PID VCPU_ID NAME HYPERCALLS ARGS +881915483.793962 CPU 0/KVM 529746 0 SEND_IPI 7 ipi_bitmap_low:0x1,ipi_bitmap_high:0,min(apic_id):1,icr:0xf8 +881915485.648450 CPU 2/KVM 269394 2 KICK_CPU 4360 apic_id:3 +881919197.181233 CPU 3/KVM 269394 3 KICK_CPU 3598 apic_id:4 +881929597.162056 CPU 3/KVM 269394 3 KICK_CPU 3599 apic_id:2 +881946045.818584 CPU 0/KVM 426070 0 SEND_IPI 503 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xf8 +881948845.323275 CPU 3/KVM 269394 3 KICK_CPU 3600 apic_id:0 +881949425.157070 CPU 6/KVM 426070 6 KICK_CPU 259 apic_id:1 +881949425.573460 CPU 1/KVM 426070 1 KICK_CPU 234 apic_id:3 +881949426.064405 CPU 3/KVM 426070 3 KICK_CPU 223 apic_id:2 +881949426.514380 CPU 2/KVM 426070 2 KICK_CPU 135 apic_id:4 +881949426.910918 CPU 4/KVM 426070 4 KICK_CPU 216 apic_id:5 +881949459.202569 CPU 0/KVM 426070 0 SEND_IPI 504 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949459.384313 CPU 0/KVM 426070 0 SEND_IPI 505 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949459.607809 CPU 0/KVM 426070 0 SEND_IPI 506 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949459.761529 CPU 0/KVM 426070 0 SEND_IPI 507 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949485.192198 CPU 0/KVM 426070 0 SEND_IPI 508 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949485.517598 CPU 0/KVM 426070 0 SEND_IPI 509 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +881949485.849330 CPU 0/KVM 426070 0 
SEND_IPI 510 ipi_bitmap_low:0x7f,ipi_bitmap_high:0,min(apic_id):1,icr:0xfc +``` + +## 参数解释 + +- **PID**: 相应虚拟机进程的标识符(PID) +- **VCPU_ID:**对应的vcpu标识符 +- **NAME:**所发生的hypercall名称 +- **COUNTS:**当前时间段内hypercall发生的次数 +- **HYPERCALLS:** 自虚拟机启动以来,每个vcpu上发生的hypercall的次数 + diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md new file mode 100644 index 000000000..fa0533a73 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md @@ -0,0 +1,142 @@ +# kvm irq + +## 概述 + +kvm watcher中的kvm irq子功能模块提供了对kvm中虚拟化中断事件的实时监控和分析能力,可以捕获和记录各种中断事件,支持监控传统的PIC中断、高级的IOAPIC中断以及基于消息的MSI中断,覆盖了KVM虚拟化环境中的主要中断类型。对于每个捕获的中断事件,记录详细信息,包括中断类型、中断注入延时、引脚号、触发方式、目标LAPIC的ID、向量号以及是否被屏蔽等关键数据。 + +## 原理介绍 + +x86平台主要使用的中断类型有pic、apic及msi中断,在多核系统下的apic结构图如下所示,每个cpu有一个lapic,外部中断通过ioapic转发到lapic,如果是msi中断,则绕过了io apic直接发给lapic。 + +![img](https://img-blog.csdnimg.cn/20200411174750913.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3pneTY2Ng==,size_16,color_FFFFFF,t_70) + +KVM_CREATE_IRQCHIP的ioctl用于在虚拟机初始化阶段创建中断请求芯片,当KVM接收到虚拟机相关联的ioctl系统调用时,函数kvm_vm_ioctl()进行处理调用kvm_arch_vm_ioctl()函数,该函数中完成了初始化pic和ioapic控制器模块,配置中断请求默认路由等任务,其中的kvm_setup_default_irq_routing会依次调用kvm_set_irq_routing和setup_routing_entry,最终调用kvm_set_routing_entry函数完成路由配置,核心操作是将各个类型中断控制器的中断置位函数与该类型的控制器路由入口进行绑定,以备后续发生中断请求时调用。 + +``` +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: //中断路由芯片 + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin;//设置中断芯片引脚 + switch (ue->u.irqchip.irqchip) { + case KVM_IRQCHIP_PIC_SLAVE: + e->irqchip.pin += PIC_NUM_PINS / 2; //从片引脚 + fallthrough; + case KVM_IRQCHIP_PIC_MASTER: + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; + //// 设置处理 PIC 中断的回调函数 + e->set = kvm_set_pic_irq; + break; + case KVM_IRQCHIP_IOAPIC: + if (ue->u.irqchip.pin >= 
KVM_IOAPIC_NUM_PINS) + return -EINVAL; + // 设置处理 IOAPIC 中断的回调函数 + e->set = kvm_set_ioapic_irq; + break; + default: + return -EINVAL; + } + e->irqchip.irqchip = ue->u.irqchip.irqchip; + break; + case KVM_IRQ_ROUTING_MSI: + // 设置处理 MSI 中断的回调函数 + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + + if (kvm_msi_route_invalid(kvm, e)) + return -EINVAL; + break; +..... + + return 0; +} +``` + +KVM_IRQ_LINE的ioctl用于虚拟机向VMM的虚拟apic发送中断请求,再由VMM将中断交付虚拟cpu处理,当kvm_vm_ioctl函数被调用并处理KVM_IRQ_LINE请求时会调用kvm_vm_ioctl_irq_line,该函数调用kvm_set_irq完成中断注入,其核心任务就是根据中断控制器类型调用之前所绑定的中断回调函数,这些中断回调函数会将中断请求写入虚拟cpu的vmcs中。 + +``` +/* kvm_set_irq - 设置或清除 KVM 虚拟机中的一个中断 + * @kvm: 指向当前 KVM 实例的指针 + * @irq_source_id: 中断源的标识符,用于区分不同的中断源 + * @irq: 要操作的中断号 + * @level: 指定中断的电平,通常 1 表示触发中断,0 表示清除中断 + * @line_status: 指定中断线路的当前状态 + * + * Return value: + * < 0 中断被忽略(被屏蔽或其他原因未被送达) + * = 0 中断被合并(之前的中断仍在等待处理) + * > 0 中断成功送达的 CPU 数量 + */ + int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvm_kernel_irq_routing_entry irq_set[KVM_NR_IRQCHIPS]; + .... + + while (i--) { + int r; + r = irq_set[i].set(&irq_set[i], kvm, irq_source_id, level, + line_status); + if (r < 0) + continue; + + ret = r + ((ret < 0) ? 
0 : ret); + } + + return ret; +} +``` + +其中`irq_set[i].set`函数就是不同中断类型的回调函数。 + +当需要注入的中断为msi类型时,kvm_vm_ioctl会处理为KVM_SIGNAL_MSI类型的的请求,依次调用kvm_send_userspace_msi和kvm_set_msi + +其中ioapic的回调函数kvm_set_ioapic_irq依次调用kvm_ioapic_set_irq、ioapic_set_irq最后调用ioapic_service函数,ioapic_service主要是找到中断的重映射表,然后查找中断的目的地信息并转发到对应vcpu的lapic去处理。然后会调用kvm_irq_delivery_to_apic负责将中断分发给lapic。 + +> 中断虚拟化详细介绍可以参考:[kvm中断虚拟化 ]() [内核虚拟化:虚拟中断注入](https://blog.csdn.net/weixin_46324627/article/details/136661252?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22136661252%22%2C%22source%22%3A%22weixin_46324627%22%7D) + +## 挂载点 + +| 类型 | 名称 | +| ------------- | --------------- | +| fentry、fexit | kvm_pic_set_irq | +| fentry、fexit | ioapic_set_irq | +| fentry、fexit | kvm_set_msi | + +## 示例输出 + +``` +TIME(ms) COMM PID DELAY TYPE/PIN DST/VEC OTHERS +962804587.768667 CPU 0/KVM 269394 773 MSI /- 0x2/40 Fixed |physical|edge |- |- +962805792.231419 vhost-529746 529767 3008 MSI /- 0x1/40 Fixed |physical|edge |- |- +962805792.234556 vhost-269394 269403 1442 MSI /- 0x3/40 Fixed |physical|edge |- |- +962805792.243754 vhost-426070 426078 1323 MSI /- 0x5/35 Fixed |physical|edge |- |- +962806603.650275 CPU 0/KVM 269394 3738 MSI /- 0x2/40 Fixed |physical|edge |- |- +962806603.713743 CPU 0/KVM 269394 1414 MSI /- 0x2/40 Fixed |physical|edge |- |- +962806816.308239 qemu-system-x86 269394 29495 IOAPIC /21 0x4/39 Fixed |physical|level|- |- +962806816.359852 qemu-system-x86 269394 38615 PIC slave /2 - /- - |- |level|masked|- +962806816.400501 qemu-system-x86 269394 1259 IOAPIC /10 0 /0 Fixed |physical|edge |masked|- +962806816.408792 qemu-system-x86 269394 1270 PIC slave /2 - /- - |- |level|masked|- +962806816.410425 qemu-system-x86 269394 226 IOAPIC /10 0 /0 Fixed |physical|edge |masked|- +962809792.316035 vhost-426070 426078 1747 MSI /- 0x5/35 Fixed |physical|edge |- |- +962810635.636493 CPU 0/KVM 269394 3034 MSI /- 0x2/40 Fixed |physical|edge |- |- +962810635.694923 CPU 0/KVM 269394 
897 MSI /- 0x2/40 Fixed |physical|edge |- |- +962811776.481253 vhost-269394 269403 3719 MSI /- 0x3/40 Fixed |physical|edge |- |- +962811776.523581 vhost-529746 529767 1664 MSI /- 0x1/40 Fixed |physical|edge |- |- +962811776.654516 vhost-426070 426078 1522 MSI /- 0x5/35 Fixed |physical|edge |- |- +962812652.302519 CPU 2/KVM 269394 2605 MSI /- 0x2/40 Fixed |physical|edge |- |- +962812652.342239 CPU 2/KVM 269394 749 MSI /- 0x2/40 Fixed |physical|edge |- |- +962813344.856419 qemu-system-x86 269394 23230 IOAPIC /21 0x4/39 Fixed |physical|level|- |- +962813344.899277 qemu-system-x86 269394 5472 PIC slave /2 - /- - |- |level|masked|- +``` + +> 对于IOAPIC和PIC类型的中断,pin是引脚号,DST/VEC表示目标vcpu id和中断向量号 +> +> others中的第一列是中断分发方式,第二列是中断目标的寻址模式,第三列表示中断信号的触发方式,第四列表示中断是否被屏蔽,第五列表示中断是否合并。 \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md index 9c23ca138..81362acc4 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_mmu.md @@ -1,17 +1,28 @@ +# kvm_mmu +## 概述 -vm exit(EPT_VIOLATION)处理流程: +kvm watcher中的kvm mmu子功能模块特别关注于捕捉和分析两类关键的虚拟化环境中的内存管理事件:MMIO page fault和 EPT page fault。MMIO page fault发生在虚拟机尝试访问已映射为 I/O 设备的内存地址时。EPT page fault指的是当虚拟机访问的虚拟地址未在 EPT 中有效映射时发生的错误。 + +## 原理介绍 + +### EPT page fault + +EPT 并不干扰 Guest VM 操作自身页表的过程,其本质上是**额外提供了一个 Guest 物理地址空间到 Host 物理地址空间转换的页表**,即使用一个额外的页表来完成 `GPA→HPA` 的转换。 + +EPT 方案虽然相比起影子页表而言多了一层转换,但是并不需要干扰 Guest 原有的页表管理,**GVA→GPA→HPA 的过程都由硬件自动完成**,同时 Hypervisor 仅需要截获 `EPT Violation` 异常(EPT 表项为空),效率提高了不少。 + +当虚拟化环境中产生page fault时,其中GVA->GPA的转换不触发vm exit,由虚拟机内部页表完成填充,GPA->HPA的转换会触发vm exit原因为EPT_VIOLATION,由kvm完成对TDP(Two-Dimensional Paging)页表的填充。 + +内核中的vm exit(EPT_VIOLATION)处理流程如下: ``` static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { ... - [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, + [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, ... 
}; -``` - -```c vmx_handle_exit() { # 处理 VMX(虚拟机扩展)退出的主要函数 __vmx_handle_exit() { @@ -113,4 +124,86 @@ vmx_handle_exit() { ``` -![kvm-init-mmu](https://gitee.com/nan-shuaibo/image/raw/master/202403081421893.png) \ No newline at end of file +![kvm-init-mmu](https://gitee.com/nan-shuaibo/image/raw/master/202403081421893.png) + +### MMIO page fault + +MMIO是直接将设备I/O映射到物理地址空间内,虚拟机物理内存的虚拟化又是通过EPT机制来完成的, 那么模拟设备的MMIO实现也需要利用EPT机制。虚拟机的EPT页表是在EPT_VIOLATION异常处理的时候建立起来的, 对于模拟设备而言访问MMIO肯定要触发VM_EXIT然后交给QEMU/KVM去处理。其中关于MMIO page fault的vm exit对应的原因处理函数如下: + +``` +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { +... + [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, +... +}; +``` + +对于上述两种page fault产生的vm exit,`EPT_VIOLATION`表示的是对应的物理页不存在,而`EPT_MISCONFIG`表示EPT页表中有非法的域。 + +### 挂载点 + +**EPT page fault:** + +| 类型 | 名称 | +| ---------- | ------------------ | +| tracepoint | kvm_page_fault | +| fexit | kvm_tdp_page_fault | + +**MMIO page fault:** + +| 类型 | 名称 | +| ---------- | ---------------------- | +| fentry | kvm_mmu_page_fault | +| tracepoint | handle_mmio_page_fault | + +## 示例输出 + +``` + # sudo ./kvm_watcher -fm +TIME(ms) COMM PID GPA COUNT DELAY(us) HVA PFN MEM_SLOTID ERROR_TYPE +877831355.823575 CPU 1/KVM 529746 fc096000 1 2.9200 - - - Reserved(MMIO) +877831356.426116 CPU 1/KVM 529746 a27c000 1 29.0000 7fcb1607c000 205b6b 10 Write +877831356.478689 CPU 1/KVM 529746 a27d000 1 5.5740 7fcb1607d000 218eb5 10 Write +877831356.779868 CPU 1/KVM 529746 d036640 1 2.8680 7fcb18e36000 b550 10 Exec +877831356.814302 CPU 1/KVM 529746 d015a50 1 3.3680 7fcb18e15000 5977 10 Exec +877831356.851781 CPU 1/KVM 529746 cef0c60 1 3.1390 7fcb18cf0000 15d193 10 Exec +877831356.895407 CPU 1/KVM 529746 d026040 1 3.4770 7fcb18e26000 3188 10 Exec +877831356.945211 CPU 1/KVM 529746 febef000 1 1.7750 - - - Reserved(MMIO) +877831357.526563 CPU 1/KVM 529746 da252d0 1 6.8130 7fcb19825000 1c57a8 10 User +877831357.567887 CPU 1/KVM 529746 cef2a72 1 4.0020 7fcb18cf2000 15d19e 10 User 
+877831365.923055 CPU 0/KVM 529746 a204000 1 36.2550 7fcb16004000 15d1ad 10 Write +877831365.991821 CPU 0/KVM 529746 a205000 1 6.6050 7fcb16005000 189c9b 10 Write +877831366.018521 CPU 0/KVM 529746 a206000 1 4.7600 7fcb16006000 1f3b32 10 Write +877831366.044024 CPU 0/KVM 529746 a207000 1 4.7990 7fcb16007000 1c2335 10 Write +877831366.081925 CPU 0/KVM 529746 cfd8030 1 4.5920 7fcb18dd8000 135be 10 Exec +877831366.118387 CPU 0/KVM 529746 cf7c2f0 1 3.5400 7fcb18d7c000 1766 10 Exec +877831366.151738 CPU 0/KVM 529746 cf7ad00 1 3.0300 7fcb18d7a000 1b943a 10 Exec +877831366.183574 CPU 0/KVM 529746 cf78350 1 2.7500 7fcb18d78000 1b9420 10 Exec +877831367.342812 CPU 0/KVM 529746 b996000 1 12.1780 7fcb17796000 209e26 10 Write +877831367.379176 CPU 0/KVM 529746 b997000 1 5.3060 7fcb17797000 1969b2 10 Write +877831367.420309 CPU 0/KVM 529746 c734dd0 1 4.4100 7fcb18534000 15ae97 10 Exec +877831367.454443 CPU 0/KVM 529746 c733cf0 1 2.9190 7fcb18533000 15ae96 10 Exec +877831369.203014 CPU 0/KVM 529746 febef000 1 3.4450 - - - Reserved(MMIO) +877831385.337715 CPU 0/KVM 529746 d054670 1 15.3860 7fcb18e54000 575e 10 Exec +877831400.891921 CPU 0/KVM 529746 d051900 1 14.7630 7fcb18e51000 575b 10 Exec +877831400.951377 CPU 0/KVM 529746 d0502f0 1 3.5300 7fcb18e50000 575a 10 Exec +877831400.983426 CPU 0/KVM 529746 cef1440 1 3.8650 7fcb18cf1000 15d19d 10 Exec +877831401.016302 CPU 0/KVM 529746 cefc7a0 1 3.0840 7fcb18cfc000 15cd9f 10 Exec +877831401.052207 CPU 0/KVM 529746 cee3ed0 1 3.1400 7fcb18ce3000 15e3f1 10 Exec +877831401.162631 CPU 0/KVM 529746 d052077 1 13.8780 7fcb18e52000 575c 10 Exec +877831401.334995 CPU 0/KVM 529746 fc096000 1 3.9740 - - - Reserved(MMIO) + +``` + +## 参数介绍 + +- **TIME(ms)**: 事件发生的时间,以毫秒为单位。 +- **COMM**: 触发页面故障的进程名称。 +- **PID**: 相应虚拟机进程的标识符(PID)。 +- **GPA**: 引发页面故障的访问的客户物理地址(Guest Physical Address)。 +- **COUNT**: 相同地址发生页面故障的次数。 +- **DELAY(us)**: 处理页面故障所需的时间,以微秒为单位。 +- **HVA**: 发生页面故障时被访问的主机虚拟地址(Host Virtual Address)。 +- **PFN**: 与页面故障相关联的页帧编号(Page Frame Number)。 +- 
**MEM_SLOTID**: 指示发生页面故障的内存插槽的标识符。 +- **ERROR_TYPE**: 页面故障的类型,提供页面故障发生的具体原因。 \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md new file mode 100644 index 000000000..fc2e12552 --- /dev/null +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md @@ -0,0 +1,103 @@ +# kvm vcpu + +## 概述 + +kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化环境中虚拟 CPU (VCPU) 活动的工具,它通过精确记录 VCPU 的唤醒/挂起事件、halt poll 时间的变化,以及 KVM 虚拟机在热迁移过程中产生的脏页信息,为优化虚拟机性能、提高系统响应速度、提供数据支持。 + +## 原理介绍 + +### wakeup、暂停轮询 + +KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以通过在vCPU 放弃运行并让出后,在主机中进行一段时间的轮询来降低虚拟机的延迟。简而言之,当vCPU 放弃运行(即执行 cede 操作)或在 PowerPC 中,当一个虚拟核心(vcore)的所有vCPU 都放弃运行时,主机内核会在将 CPU 让给调度程序之前,通过轮询等待唤醒条件。 + +轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。 +具体的源码流畅及详细原理可以参考:[内核虚拟化:vCPU之暂停轮询系统](https://blog.csdn.net/weixin_46324627/article/details/135342622?spm=1001.2014.3001.5501) + +### dirty page + +在虚拟化环境中,脏页指的是自上次同步以来已经被修改的内存页。特别是在虚拟机热迁移过程中,源虚拟机上的内存页在复制到目标虚拟机的同时仍然处于活动状态,任何在此过程中对这些页的修改都会导致脏页的产生。监控这些脏页对于优化热迁移过程至关重要,因为过多的脏页生成可能会导致迁移延迟,甚至影响到虚拟机的运行性能。此监控功能特别适用于虚拟机热迁移的场景,其中脏页的精确监控和管理可以显著优化迁移过程。 + +## 挂载点 + +### wakeup + +| 类型 | 名称 | +| ---------- | --------------- | +| fentry | kvm_vcpu_halt | +| tracepoint | kvm_vcpu_wakeup | + +### 暂停轮询 + +| 类型 | 名称 | +| ---------- | ---------------- | +| tracepoint | kvm_halt_poll_ns | + +### dirty page + +| 类型 | 名称 | +| ------ | ----------------------- | +| kprobe | mark_page_dirty_in_slot | + +## 示例输出 + +### wakeup + +``` +#sudo ./kvm_watcher -w +TIME(ms) DUR_HALT(ms) COMM PID/TID VCPU_ID WAIT/POLL VAILD? 
+879542671.234831 59.877982 CPU 2/KVM 269394/269407 2 wait valid +879542703.516948 32.138767 CPU 2/KVM 269394/269407 2 wait valid +879542759.358528 55.680357 CPU 2/KVM 269394/269407 2 wait valid +879542784.405935 74.441939 CPU 1/KVM 529746/529770 1 wait valid +879542784.537334 168.631715 CPU 3/KVM 269394/269408 3 wait valid +879542787.287187 2.632230 CPU 3/KVM 269394/269408 3 wait valid +879542819.265769 31.876126 CPU 3/KVM 269394/269408 3 wait valid +879542831.851643 12.209500 CPU 3/KVM 269394/269408 3 wait valid +879542843.477238 11.473798 CPU 3/KVM 269394/269408 3 wait valid +879542847.407571 3.792108 CPU 3/KVM 269394/269408 3 wait valid +879542851.477951 3.966608 CPU 3/KVM 269394/269408 3 wait valid +879542855.282047 3.720160 CPU 3/KVM 269394/269408 3 wait valid +879542855.806099 96.293315 CPU 2/KVM 269394/269407 2 wait valid +879542859.361304 3.961293 CPU 3/KVM 269394/269408 3 wait valid +``` + +### 暂停轮询 + +``` +TIME(ms) COMM PID/TID TYPE VCPU_ID OLD(ns) NEW(ns) +879590230.083646 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 +879590710.510358 CPU 0/KVM 529746/529769 shrink 0 10000 --> 0 +879590991.741948 CPU 4/KVM 269394/269409 grow 4 0 --> 10000 +879591061.851174 CPU 4/KVM 269394/269409 shrink 4 10000 --> 0 +879591831.571142 CPU 3/KVM 269394/269408 grow 3 0 --> 10000 +879591835.425081 CPU 3/KVM 269394/269408 shrink 3 10000 --> 0 +879592213.992940 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 +879592565.028840 CPU 0/KVM 529746/529769 shrink 0 10000 --> 0 +879596214.435717 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 +879596726.128728 CPU 0/KVM 529746/529769 shrink 0 10000 --> 0 +879597907.648862 CPU 3/KVM 269394/269408 grow 3 0 --> 10000 +879597907.837706 CPU 3/KVM 269394/269408 grow 3 10000 --> 20000 +879597935.631756 CPU 3/KVM 269394/269408 shrink 3 20000 --> 0 +879598230.236048 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 +879598567.847044 CPU 0/KVM 529746/529769 shrink 0 10000 --> 0 +879600214.440083 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 +``` + +### dirty page 
+ +``` + +``` + +## 参数介绍 + +### dirty page + +- **TIME(ms)**: 事件发生的时间,以毫秒为单位。 +- **COMM**: 触发脏页产生的进程名称。 +- **PID/TID**: 进程或线程的标识符。 +- **GFN**: 脏页的客户机页框号(Guest Frame Number)。 +- **REL_GFN**: 相对于 GFN 的偏移量。 +- **NPAGES**: 内存槽的页面数量。 +- **USERSPACE_ADDR**: 触发脏页的用户空间地址。 +- **SLOT_ID**: 内存插槽标识,指示哪个内存区域包含了脏页。 \ No newline at end of file From 92daca74974ad97e3e660af29e9ffdd7ac15aef3 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Tue, 12 Mar 2024 20:28:13 +0800 Subject: [PATCH 23/46] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=8A=9F=E8=83=BD?= =?UTF-8?q?=E6=A8=A1=E5=9D=97=E8=AF=B4=E6=98=8E=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md | 2 +- .../kvm_watcher/docs/kvm_hypercall.md | 10 ++--- eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md | 1 + eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md | 44 ++++++++++++++++--- 4 files changed, 46 insertions(+), 11 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md index 91adc8020..381befd0c 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md @@ -89,4 +89,4 @@ pid tid total_time max_time min_time counts re - **VM Exit 原因统计**:记录并展示触发 VM Exit 的具体原因,帮助用户理解 VM Exit 发生的上下文和背景。 - **VM Exit 延时分析**:统计每次 VM Exit 处理的最大、最小和总共延时,为性能分析提供量化数据。 - **VM Exit 次数计数**:计算每种类型的 VM Exit 发生的次数,帮助识别最频繁的性能瓶颈。 -- **PID、TID:**其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu的进程号 \ No newline at end of file +- **PID、TID号**:其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu的进程号 \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md index 988478ad5..42c02fac8 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md @@ -8,7 +8,7 @@ kvm watcher 的 kvm hypercall 子模块是一个专为 KVM 虚拟化环境设计 在虚拟化环境中,Hypercall 机制是虚拟机(VM)从非特权模式(no root 
mode)切换到特权模式(root mode)的一种方式,类似于传统操作系统中从用户态切换到内核态的系统调用(syscall)。KVM(Kernel-based Virtual Machine)通过支持 Hypercall 机制,提供了一种高效的方式让虚拟机的 Guest OS 执行一些需要更高权限的操作,比如更新页表或访问物理资源等,这些操作由于虚拟机的非特权域无法完成,因此通过 Hypercall 交由 Hypervisor 来执行。 -![wKiom1afF9-BZbZuAAAotW_0zjg092.png](http://s2.51cto.com/wyfs02/M01/79/F3/wKiom1afF9-BZbZuAAAotW_0zjg092.png) +
![wKiom1afF9-BZbZuAAAotW_0zjg092.png](http://s2.51cto.com/wyfs02/M01/79/F3/wKiom1afF9-BZbZuAAAotW_0zjg092.png)
hypercall的发起需求触发vm exit原因为EXIT_REASON_VMCALL,其对应的处理函数为: @@ -218,8 +218,8 @@ TIME(ms) COMM PID VCPU_ID NAME HYPERCALLS A ## 参数解释 - **PID**: 相应虚拟机进程的标识符(PID) -- **VCPU_ID:**对应的vcpu标识符 -- **NAME:**所发生的hypercall名称 -- **COUNTS:**当前时间段内hypercall发送的次数 -- **HYPERCALLS:** 自虚拟机启动以来,每个vcpu上发生的hypercall的次数 +- **VCPU_ID**:对应的vcpu标识符 +- **NAME**:所发生的hypercall名称 +- **COUNTS**:当前时间段内hypercall发送的次数 +- **HYPERCALLS**:自虚拟机启动以来,每个vcpu上发生的hypercall的次数 diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md index fa0533a73..5362b7cae 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md @@ -113,6 +113,7 @@ KVM_CREATE_IRQCHIP用于虚拟机向VMM的虚拟apic发送中断请求,再有V ## 示例输出 ``` +#sudo ./kvm_watcher -c TIME(ms) COMM PID DELAY TYPE/PIN DST/VEC OTHERS 962804587.768667 CPU 0/KVM 269394 773 MSI /- 0x2/40 Fixed |physical|edge |- |- 962805792.231419 vhost-529746 529767 3008 MSI /- 0x1/40 Fixed |physical|edge |- |- diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md index fc2e12552..7b9fdcd1d 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md @@ -10,8 +10,7 @@ kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化 KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以通过在vCPU 放弃运行并让出后,在主机中进行一段时间的轮询来降低虚拟机的延迟。简而言之,当vCPU 放弃运行(即执行 cede 操作)或在 PowerPC 中,当一个虚拟核心(vcore)的所有vCPU 都放弃运行时,主机内核会在将 CPU 让给调度程序之前,通过轮询等待唤醒条件。 -轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。 -具体的源码流畅及详细原理可以参考:[内核虚拟化:vCPU之暂停轮询系统](https://blog.csdn.net/weixin_46324627/article/details/135342622?spm=1001.2014.3001.5501) +轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。 
具体的源码流畅及详细原理可以参考:[内核虚拟化:vCPU之暂停轮询系统](https://blog.csdn.net/weixin_46324627/article/details/135342622?spm=1001.2014.3001.5501) ### dirty page @@ -22,20 +21,20 @@ KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以 ### wakeup | 类型 | 名称 | -| ---------- | --------------- | +| :--------- | :-------------- | | fentry | kvm_vcpu_halt | | tracepoint | kvm_vcpu_wakeup | ### 暂停轮询 | 类型 | 名称 | -| ---------- | ---------------- | +| :--------- | :--------------- | | tracepoint | kvm_halt_poll_ns | ### dirty page | 类型 | 名称 | -| ------ | ----------------------- | +| :----- | :---------------------- | | kprobe | mark_page_dirty_in_slot | ## 示例输出 @@ -64,6 +63,7 @@ TIME(ms) DUR_HALT(ms) COMM PID/TID VCPU_ID ### 暂停轮询 ``` +sudo ./kvm_watcher -n TIME(ms) COMM PID/TID TYPE VCPU_ID OLD(ns) NEW(ns) 879590230.083646 CPU 0/KVM 529746/529769 grow 0 0 --> 10000 879590710.510358 CPU 0/KVM 529746/529769 shrink 0 10000 --> 0 @@ -86,7 +86,41 @@ TIME(ms) COMM PID/TID TYPE VCPU_ID OLD(ns) ### dirty page ``` +#sudo ./kvm_watcher -d +TIME(ms) COMM PID/TID GFN REL_GFN NPAGES USERSPACE_ADDR SLOT_ID +1056168589.765210 CPU 2/KVM 630632/630644 f3eeb 3eeb 16384 7fbd5fe00000 3 +1056168589.800763 CPU 2/KVM 630632/630644 f3eec 3eec 16384 7fbd5fe00000 3 +1056168589.837823 CPU 2/KVM 630632/630644 f3eed 3eed 16384 7fbd5fe00000 3 +1056168589.875066 CPU 2/KVM 630632/630644 f3eee 3eee 16384 7fbd5fe00000 3 +1056168589.915464 CPU 2/KVM 630632/630644 f3eef 3eef 16384 7fbd5fe00000 3 +1056168589.953782 CPU 2/KVM 630632/630644 f3ef0 3ef0 16384 7fbd5fe00000 3 +1056168589.988523 CPU 2/KVM 630632/630644 f3ef1 3ef1 16384 7fbd5fe00000 3 +1056168590.025416 CPU 2/KVM 630632/630644 f3ef2 3ef2 16384 7fbd5fe00000 3 +1056168590.063675 CPU 2/KVM 630632/630644 f3ef3 3ef3 16384 7fbd5fe00000 3 +1056168590.357484 CPU 2/KVM 630632/630644 f3efb 3efb 16384 7fbd5fe00000 3 +1056168603.844319 CPU 1/KVM 630632/630643 f3f15 3f15 16384 7fbd5fe00000 3 +1056168605.181247 CPU 1/KVM 630632/630643 f3f16 3f16 16384 7fbd5fe00000 3 +``` + +脏页次数会按降序保存到临时文件中 +``` +PID GFN 
REL_GFN SLOT_ID COUNTS +630632 f0118 118 3 62 +630632 f0119 119 3 62 +630632 f011a 11a 3 61 +630632 f0127 127 3 61 +630632 f0128 128 3 61 +630632 f011d 11d 3 61 +630632 f011c 11c 3 61 +630632 f0120 120 3 61 +630632 f0126 126 3 61 +630632 f011f 11f 3 61 +630632 f011b 11b 3 61 +630632 f0123 123 3 61 +630632 f0124 124 3 61 +630632 f0122 122 3 61 +.... ``` ## 参数介绍 From 9efb16134a4bce05d8b282214a2abea728579b17 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Tue, 12 Mar 2024 20:33:14 +0800 Subject: [PATCH 24/46] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md | 3 +-- eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md | 3 ++- eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md index 42c02fac8..3030aaff8 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md @@ -7,8 +7,7 @@ kvm watcher 的 kvm hypercall 子模块是一个专为 KVM 虚拟化环境设计 ## 原理介绍 在虚拟化环境中,Hypercall 机制是虚拟机(VM)从非特权模式(no root mode)切换到特权模式(root mode)的一种方式,类似于传统操作系统中从用户态切换到内核态的系统调用(syscall)。KVM(Kernel-based Virtual Machine)通过支持 Hypercall 机制,提供了一种高效的方式让虚拟机的 Guest OS 执行一些需要更高权限的操作,比如更新页表或访问物理资源等,这些操作由于虚拟机的非特权域无法完成,因此通过 Hypercall 交由 Hypervisor 来执行。 - -
![wKiom1afF9-BZbZuAAAotW_0zjg092.png](http://s2.51cto.com/wyfs02/M01/79/F3/wKiom1afF9-BZbZuAAAotW_0zjg092.png)
+
hypercall的发起需求触发vm exit原因为EXIT_REASON_VMCALL,其对应的处理函数为: diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md index 5362b7cae..9a7e2a801 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_irq.md @@ -100,7 +100,8 @@ KVM_CREATE_IRQCHIP用于虚拟机向VMM的虚拟apic发送中断请求,再有V 其中ioapic的回调函数kvm_set_ioapic_irq依次调用kvm_ioapic_set_irq、ioapic_set_irq最后调用ioapic_service函数,ioapic_service主要是找到中断的重映射表,然后查找中断的目的地信息并转发到对应vcpu的lapic去处理。然后会调用kvm_irq_delivery_to_apic负责将中断分发给lapic。 -> 中断虚拟化详细介绍可以参考:[kvm中断虚拟化 ]() [内核虚拟化:虚拟中断注入](https://blog.csdn.net/weixin_46324627/article/details/136661252?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22136661252%22%2C%22source%22%3A%22weixin_46324627%22%7D) +> 中断虚拟化详细介绍可以参考:[kvm中断虚拟化 ](https://blog.csdn.net/zgy666/article/details/105456569) +> [内核虚拟化:虚拟中断注入](https://blog.csdn.net/weixin_46324627/article/details/136661252?csdn_share_tail=%7B%22type%22%3A%22blog%22%2C%22rType%22%3A%22article%22%2C%22rId%22%3A%22136661252%22%2C%22source%22%3A%22weixin_46324627%22%7D) ## 挂载点 diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md index 7b9fdcd1d..e3733231b 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_vcpu.md @@ -10,7 +10,9 @@ kvm watcher中的kvm vcpu子功能模块是设计用于监控和分析虚拟化 KVM 暂停轮询系统是 KVM 内的一项功能,其在某些情况下可以通过在vCPU 放弃运行并让出后,在主机中进行一段时间的轮询来降低虚拟机的延迟。简而言之,当vCPU 放弃运行(即执行 cede 操作)或在 PowerPC 中,当一个虚拟核心(vcore)的所有vCPU 都放弃运行时,主机内核会在将 CPU 让给调度程序之前,通过轮询等待唤醒条件。 -轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。 具体的源码流畅及详细原理可以参考:[内核虚拟化:vCPU之暂停轮询系统](https://blog.csdn.net/weixin_46324627/article/details/135342622?spm=1001.2014.3001.5501) 
+轮询在某些情况下提供了延迟优势,尤其是在虚拟机可以非常快速地再次运行的情况下。这至少可以通过减少通过调度程序的开销来节省一些时间,通常在几微秒的数量级上,尽管性能优势取决于工作负载。在轮询间隔内,如果没有唤醒源到达,或者运行队列上有其他可运行的任务,则会调用调度程序。因此,在具有非常短唤醒周期的工作负载中,halt轮询特别有用,因为最小化了halt轮询的时间,同时可以避免调用调度程序的时间花费。 + +具体的源码流程及详细原理可以参考:[内核虚拟化:vCPU之暂停轮询系统](https://blog.csdn.net/weixin_46324627/article/details/135342622?spm=1001.2014.3001.5501) ### dirty page From 0c004cf75f3647caea4292ae39744f81c9019afe Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Wed, 13 Mar 2024 23:10:55 +0800 Subject: [PATCH 25/46] =?UTF-8?q?=E4=BF=AE=E6=94=B9=20Makefile=20=E4=BB=A5?= =?UTF-8?q?=E6=8F=90=E9=AB=98=E5=8F=AF=E8=AF=BB=E6=80=A7=E5=92=8C=E5=8F=AF?= =?UTF-8?q?=E7=BB=B4=E6=8A=A4=E6=80=A7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/Makefile | 147 ++++++++++++++------------ 1 file changed, 80 insertions(+), 67 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index 3c9cca0a9..a5d2d2eb0 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -8,81 +8,94 @@ ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ | sed 's/loongarch64/loongarch/') APP = src/kvm_watcher -OPTIONS = -f -w -n -d -e +OPTIONS = -fm -w -n -d -e -h -c -i -# 共同规则1 -define common_rules1 - # 安装依赖 - sudo apt install clang libelf1 libelf-dev zlib1g-dev libbpf-dev linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r) - # 生成vmlinux.h文件 +# 编译器标志 +CFLAGS=-g -O2 -Wall +BPF_CFLAGS=-g -O2 -target bpf + +# 要链接的库 +LIBS=-lbpf -lelf -lz -lzstd + +# 头文件目录 +INCLUDE_DIRS=-I/usr/include/x86_64-linux-gnu -I. 
+ +# qemu 命令行参数变量化 +QEMU_CMD=sudo qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -smp 4 -drive file=cirros-0.5.2-x86_64-disk.img,format=qcow2 -boot c -nographic + +CIRROS_IMG_URL=https://gitee.com/nan-shuaibo/cirros/releases/download/0.5.2/cirros-0.5.2-x86_64-disk.img +CIRROS_IMG_FILE=cirros-0.5.2-x86_64-disk.img + +# 定义检查虚拟化支持的命令 +CHECK_VIRT_SUPPORT = [ $$(grep -Eoc '(vmx|svm)' /proc/cpuinfo) -eq 0 ] + +# 定义检查 qemu-system-x86_64 进程是否存在的命令 +CHECK_QEMU_RUNNING = [ -z "$$(pgrep -f qemu-system-x86_64)" ] + +# 默认目标 +.PHONY: default +default: bpf + +# 安装必要的依赖 +.PHONY: deps +deps: + sudo apt-get update + sudo apt-get install -y clang libelf1 libelf-dev zlib1g-dev libbpf-dev \ + linux-tools-$$(uname -r) linux-cloud-tools-$$(uname -r) qemu-kvm wget + +# 生成 vmlinux.h +.PHONY: vmlinux +vmlinux: bpftool btf dump file /sys/kernel/btf/kvm format c > ./include/vmlinux.h -endef - -# 共同规则2 -define common_rules2 - # 编译ebpf程序 - clang -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) -I/usr/include/x86_64-linux-gnu -I. -c $@.bpf.c -o $@.bpf.o - bpftool gen skeleton $@.bpf.o > $@.skel.h - clang -g -O2 -Wall -I . 
-c $@.c -o $@.o - clang -Wall -O2 -g $@.o -static -lbpf -lelf -lz -lzstd -o $(notdir $@) # 6.5内核编译需要lzstd库 -endef - -# 判断是否已安装 qemu-system-x86_64 -ifeq (,$(shell which qemu-system-x86_64)) - INSTALL_QEMU = sudo apt update && sudo apt install qemu-kvm -endif - -bpf: $(APP) -test: $(APP) - - -.PHONY: $(APP) -$(APP): -# 如果参数为test: -ifeq ($(MAKECMDGOALS),test) -ifeq ($(shell grep -Eoc '(vmx|svm)' /proc/cpuinfo),0) - $(error "The CPU in your device does not support virtualization!") -endif + +# bpf 目标 +.PHONY: bpf +bpf: vmlinux + # 编译BPF程序 + clang $(BPF_CFLAGS) -D__TARGET_ARCH_$(ARCH) $(INCLUDE_DIRS) -c $(APP).bpf.c -o $(APP).bpf.o + # 生成BPF骨架文件 + bpftool gen skeleton ${APP}.bpf.o > $(APP).skel.h + # 编译用户空间应用程序 + clang $(CFLAGS) $(INCLUDE_DIRS) -c $(APP).c -o ${APP}.o + # 将用户空间应用程序与库链接 + clang -Wall $(CFLAGS) ${APP}.o $(LIBS) -o $(notdir $(APP)) + echo "BPF program compiled successfully." + +.PHONY: test +test: bpf + @if $(CHECK_VIRT_SUPPORT); then \ + echo "The CPU in your device does not support virtualization!"; \ + exit 1; \ + fi # 加载KVM模块 sudo modprobe kvm && sudo modprobe kvm-intel - $(common_rules1) - $(common_rules2) -ifeq ($(wildcard ./cirros-0.5.2-x86_64-disk.img),) - wget https://gitee.com/nan-shuaibo/cirros/releases/download/0.5.2/cirros-0.5.2-x86_64-disk.img -endif - # 安装 qemu - $(INSTALL_QEMU) + @if [ ! -f ./${CIRROS_IMG_FILE} ]; then \ + wget $(CIRROS_IMG_URL) -O $(CIRROS_IMG_FILE); \ + fi # 启动虚拟机 - ifneq ($(shell pgrep -f qemu-system-x86_64),) - echo "\nVirtual machine is running..." - sleep 1 - else - echo "\nWaiting for the virtual machine to start..." - sudo qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -smp 4 -drive file=cirros-0.5.2-x86_64-disk.img,format=qcow2 -boot c -nographic >/dev/null 2>&1 & - sleep 8 - echo "\nVirtual machine is running..." 
- sleep 1 - endif + @if $(CHECK_QEMU_RUNNING); then \ + echo "\nWaiting for the virtual machine to start..."; \ + $(QEMU_CMD) >/dev/null 2>&1 & \ + sleep 8; \ + echo "\nVirtual machine is running..."; \ + else \ + echo "\nVirtual machine is already running..."; \ + fi # 运行kvm_watcher - echo "\nPreparing to run the kvm_watcher program..." + echo "\nPreparing to run the $(notdir $(APP)) program..."; \ for opt in $(OPTIONS); do \ - echo "\nrunning kvm_watcher with option: $$opt"; \ - sleep 2 ;\ - sudo ./kvm_watcher $$opt -t 3; \ - done + echo "\nrunning $(notdir $(APP)) with option: $$opt"; \ + sleep 2; \ + sudo ./$(notdir $(APP)) $$opt -t 3; \ + done # 结束qemu虚拟机进程 - -sudo pkill -f "qemu-system-x86_64 -enable-kvm -cpu host -m 2048 -smp 4 -drive file=cirros-0.5.2-x86_64-disk.img,format=qcow2 -boot c -nographic" - echo "\nSuccessful test run of the kvm_watcher program." -# 如果参数为空或者为bpf -else -ifeq ($(wildcard ./include/vmlinux.h),) - $(common_rules1) -endif - $(common_rules2) - echo "\nCompilation successful!" -endif + -sudo pkill -f "$(QEMU_CMD)" + echo "\nSuccessful test run of the $(notdir $(APP)) program." 
+ + clean: - cd src && rm -f *.o *.skel.h - sudo rm -rf $(notdir $(APP)) include/vmlinux.h temp* + cd src && rm -f *.o *.skel.h *.bpf.o + rm -f $(notdir $(APP)) + rm -rf include/vmlinux.h From 1eb2e5d28abef02cc14eac2085ed9dd46d90b069 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Wed, 13 Mar 2024 23:18:11 +0800 Subject: [PATCH 26/46] update yml --- .github/workflows/kvm_watcher.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index 0a3f7525a..00cda175e 100644 --- a/.github/workflows/kvm_watcher.yml +++ b/.github/workflows/kvm_watcher.yml @@ -23,5 +23,5 @@ jobs: - name: Test program execution run: | cd eBPF_Supermarket/kvm_watcher/ - make - + make test + From 72829e559f75176b12bbf52e0de333c453319394 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Wed, 13 Mar 2024 23:19:34 +0800 Subject: [PATCH 27/46] update yml --- .github/workflows/kvm_watcher.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index 00cda175e..9eb2ef72b 100644 --- a/.github/workflows/kvm_watcher.yml +++ b/.github/workflows/kvm_watcher.yml @@ -23,5 +23,6 @@ jobs: - name: Test program execution run: | cd eBPF_Supermarket/kvm_watcher/ + make deps make test From 732e290718536f17455f2ba3ed388dd4432f772c Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Wed, 13 Mar 2024 23:20:37 +0800 Subject: [PATCH 28/46] update yml --- .github/workflows/kvm_watcher.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/kvm_watcher.yml b/.github/workflows/kvm_watcher.yml index 9eb2ef72b..dc79c8d50 100644 --- a/.github/workflows/kvm_watcher.yml +++ b/.github/workflows/kvm_watcher.yml @@ -24,5 +24,5 @@ jobs: run: | cd eBPF_Supermarket/kvm_watcher/ make deps - make test + make From 924f479f15925f5803536bbe0491dc327db29442 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 15 Mar 2024 16:26:58 +0800 Subject: [PATCH 29/46] 
=?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/README.md | 87 +++++++++++++++----------- 1 file changed, 51 insertions(+), 36 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/README.md b/eBPF_Supermarket/kvm_watcher/README.md index 796ea58cd..f1d99cbb8 100755 --- a/eBPF_Supermarket/kvm_watcher/README.md +++ b/eBPF_Supermarket/kvm_watcher/README.md @@ -2,26 +2,21 @@ ## 一、项目简介 -`kvm_watcher` 是一个基于 eBPF 技术的项目,旨在在宿主机侧监控和提取 KVM 虚拟机的性能指标,同时对宿主机性能影响较小。该项目基于 eBPF 的实时监控方案,通过在宿主机中执行eBPF程序,实时捕获有关 KVM 虚拟机的关键性能数据和性能事件,提供全面的性能数据,帮助管理员优化虚拟化环境,改善虚拟机的运行效率和响应性,并且允许用户根据实际需求选择监控的指标和事件,实现个性化配置。 +`kvm_watcher` 是一个基于 eBPF 技术的项目,旨在在宿主机侧监控和提取 KVM 虚拟机的性能指标,用于诊断对 `kvm` 可见的客户机行为,特别是与客户机相关的问题。同时对宿主机性能影响较小。该项目基于 eBPF 的实时监控方案,通过在宿主机中执行eBPF程序,实时捕获有关 KVM 虚拟机的关键性能数据和性能事件,提供全面的性能数据,帮助管理员优化虚拟化环境,改善虚拟机的运行效率和响应性,并且允许用户根据实际需求选择监控的指标和事件,实现个性化配置。 + +> 局限:鉴于不同体系结构的硬件辅助虚拟化技术,目前我们只适用于intel中的vmx技术。 ## 二、功能介绍 -`kvm_watcher`是一款基于eBPF的kvm检测工具,其旨在使用户方便快捷在宿主机侧获取kvm虚拟机中的各种信息。 +`kvm_watcher`是一款基于eBPF的kvm虚拟机检测工具,其旨在使用户方便快捷在宿主机侧获取kvm虚拟机中的各种信息,报告所有正在运行的guest行为。 目前,其实现的功能主要包括: -- **VM Exit 事件分析:** - - 捕获 VM Exit 事件,包括发生的时间戳、原因、次数以及处理时延等信息。 -- **KVM mmu事件分析:** - - 监控 KVM 中的 mmu page fault 和mmio page fault 事件,记录gva、hva、pfn、错误类型和处理时延等关键信息。 - - 实时监控kvm虚拟机中产生的dirty page,记录脏页地址、变脏时间、变脏次数和memslot等相关信息。 -- **vCPU相关指标分析:** - - 记录有关vCPU的性能指标,包括唤醒时的时间戳,halt持续时间,vCPU id等相关信息。 - - 实时监控vCPU的halt-polling时间的变化信息,包括vCPU的线程tid,变化类型,变化前后的halt-polling时间等信息。 -- **kvm中中断注入时相关信息:** - - PIC:实时记录PIC芯片类型,中断引脚编号,中断触发方式,是否可屏蔽,处理延时,是否发生合并等信息。 - - IOAPIC: - - MSI: +- **[VM Exit 事件分析](./docs/kvm_exit.md)** +- **[KVM mmu事件分析](./docs/kvm_mmu.md)** +- **[vCPU相关指标分析](./docs/kvm_vcpu.md)** +- **[kvm中中断注入记录](./docs/kvm_irq.md)** +- **[hypercall信息统计](./docs/kvm_hypercall.md)** ## 三、使用方法 @@ -43,7 +38,8 @@ sudo modprobe kvm && sudo modprobe kvm-intel //加载kvm模块 **编译运行:** ``` -make +make deps +make bpf sudo ./kvm_watcher [options] 
make clean ``` @@ -56,14 +52,20 @@ make clean Usage: kvm_watcher [OPTION...] BPF program used for monitoring KVM event + + -c, --kvm_irqchip Monitor the irqchip setting information in KVM + VM. -d, --mark_page_dirty Monitor virtual machine dirty page information. -e, --vm_exit Monitoring the event of vm exit. -f, --kvmmmu_page_fault Monitoring the data of kvmmmu page fault. - -c, --kvm_irq Monitor the interrupt information in KVM VM. - -m, --mmio Monitoring the data of mmio page fault..(The -f option must be specified.) - -n, --halt_poll_ns Monitoring the variation in vCPU halt-polling time. + -h, --hypercall Monitor the hypercall information in KVM VM + -i, --irq_inject Monitor the virq injection information in KVM VM + -l, --kvm_ioctl Monitoring the KVM IOCTL. + -m, --mmio Monitoring the data of mmio page fault.(The -f + option must be specified.) + -n, --halt_poll_ns Monitoring the variation in vCPU halt-polling + time. -p, --vm_pid=PID Specify the virtual machine pid to monitor. - -s, --stat Display statistical data.(The -e option must be specified.) -t, --monitoring_time=SEC Time for monitoring. -w, --vcpu_wakeup Monitoring the wakeup of vcpu. 
-?, --help Give this help list @@ -71,20 +73,22 @@ BPF program used for monitoring KVM event -V, --version Print program version ``` -`-h`:输出帮助信息 - -`-e`:记录vm exit事件信息 +`-H`:输出帮助信息 -`-s`:输出最后的vm exit事件统计信息(需要和`-e`一同使用) +`-e`:统计vm exit事件信息 `-f`:记录kvmmmu缺页信息 -`-c:记录kvm中断芯片设置相关信息 +`-c`:记录kvm中断芯片设置相关信息 + +`-h`:统计hypercall发生的信息 `-m`:记录mmio缺页信息(需要和`-f`一同使用) `-d`:记录kvm脏页信息 +`-h`:记录hypercall超级调用信息 + `-n`:记录vcpu的halt-polling相关信息 `-w`:记录vcpu唤醒时的相关信息 @@ -96,18 +100,29 @@ BPF program used for monitoring KVM event ## 四、代码结构 ``` -├── include -│ ├── kvm_exits.h //vm exit事件相关的内核bpf程序 -│ ├── kvm_mmu.h //kvmmmu相关的内核bpf程序 -│ ├── kvm_irq.h //kvm中断相关内核bpf程序 -│ ├── kvm_vcpu.h //vcpu相关内核bpf程序 -│ └── kvm_watcher.h //项目公用头文件 -├── Makefile //编译脚本 -├── src -│ ├── kvm_watcher.bpf.c //内核态bpf入口程序 -│ └── kvm_watcher.c //用户态bpf程序 -└── temp - └── dirty_temp //脏页临时文件 +├── docs //功能模块说明文档 +│ ├── kvm_exit.md +│ ├── kvm_hypercall.md +│ ├── kvm_irq.md +│ ├── kvm_mmu.md +│ └── kvm_vcpu.md +├── include //内核态bpf程序 +│ ├── kvm_exits.h +│ ├── kvm_hypercall.h +│ ├── kvm_ioctl.h +│ ├── kvm_irq.h +│ ├── kvm_mmu.h +│ ├── kvm_vcpu.h +│ └── kvm_watcher.h //公共头文件 +├── kvm_exit_bcc //bcc版本的vm exit实现 +│ ├── kvmexit_example.txt +│ └── kvmexit.py +├── Makefile //编译脚本 +├── README.md +├── src +│ ├── kvm_watcher.bpf.c //内核态bpf程序入口 +│ └── kvm_watcher.c //用户态bpf程序 +└── temp //临时文件目录 ``` ## 五、测试 From 82d5a94e5b8d12249a6fe058e15ec28a27accb1b Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 15 Mar 2024 17:10:25 +0800 Subject: [PATCH 30/46] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E6=96=87=E6=A1=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md | 11 +- .../kvm_watcher/docs/kvm_hypercall.md | 208 +++++++++--------- 2 files changed, 117 insertions(+), 102 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md index 381befd0c..6e4e27f0b 100644 --- 
a/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_exit.md @@ -30,6 +30,15 @@ ![VM entry 与 VM exit](https://ctf-wiki.org/pwn/virtualization/basic-knowledge/figure/vm-entry-and-exit.png) + + +## 挂载点 + +| 类型 | 名称 | +| ---------- | --------- | +| tracepoint | kvm_exit | +| tracepoint | kvm_entry | + ## 示例输出 4391为主机上的虚拟机进程,4508、4509、4510...分别是虚拟机中的vcpu子进程,每隔两秒输出虚拟机中产生的exit事件及其处理延时等信息。 @@ -89,4 +98,4 @@ pid tid total_time max_time min_time counts re - **VM Exit 原因统计**:记录并展示触发 VM Exit 的具体原因,帮助用户理解 VM Exit 发生的上下文和背景。 - **VM Exit 延时分析**:统计每次 VM Exit 处理的最大、最小和总共延时,为性能分析提供量化数据。 - **VM Exit 次数计数**:计算每种类型的 VM Exit 发生的次数,帮助识别最频繁的性能瓶颈。 -- **PID、TID号**:其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu的进程号 \ No newline at end of file +- **PID、TID号**:其中PID为主机侧的虚拟机进程号,TID为虚拟机内部的vcpu**的进程号** \ No newline at end of file diff --git a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md index 3030aaff8..629d11b37 100644 --- a/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md +++ b/eBPF_Supermarket/kvm_watcher/docs/kvm_hypercall.md @@ -7,6 +7,7 @@ kvm watcher 的 kvm hypercall 子模块是一个专为 KVM 虚拟化环境设计 ## 原理介绍 在虚拟化环境中,Hypercall 机制是虚拟机(VM)从非特权模式(no root mode)切换到特权模式(root mode)的一种方式,类似于传统操作系统中从用户态切换到内核态的系统调用(syscall)。KVM(Kernel-based Virtual Machine)通过支持 Hypercall 机制,提供了一种高效的方式让虚拟机的 Guest OS 执行一些需要更高权限的操作,比如更新页表或访问物理资源等,这些操作由于虚拟机的非特权域无法完成,因此通过 Hypercall 交由 Hypervisor 来执行。 +
hypercall的发起需求触发vm exit原因为EXIT_REASON_VMCALL,其对应的处理函数为: @@ -24,112 +25,118 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { ``` int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { - unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit; - - // 检查是否启用了Xen超级调用,如果是,则调用Xen超级调用处理函数 - if (kvm_xen_hypercall_enabled(vcpu->kvm)) - return kvm_xen_hypercall(vcpu); - - // 检查是否启用了Hypervisor超级调用,如果是,则调用Hypervisor超级调用处理函数 - if (kvm_hv_hypercall_enabled(vcpu)) - return kvm_hv_hypercall(vcpu); - - // 从寄存器中读取超级调用号及参数 - nr = kvm_rax_read(vcpu); - a0 = kvm_rbx_read(vcpu); - a1 = kvm_rcx_read(vcpu); - a2 = kvm_rdx_read(vcpu); - a3 = kvm_rsi_read(vcpu); - - // 记录超级调用的追踪信息 - trace_kvm_hypercall(nr, a0, a1, a2, a3); - - // 检查是否为64位超级调用 - op_64_bit = is_64_bit_hypercall(vcpu); - if (!op_64_bit) { - nr &= 0xFFFFFFFF; - a0 &= 0xFFFFFFFF; - a1 &= 0xFFFFFFFF; - a2 &= 0xFFFFFFFF; - a3 &= 0xFFFFFFFF; - } - - // 检查当前CPU的特权级是否为0 - if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { - ret = -KVM_EPERM; - goto out; - } - - ret = -KVM_ENOSYS; - - // 根据超级调用号执行相应的操作 - switch (nr) { - case KVM_HC_VAPIC_POLL_IRQ: - ret = 0; - break; - case KVM_HC_KICK_CPU: - // 处理CPU唤醒的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) - break; - - kvm_pv_kick_cpu_op(vcpu->kvm, a1); - kvm_sched_yield(vcpu, a1); - ret = 0; - break; + unsigned long nr, a0, a1, a2, a3, ret; + int op_64_bit; + + // 检查是否启用了Xen超级调用,如果是,则调用Xen超级调用处理函数 + if (kvm_xen_hypercall_enabled(vcpu->kvm)) + return kvm_xen_hypercall(vcpu); + + // 检查是否启用了Hypervisor超级调用,如果是,则调用Hypervisor超级调用处理函数 + if (kvm_hv_hypercall_enabled(vcpu)) + return kvm_hv_hypercall(vcpu); + + // 从寄存器中读取超级调用号及参数 + nr = kvm_rax_read(vcpu); + a0 = kvm_rbx_read(vcpu); + a1 = kvm_rcx_read(vcpu); + a2 = kvm_rdx_read(vcpu); + a3 = kvm_rsi_read(vcpu); + + // 记录超级调用的追踪信息 + trace_kvm_hypercall(nr, a0, a1, a2, a3); + + // 检查是否为64位超级调用 + op_64_bit = is_64_bit_hypercall(vcpu); + if (!op_64_bit) { + nr &= 0xFFFFFFFF; + a0 &= 0xFFFFFFFF; + a1 &= 0xFFFFFFFF; + a2 &= 0xFFFFFFFF; + 
a3 &= 0xFFFFFFFF; + } + + // 检查当前CPU的特权级是否为0 + if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { + ret = -KVM_EPERM; + goto out; + } + + ret = -KVM_ENOSYS; + + // 根据超级调用号执行相应的操作 + switch (nr) { + case KVM_HC_VAPIC_POLL_IRQ: + ret = 0; + break; + case KVM_HC_KICK_CPU: + // 处理CPU唤醒的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) + break; + + kvm_pv_kick_cpu_op(vcpu->kvm, a1); + kvm_sched_yield(vcpu, a1); + ret = 0; + break; #ifdef CONFIG_X86_64 - case KVM_HC_CLOCK_PAIRING: - // 处理时钟配对的超级调用 - ret = kvm_pv_clock_pairing(vcpu, a0, a1); - break; + case KVM_HC_CLOCK_PAIRING: + // 处理时钟配对的超级调用 + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; #endif - case KVM_HC_SEND_IPI: - // 处理发送中断请求的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) - break; - - ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); - break; - case KVM_HC_SCHED_YIELD: - // 处理调度让出的超级调用 - if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) - break; - - kvm_sched_yield(vcpu, a0); - ret = 0; - break; - case KVM_HC_MAP_GPA_RANGE: - // 处理GPA范围映射的超级调用 - ret = -KVM_ENOSYS; - if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) - break; - - // 设置KVM_EXIT_HYPERCALL退出类型,并填充相关信息 - vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; - vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; - vcpu->run->hypercall.args[0] = a0; - vcpu->run->hypercall.args[1] = a1; - vcpu->run->hypercall.args[2] = a2; - vcpu->run->hypercall.longmode = op_64_bit; - vcpu->arch.complete_userspace_io = complete_hypercall_exit; - return 0; - default: - ret = -KVM_ENOSYS; - break; - } + case KVM_HC_SEND_IPI: + // 处理发送中断请求的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) + break; + + ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit); + break; + case KVM_HC_SCHED_YIELD: + // 处理调度让出的超级调用 + if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) + break; + + kvm_sched_yield(vcpu, a0); + ret = 0; + break; + case KVM_HC_MAP_GPA_RANGE: + // 处理GPA范围映射的超级调用 + ret = -KVM_ENOSYS; + if 
(!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) + break; + + // 设置KVM_EXIT_HYPERCALL退出类型,并填充相关信息 + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; + vcpu->run->hypercall.args[0] = a0; + vcpu->run->hypercall.args[1] = a1; + vcpu->run->hypercall.args[2] = a2; + vcpu->run->hypercall.longmode = op_64_bit; + vcpu->arch.complete_userspace_io = complete_hypercall_exit; + return 0; + default: + ret = -KVM_ENOSYS; + break; + } out: - // 如果不是64位超级调用,则返回值需要截断为32位 - if (!op_64_bit) - ret = (u32)ret; - kvm_rax_write(vcpu, ret); - - // 更新超级调用统计信息,并跳过被模拟的指令 - ++vcpu->stat.hypercalls; - return kvm_skip_emulated_instruction(vcpu); + // 如果不是64位超级调用,则返回值需要截断为32位 + if (!op_64_bit) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + + // 更新超级调用统计信息,并跳过被模拟的指令 + ++vcpu->stat.hypercalls; + return kvm_skip_emulated_instruction(vcpu); } ``` +## 挂载点 + +| 类型 | 名称 | +| ------ | --------------------- | +| fentry | kvm_emulate_hypercall | + ## 示例输出 ``` @@ -220,5 +227,4 @@ TIME(ms) COMM PID VCPU_ID NAME HYPERCALLS A - **VCPU_ID**:对应的vcpu标识符 - **NAME**:所发生的hypercall名称 - **COUNTS**:当前时间段内hypercall发送的次数 -- **HYPERCALLS**:自虚拟机启动以来,每个vcpu上发生的hypercall的次数 - +- **HYPERCALLS**:自虚拟机启动以来,每个vcpu上发生的hypercall的次数 \ No newline at end of file From f5f43c10f4b7bfc052cdcbde0ede8fcda8a11a7f Mon Sep 17 00:00:00 2001 From: Lzx Date: Wed, 20 Mar 2024 23:32:16 -0700 Subject: [PATCH 31/46] =?UTF-8?q?add=20cma=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/mem_watcher/cma/Makefile | 114 ++++++++++++++++++ .../mem_watcher/cma/cma_monitor.bpf.c | 62 ++++++++++ .../mem_watcher/cma/cma_monitor.c | 51 ++++++++ .../mem_watcher/cma/cma_monitor.h | 10 ++ 4 files changed, 237 insertions(+) create mode 100644 eBPF_Supermarket/mem_watcher/cma/Makefile create mode 100644 eBPF_Supermarket/mem_watcher/cma/cma_monitor.bpf.c create mode 100644 
eBPF_Supermarket/mem_watcher/cma/cma_monitor.c create mode 100644 eBPF_Supermarket/mem_watcher/cma/cma_monitor.h diff --git a/eBPF_Supermarket/mem_watcher/cma/Makefile b/eBPF_Supermarket/mem_watcher/cma/Makefile new file mode 100644 index 000000000..c3a2cd4fc --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/cma/Makefile @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../libbpf-bootstrap/libbpf/src) +BPFTOOL_SRC := $(abspath ../libbpf-bootstrap/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool + +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../libbpf-bootstrap/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = cma_monitor + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +# Build application binary 
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: + diff --git a/eBPF_Supermarket/mem_watcher/cma/cma_monitor.bpf.c b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.bpf.c new file mode 100644 index 000000000..dd18dd50c --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.bpf.c @@ -0,0 +1,62 @@ +#include "vmlinux.h" +#include +#include +#include +#include "cma_monitor.h" + +#define INTERVAL_MAX 6U +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, unsigned); + __type(value, u64); +} count_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, u32); + __type(value, u64); +} time_map SEC(".maps"); + + +SEC("kretprobe/cma_alloc") +int BPF_KRETPROBE(cma_alloc) +{ + u32 pid = bpf_get_current_pid_tgid(); + u64 ts = bpf_ktime_get_ns(); + + bpf_map_update_elem(&time_map, &pid, &ts, BPF_ANY); + + return 0; +} + +SEC("kprobe/alloc_contig_range") +int BPF_KRETPROBE(alloc_contig_range) +{ + u32 pid = bpf_get_current_pid_tgid(); + u64 tm = bpf_ktime_get_ns(); + u64 *tsp = bpf_map_lookup_elem(&time_map, &pid); + + if (tsp) + tm -= *tsp; + else + return 1; + + unsigned key = tm / 10000000; + if (key > INTERVAL_MAX - 1) + key = INTERVAL_MAX - 1; + + u64 *value = bpf_map_lookup_elem(&count_map, &key); + if (value) + *value += 1; + else { + u64 init_value = 1; + bpf_map_update_elem(&count_map, &key, &init_value, BPF_ANY); + } + + bpf_map_delete_elem(&time_map, &pid); + + return 0; +} diff --git a/eBPF_Supermarket/mem_watcher/cma/cma_monitor.c b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.c new file mode 100644 index 000000000..cecf5bc82 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.c @@ -0,0 +1,51 @@ +#include +#include 
+#include +#include +#include +#include +#include +//#include +#include "cma_monitor.h" +#include "cma_monitor.skel.h" + +#define INTERVAL_MAX 6U + +int main(int argc, char **argv) +{ + /* + char file_name[200]; + + snprintf(file_name, sizeof(file_name), "%s_kern.o", argv[0]); + if (load_bpf_file(file_name)) { + printf("%s", bpf_log_buf); + + return 1; + }*/ + struct cma_monitor_bpf *skel = cma_monitor_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + int fd = bpf_map__fd(skel->maps.time_map); + int key; + + for (;;) { + sleep(5); + + for (key = 0; key < INTERVAL_MAX; key++) { + unsigned long long value = 0; + bpf_map_lookup_elem(fd, &key, &value); + + if (key < INTERVAL_MAX - 1) + printf("Range %dms - %dms\tCount:%llu\n", + key * 10, (key + 1) * 10, value); + else + printf("Over 50ms\t\tCount:%llu\n", value); + } + + printf("=========================================\n"); + } + + return 0; +} diff --git a/eBPF_Supermarket/mem_watcher/cma/cma_monitor.h b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.h new file mode 100644 index 000000000..b9b3eddb1 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/cma/cma_monitor.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2022 Jacky Yin */ +#ifndef __CMA_MONITOR_H +#define __CMA_MONITOR_H + + + + + +#endif /* __CMA_MONITOR_H */ From 4f57303760bdd1fb011f66a8224bfb5c3b257882 Mon Sep 17 00:00:00 2001 From: Lzx Date: Wed, 20 Mar 2024 23:48:03 -0700 Subject: [PATCH 32/46] =?UTF-8?q?add=20ion=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/mem_watcher/ion/Makefile | 114 ++++++++++++++++++ .../mem_watcher/ion/ion_monitor.bpf.c | 63 ++++++++++ .../mem_watcher/ion/ion_monitor.c | 57 +++++++++ .../mem_watcher/ion/ion_monitor.h | 10 ++ 4 files changed, 244 insertions(+) create mode 100644 eBPF_Supermarket/mem_watcher/ion/Makefile create mode 100644 
eBPF_Supermarket/mem_watcher/ion/ion_monitor.bpf.c create mode 100644 eBPF_Supermarket/mem_watcher/ion/ion_monitor.c create mode 100644 eBPF_Supermarket/mem_watcher/ion/ion_monitor.h diff --git a/eBPF_Supermarket/mem_watcher/ion/Makefile b/eBPF_Supermarket/mem_watcher/ion/Makefile new file mode 100644 index 000000000..e9e8ef726 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/ion/Makefile @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +OUTPUT := .output +CLANG ?= clang +LIBBPF_SRC := $(abspath ../libbpf-bootstrap/libbpf/src) +BPFTOOL_SRC := $(abspath ../libbpf-bootstrap/bpftool/src) +LIBBPF_OBJ := $(abspath $(OUTPUT)/libbpf.a) +BPFTOOL_OUTPUT ?= $(abspath $(OUTPUT)/bpftool) +BPFTOOL ?= $(BPFTOOL_OUTPUT)/bootstrap/bpftool + +ARCH ?= $(shell uname -m | sed 's/x86_64/x86/' \ + | sed 's/arm.*/arm/' \ + | sed 's/aarch64/arm64/' \ + | sed 's/ppc64le/powerpc/' \ + | sed 's/mips.*/mips/' \ + | sed 's/riscv64/riscv/' \ + | sed 's/loongarch64/loongarch/') +VMLINUX := ../libbpf-bootstrap/vmlinux/$(ARCH)/vmlinux.h +# Use our own libbpf API headers and Linux UAPI headers distributed with +# libbpf to avoid dependency on system-wide headers, which could be missing or +# outdated +INCLUDES := -I$(OUTPUT) -I../../libbpf/include/uapi -I$(dir $(VMLINUX)) +CFLAGS := -g -Wall +ALL_LDFLAGS := $(LDFLAGS) $(EXTRA_LDFLAGS) + +APPS = ion_monitor + +# Get Clang's default includes on this system. We'll explicitly add these dirs +# to the includes list when compiling with `-target bpf` because otherwise some +# architecture-specific dirs will be "missing" on some architectures/distros - +# headers such as asm/types.h, asm/byteorder.h, asm/socket.h, asm/sockios.h, +# sys/cdefs.h etc. might be missing. +# +# Use '-idirafter': Don't interfere with include mechanics except where the +# build would have failed anyways. 
+CLANG_BPF_SYS_INCLUDES = $(shell $(CLANG) -v -E - </dev/null 2>&1 \ + | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') + +ifeq ($(V),1) + Q = + msg = +else + Q = @ + msg = @printf ' %-8s %s%s\n' \ + "$(1)" \ + "$(patsubst $(abspath $(OUTPUT))/%,%,$(2))" \ + "$(if $(3), $(3))"; + MAKEFLAGS += --no-print-directory +endif + +define allow-override + $(if $(or $(findstring environment,$(origin $(1))),\ + $(findstring command line,$(origin $(1)))),,\ + $(eval $(1) = $(2))) +endef + +$(call allow-override,CC,$(CROSS_COMPILE)cc) +$(call allow-override,LD,$(CROSS_COMPILE)ld) + +.PHONY: all +all: $(APPS) + +.PHONY: clean +clean: + $(call msg,CLEAN) + $(Q)rm -rf $(OUTPUT) $(APPS) + +$(OUTPUT) $(OUTPUT)/libbpf $(BPFTOOL_OUTPUT): + $(call msg,MKDIR,$@) + $(Q)mkdir -p $@ + +# Build libbpf +$(LIBBPF_OBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf + $(call msg,LIB,$@) + $(Q)$(MAKE) -C $(LIBBPF_SRC) BUILD_STATIC_ONLY=1 \ + OBJDIR=$(dir $@)/libbpf DESTDIR=$(dir $@) \ + INCLUDEDIR= LIBDIR= UAPIDIR= \ + install + +# Build bpftool +$(BPFTOOL): | $(BPFTOOL_OUTPUT) + $(call msg,BPFTOOL,$@) + $(Q)$(MAKE) ARCH= CROSS_COMPILE= OUTPUT=$(BPFTOOL_OUTPUT)/ -C $(BPFTOOL_SRC) bootstrap + +# Build BPF code +$(OUTPUT)/%.bpf.o: %.bpf.c $(LIBBPF_OBJ) $(wildcard %.h) $(VMLINUX) | $(OUTPUT) $(BPFTOOL) + $(call msg,BPF,$@) + $(Q)$(CLANG) -g -O2 -target bpf -D__TARGET_ARCH_$(ARCH) \ + $(INCLUDES) $(CLANG_BPF_SYS_INCLUDES) \ + -c $(filter %.c,$^) -o $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + $(Q)$(BPFTOOL) gen object $@ $(patsubst %.bpf.o,%.tmp.bpf.o,$@) + +# Generate BPF skeletons +$(OUTPUT)/%.skel.h: $(OUTPUT)/%.bpf.o | $(OUTPUT) $(BPFTOOL) + $(call msg,GEN-SKEL,$@) + $(Q)$(BPFTOOL) gen skeleton $< > $@ + +# Build user-space code +$(patsubst %,$(OUTPUT)/%.o,$(APPS)): %.o: %.skel.h + +$(OUTPUT)/%.o: %.c $(wildcard %.h) | $(OUTPUT) + $(call msg,CC,$@) + $(Q)$(CC) $(CFLAGS) $(INCLUDES) -c $(filter %.c,$^) -o $@ + +# Build application binary 
+$(APPS): %: $(OUTPUT)/%.o $(LIBBPF_OBJ) | $(OUTPUT) + $(call msg,BINARY,$@) + $(Q)$(CC) $(CFLAGS) $^ $(ALL_LDFLAGS) -lelf -lz -o $@ + +# delete failed targets +.DELETE_ON_ERROR: + +# keep intermediate (.skel.h, .bpf.o, etc) targets +.SECONDARY: + diff --git a/eBPF_Supermarket/mem_watcher/ion/ion_monitor.bpf.c b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.bpf.c new file mode 100644 index 000000000..e91936618 --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.bpf.c @@ -0,0 +1,63 @@ +#include "vmlinux.h" +#include +#include +#include +#include "ion_monitor.h" + +#define INTERVAL_MAX 6U + +char LICENSE[] SEC("license") = "Dual BSD/GPL"; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, unsigned); + __type(value, u64); +} count_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 8192); + __type(key, u32); + __type(value, u64); +} time_map SEC(".maps"); + +SEC("kprobe/ion_alloc") +int bpf_prog1(void *ctx) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 time = bpf_ktime_get_ns(); + u64 ts = bpf_ktime_get_ns(); + bpf_map_update_elem(&time_map, &pid, &ts, BPF_ANY); + + return 0; +} + +SEC("kprobe/ion_ioctl") +int bpf_prog2(void *ctx) +{ + u32 pid = bpf_get_current_pid_tgid() >> 32; + u64 tm = bpf_ktime_get_ns(); + + u64 *tsp = bpf_map_lookup_elem(&time_map, &pid); + if (tsp) + tm -= *tsp; + else + return -1; + + unsigned key = tm / 10000000;//10ms为区间单位 + if (key > INTERVAL_MAX - 1) + key = INTERVAL_MAX - 1; + u64 *value = bpf_map_lookup_elem(&count_map,&key); + if (value) { + *value += 1; + } else { + u64 init_value = 1; + bpf_map_update_elem(&count_map, &key, &init_value, BPF_ANY); + } + + bpf_map_delete_elem(&time_map, &pid); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/eBPF_Supermarket/mem_watcher/ion/ion_monitor.c b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.c new file mode 100644 index 000000000..d22a7e004 --- /dev/null +++ 
b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.c @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include +#include +//#include +#include "ion_monitor.h" +#include "ion_monitor.skel.h" +#include + + + +#define INTERVAL_MAX 6U +int main(int argc, char **argv) +{ + /* + char file_name[200]; + + snprintf(file_name, sizeof(file_name), "%s_kern.o", argv[0]); + if (load_bpf_file(file_name)) { + printf("%s", bpf_log_buf); + + return 1; + }*/ + struct ion_monitor_bpf *skel = ion_monitor_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open BPF skeleton\n"); + return 1; + } + + int fd = bpf_map__fd(skel->maps.time_map); + int key; + + for(;;) { + sleep(10); + + for (key = 0; key < INTERVAL_MAX; key++) { + unsigned long long value = 0; + bpf_map_lookup_elem(fd, &key, &value); + if (key < INTERVAL_MAX - 1) + printf("Range %dms - %dms\tCount:%llu\n", + key * 10, (key + 1) * 10, value); + else + printf("Over 50ms\t\tCount:%llu\n", value); + } + + printf("==========================================\n"); + } + + return 0; +} + + + + diff --git a/eBPF_Supermarket/mem_watcher/ion/ion_monitor.h b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.h new file mode 100644 index 000000000..e6712713e --- /dev/null +++ b/eBPF_Supermarket/mem_watcher/ion/ion_monitor.h @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2022 Jacky Yin */ +#ifndef __ION_MONITOR_H +#define __ION_MONITOR_H + + + + + +#endif /* __ION_MONTOR_H */ From ede9f468b1f52da3151a58697037c42b87f9b064 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 12:17:24 +0800 Subject: [PATCH 33/46] =?UTF-8?q?=E5=AE=8C=E5=96=84=E5=86=85=E6=A0=B8?= =?UTF-8?q?=E6=80=81kvm=20ioctl=E7=A8=8B=E5=BA=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/include/kvm_ioctl.h | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h 
b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h index 97d13ba03..d6d82489c 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -21,11 +21,95 @@ #include "kvm_watcher.h" #include "vmlinux.h" +#include #include #include #include +#define KVMIO 0xAE +#define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ +#define KVM_CREATE_VCPU _IO(KVMIO, 0x41) +#define KVM_GET_VCPU_EVENTS _IOR(KVMIO, 0x9f, struct kvm_vcpu_events) +#define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) +#define KVM_SET_USER_MEMORY_REGION \ + _IOW(KVMIO, 0x46, struct kvm_userspace_memory_region) +#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) +#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) +#define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) +#define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) +#define KVM_RUN _IO(KVMIO, 0x80) + static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { + int fd = (int)args->args[0]; + u64 ts; + unsigned int cmd = (unsigned int)args->args[1]; + unsigned long arg = (unsigned long)args->args[2]; + switch (cmd) { + case KVM_CREATE_VM: + bpf_printk("KVM_CREATE_VM: fd=%d\n", fd); + break; + case KVM_CREATE_VCPU: { + int vcpu_id; + bpf_probe_read(&vcpu_id, sizeof(vcpu_id), (void *)arg); + bpf_printk("KVM_CREATE_VCPU: fd=%d, vcpu_id=%d\n", fd, vcpu_id); + break; + } + case KVM_SET_USER_MEMORY_REGION: { + struct kvm_userspace_memory_region region; + bpf_probe_read(&region, sizeof(region), (void *)arg); + // 打印或处理 region 数据 + bpf_printk( + "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, flags=%u, " + "guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", + fd, region.slot, region.flags, region.guest_phys_addr, + region.memory_size / 1024, region.userspace_addr); + break; + } + case KVM_GET_VCPU_EVENTS: + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + bpf_probe_read(&events, sizeof(events), (void *)arg); + // 打印或处理 
events 数据 + bpf_printk( + "KVM_SET/GET_VCPU_EVENTS: fd=%d, exception=%u, interrupt=%u\n", + fd, events.exception.nr, events.interrupt.nr); + break; + } + case KVM_GET_REGS: + case KVM_SET_REGS: { + struct kvm_regs regs; + bpf_probe_read(&regs, sizeof(regs), (void *)arg); + // 此处仅展示部分寄存器值的打印 + bpf_printk( + "KVM_GET/SET_REGS: fd=%d, rax=%llx, rbx=%llx, rcx=%llx, " + "rdx=%llx, rsi=%llx\n", + fd, regs.rax, regs.rbx, regs.rcx, regs.rdx, regs.rsi); + + break; + } + case KVM_TRANSLATE: { + struct kvm_translation tr; + bpf_probe_read(&tr, sizeof(tr), (void *)arg); + bpf_printk( + "KVM_TRANSLATE: fd=%d,linear_address=%llx, " + "physical_address=%llx\n", + fd, tr.linear_address, tr.physical_address); + break; + } + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + bpf_probe_read(&irq, sizeof(irq), (void *)arg); + bpf_printk("KVM_INTERRUPT:fd=%d,interrupt vector:%d\n", fd, + irq.irq); + break; + } + case KVM_RUN: { + bpf_printk("KVM_RUN:fd=%d\n", fd); + break; + } + default: + break; + } return 0; } From bff2fb9dcfb5f9ada305d7a8e62cc2224c0a38b2 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 12:22:23 +0800 Subject: [PATCH 34/46] =?UTF-8?q?=E5=88=A0=E9=99=A4bcc=E7=89=88=E6=9C=ACvm?= =?UTF-8?q?=20exit=E5=AE=9E=E7=8E=B0=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/kvm_exit_bcc/kvmexit.py | 378 ------------------ .../kvm_exit_bcc/kvmexit_example.txt | 250 ------------ 2 files changed, 628 deletions(-) delete mode 100644 eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit.py delete mode 100644 eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit_example.txt diff --git a/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit.py b/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit.py deleted file mode 100644 index dd157488f..000000000 --- a/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit.py +++ /dev/null @@ -1,378 +0,0 @@ -#!/usr/bin/env python -# -# kvmexit.py -# -# Display the exit_reason and 
its statistics of each vm exit -# for all vcpus of all virtual machines. For example: -# $./kvmexit.py -# PID TID KVM_EXIT_REASON COUNT -# 1273551 1273568 EXIT_REASON_MSR_WRITE 6 -# 1274253 1274261 EXIT_REASON_EXTERNAL_INTERRUPT 1 -# 1274253 1274261 EXIT_REASON_HLT 12 -# ... -# -# Besides, we also allow users to specify one pid, tid(s), or one -# pid and its vcpu. See kvmexit_example.txt for more examples. -# -# @PID: each vitual machine's pid in the user space. -# @TID: the user space's thread of each vcpu of that virtual machine. -# @KVM_EXIT_REASON: the reason why the vm exits. -# @COUNT: the counts of the @KVM_EXIT_REASONS. -# -# REQUIRES: Linux 4.7+ (BPF_PROG_TYPE_TRACEPOINT support) -# -# Copyright (c) 2024 YYS. All rights reserved. -# Original code © 2024 ByteDance Inc. All rights reserved. -# Author(s): -# YYS -# 以下代码段是根据Fei Li的实现进行的修改 -# 原始代码链接:https://github.com/iovisor/bcc/blob/master/tools/kvmexit.py - - -from __future__ import print_function -from time import sleep -from bcc import BPF -import argparse -import multiprocessing -import os -import subprocess - -# -# Process Arguments -# -def valid_args_list(args): - args_list = args.split(",") - for arg in args_list: - try: - int(arg) - except: - raise argparse.ArgumentTypeError("must be valid integer") - return args_list - -# arguments -examples = """examples: - ./kvmexit # Display kvm_exit_reason and its statistics in real-time until Ctrl-C - ./kvmexit 5 # Display in real-time after sleeping 5s - ./kvmexit -p 3195281 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order - ./kvmexit -p 3195281 20 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order, and display after sleeping 20s - ./kvmexit -p 3195281 -v 0 # Display only vcpu0 for pid 3195281, descending sort by default - ./kvmexit -p 3195281 -a # Display all tids for pid 3195281 - ./kvmexit -t 395490 # Display only for tid 395490 with exit reasons sorted in descending order - ./kvmexit -t 395490 20 
# Display only for tid 395490 with exit reasons sorted in descending order after sleeping 20s - ./kvmexit -T '395490,395491' # Display for a union like {395490, 395491} -""" -parser = argparse.ArgumentParser( - description="Display kvm_exit_reason and its statistics at a timed interval", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=examples) -parser.add_argument("duration", nargs="?", default=99999999, type=int, help="show delta for next several seconds") -parser.add_argument("-p", "--pid", type=int, help="trace this PID only") -exgroup = parser.add_mutually_exclusive_group() -exgroup.add_argument("-t", "--tid", type=int, help="trace this TID only") -exgroup.add_argument("-T", "--tids", type=valid_args_list, help="trace a comma separated series of tids with no space in between") -exgroup.add_argument("-v", "--vcpu", type=int, help="trace this vcpu only") -exgroup.add_argument("-a", "--alltids", action="store_true", help="trace all tids for this pid") -args = parser.parse_args() -duration = int(args.duration) - -# -# Setup BPF -# - -# load BPF program -bpf_text = """ -#include - -#define REASON_NUM 76 -#define TGID_NUM 1024 - -struct exit_count { - u64 exit_ct[REASON_NUM]; -}; -BPF_PERCPU_ARRAY(init_value, struct exit_count, 1); -BPF_TABLE("percpu_hash", u64, struct exit_count, pcpu_kvm_stat, TGID_NUM); - -struct cache_info { - u64 cache_pid_tgid; - struct exit_count cache_exit_ct; -}; -BPF_PERCPU_ARRAY(pcpu_cache, struct cache_info, 1); - -TRACEPOINT_PROBE(kvm, kvm_exit) { - int cache_miss = 0; - int zero = 0; - u32 er = args->exit_reason; - if (er >= REASON_NUM) { - return 0; - } - - u64 cur_pid_tgid = bpf_get_current_pid_tgid(); - u32 tgid = cur_pid_tgid >> 32; - u32 pid = cur_pid_tgid; - - if (THREAD_FILTER) - return 0; - - struct exit_count *tmp_info = NULL, *initial = NULL; - struct cache_info *cache_p; - cache_p = pcpu_cache.lookup(&zero); - if (cache_p == NULL) { - return 0; - } - - if (cache_p->cache_pid_tgid == cur_pid_tgid) { - //a. 
If the cur_pid_tgid hit this physical cpu consecutively, save it to pcpu_cache - tmp_info = &cache_p->cache_exit_ct; - } else { - //b. If another pid_tgid matches this pcpu for the last hit, OR it is the first time to hit this physical cpu. - cache_miss = 1; - - // b.a Try to load the last cache struct if exists. - tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid); - - // b.b If it is the first time for the cur_pid_tgid to hit this pcpu, employ a - // per_cpu array to initialize pcpu_kvm_stat's exit_count with each exit reason's count is zero - if (tmp_info == NULL) { - initial = init_value.lookup(&zero); - if (initial == NULL) { - return 0; - } - - pcpu_kvm_stat.update(&cur_pid_tgid, initial); - tmp_info = pcpu_kvm_stat.lookup(&cur_pid_tgid); - // To pass the verifier - if (tmp_info == NULL) { - return 0; - } - } - } - - if (er < REASON_NUM) { - tmp_info->exit_ct[er]++; - if (cache_miss == 1) { - if (cache_p->cache_pid_tgid != 0) { - // b.*.a Let's save the last hit cache_info into kvm_stat. - pcpu_kvm_stat.update(&cache_p->cache_pid_tgid, &cache_p->cache_exit_ct); - } - // b.* As the cur_pid_tgid meets current pcpu_cache_array for the first time, save it. 
- cache_p->cache_pid_tgid = cur_pid_tgid; - bpf_probe_read(&cache_p->cache_exit_ct, sizeof(*tmp_info), tmp_info); - } - return 0; - } - - return 0; -} -""" - -# format output -exit_reasons = ( - "EXCEPTION_NMI", - "EXTERNAL_INTERRUPT", - "TRIPLE_FAULT", - "INIT_SIGNAL", - "SIPI_SIGNAL ", - "N/A", - "N/A", - "INTERRUPT_WINDOW", - "NMI_WINDOW", - "TASK_SWITCH", - "CPUID", - "N/A", - "HLT", - "INVD", - "INVLPG", - "RDPMC", - "RDTSC", - "N/A", - "VMCALL", - "VMCLEAR", - "VMLAUNCH", - "VMPTRLD", - "VMPTRST", - "VMREAD", - "VMRESUME", - "VMWRITE", - "VMOFF", - "VMON", - "CR_ACCESS", - "DR_ACCESS", - "IO_INSTRUCTION", - "MSR_READ", - "MSR_WRITE", - "INVALID_STATE", - "MSR_LOAD_FAIL", - "N/A", - "MWAIT_INSTRUCTION", - "MONITOR_TRAP_FLAG", - "N/A", - "MONITOR_INSTRUCTION", - "PAUSE_INSTRUCTION", - "MCE_DURING_VMENTRY", - "N/A", - "TPR_BELOW_THRESHOLD", - "APIC_ACCESS", - "EOI_INDUCED", - "GDTR_IDTR", - "LDTR_TR", - "EPT_VIOLATION", - "EPT_MISCONFIG", - "INVEPT", - "RDTSCP", - "PREEMPTION_TIMER", - "INVVPID", - "WBINVD", - "XSETBV", - "APIC_WRITE", - "RDRAND", - "INVPCID", - "VMFUNC", - "ENCLS", - "RDSEED", - "PML_FULL", - "XSAVES", - "XRSTORS", - "N/A", - "N/A", - "UMWAIT", - "TPAUSE", - "N/A", - "N/A", - "N/A", - "N/A", - "N/A", - "BUS_LOCK", - "NOTIFY " -) - -# -# Do some checks -# -try: - # Currently, only adapte on intel architecture - cmd = "cat /proc/cpuinfo | grep vendor_id | head -n 1" - arch_info = subprocess.check_output(cmd, shell=True).strip() - if b"Intel" in arch_info: - pass - else: - raise Exception("Currently we only support Intel architecture, please do expansion if needs more.") - - # Check if kvm module is loaded - if os.access("/dev/kvm", os.R_OK | os.W_OK): - pass - else: - raise Exception("Please insmod kvm module to use kvmexit tool.") -except Exception as e: - raise Exception("Failed to do precondition check, due to: %s." 
% e) - -def find_tid(tgt_dir, tgt_vcpu): - for tid in os.listdir(tgt_dir): - path = tgt_dir + "/" + tid + "/comm" - fp = open(path, "r") - comm = fp.read() - if (comm.find(tgt_vcpu) != -1): - return tid - return -1 - -# set process/thread filter -thread_context = "" -header_format = "" -need_collapse = not args.alltids -if args.tid is not None: - thread_context = "TID %s" % args.tid - thread_filter = 'pid != %s' % args.tid -elif args.tids is not None: - thread_context = "TIDS %s" % args.tids - thread_filter = "pid != " + " && pid != ".join(args.tids) - header_format = "TIDS " -elif args.pid is not None: - thread_context = "PID %s" % args.pid - thread_filter = 'tgid != %s' % args.pid - if args.vcpu is not None: - thread_context = "PID %s VCPU %s" % (args.pid, args.vcpu) - # transfer vcpu to tid - tgt_dir = '/proc/' + str(args.pid) + '/task' - tgt_vcpu = "CPU " + str(args.vcpu) - args.tid = find_tid(tgt_dir, tgt_vcpu) - if args.tid == -1: - raise Exception("There's no v%s for PID %d." % (tgt_vcpu, args.pid)) - thread_filter = 'pid != %s' % args.tid - elif args.alltids: - thread_context = "PID %s and its all threads" % args.pid - header_format = "TID " -else: - thread_context = "all threads" - thread_filter = '0' - header_format = "PID TID " -bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) -b = BPF(text=bpf_text) - - -# header -print("Display kvm exit reasons and statistics for %s" % thread_context, end="") -if duration < 99999999: - print(" after sleeping %d secs." % duration) -else: - print("... Hit Ctrl-C to end.") - -try: - sleep(duration) -except KeyboardInterrupt: - print() - - -# Currently, sort multiple tids in descending order is not supported. 
-if (args.pid or args.tid): - ct_reason = [] - if args.pid: - tgid_exit = [0 for i in range(len(exit_reasons))] - -# output -print("%s%-35s %s" % (header_format, "KVM_EXIT_REASON", "COUNT")) - -pcpu_kvm_stat = b["pcpu_kvm_stat"] -pcpu_cache = b["pcpu_cache"] -for k, v in pcpu_kvm_stat.items(): - tgid = k.value >> 32 - pid = k.value & 0xffffffff - for i in range(0, len(exit_reasons)): - sum1 = 0 - for inner_cpu in range(0, multiprocessing.cpu_count()): - cachePIDTGID = pcpu_cache[0][inner_cpu].cache_pid_tgid - # Take priority to check if it is in cache - if cachePIDTGID == k.value: - sum1 += pcpu_cache[0][inner_cpu].cache_exit_ct.exit_ct[i] - # If not in cache, find from kvm_stat - else: - sum1 += v[inner_cpu].exit_ct[i] - if sum1 == 0: - continue - - if (args.pid and args.pid == tgid and need_collapse): - tgid_exit[i] += sum1 - elif (args.tid and args.tid == pid): - ct_reason.append((sum1, i)) - elif not need_collapse or args.tids: - print("%-8u %-35s %-8u" % (pid, exit_reasons[i], sum1)) - else: - print("%-8u %-8u %-35s %-8u" % (tgid, pid, exit_reasons[i], sum1)) - - # Display only for the target tid in descending sort - if (args.tid and args.tid == pid): - ct_reason.sort(reverse=True) - for i in range(0, len(ct_reason)): - if ct_reason[i][0] == 0: - continue - print("%-35s %-8u" % (exit_reasons[ct_reason[i][1]], ct_reason[i][0])) - break - - -# Aggregate all tids' counts for this args.pid in descending sort -if args.pid and need_collapse: - for i in range(0, len(exit_reasons)): - ct_reason.append((tgid_exit[i], i)) - ct_reason.sort(reverse=True) - for i in range(0, len(ct_reason)): - if ct_reason[i][0] == 0: - continue - print("%-35s %-8u" % (exit_reasons[ct_reason[i][1]], ct_reason[i][0])) diff --git a/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit_example.txt b/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit_example.txt deleted file mode 100644 index 3ee773bbe..000000000 --- a/eBPF_Supermarket/kvm_watcher/kvm_exit_bcc/kvmexit_example.txt +++ /dev/null @@ 
-1,250 +0,0 @@ -Demonstrations of kvm exit reasons, the Linux eBPF/bcc version. - - -Considering virtual machines' frequent exits can cause performance problems, -this tool aims to locate the frequent exited reasons and then find solutions -to reduce or even avoid the exit, by displaying the detail exit reasons and -the counts of each vm exit for all vms running on one physical machine. - - -Features of this tool -===================== - -- Although there is a patch: [KVM: x86: add full vm-exit reason debug entries] - (https://patchwork.kernel.org/project/kvm/patch/1555939499-30854-1-git-send-email-pizhenwei@bytedance.com/) - trying to fill more vm-exit reason debug entries, just as the comments said, - the code allocates lots of memory that may never be consumed, misses some - arch-specific kvm causes, and can not do kernel aggregation. Instead bcc, as - a user space tool, can implement all these functions more easily and flexibly. -- The bcc python logic could provide nice kernel aggregation and custom output, - like collpasing all tids for one pid (e.i. one vm's qemu process id) with exit - reasons sorted in descending order. For more information, see the following - #USAGE message. -- The bpf in-kernel percpu_array and percpu_cache further improves performance. - For more information, see the following #Help to understand. - - -Limited -======= - -In view of the hardware-assisted virtualization technology of -different architectures, currently we only adapt on vmx in intel. -And the amd feature is on the road.. - - -Example output: -=============== - -# ./kvmexit.py -Display kvm exit reasons and statistics for all threads... Hit Ctrl-C to end. 
-PID TID KVM_EXIT_REASON COUNT -^C1273551 1273568 EXIT_REASON_HLT 12 -1273551 1273568 EXIT_REASON_MSR_WRITE 6 -1274253 1274261 EXIT_REASON_EXTERNAL_INTERRUPT 1 -1274253 1274261 EXIT_REASON_HLT 12 -1274253 1274261 EXIT_REASON_MSR_WRITE 4 - -# ./kvmexit.py 6 -Display kvm exit reasons and statistics for all threads after sleeping 6 secs. -PID TID KVM_EXIT_REASON COUNT -1273903 1273922 EXIT_REASON_EXTERNAL_INTERRUPT 175 -1273903 1273922 EXIT_REASON_CPUID 10 -1273903 1273922 EXIT_REASON_HLT 6043 -1273903 1273922 EXIT_REASON_IO_INSTRUCTION 24 -1273903 1273922 EXIT_REASON_MSR_WRITE 15025 -1273903 1273922 EXIT_REASON_PAUSE_INSTRUCTION 11 -1273903 1273922 EXIT_REASON_EOI_INDUCED 12 -1273903 1273922 EXIT_REASON_EPT_VIOLATION 6 -1273903 1273922 EXIT_REASON_EPT_MISCONFIG 380 -1273903 1273922 EXIT_REASON_PREEMPTION_TIMER 194 -1273551 1273568 EXIT_REASON_EXTERNAL_INTERRUPT 18 -1273551 1273568 EXIT_REASON_HLT 989 -1273551 1273568 EXIT_REASON_IO_INSTRUCTION 10 -1273551 1273568 EXIT_REASON_MSR_WRITE 2205 -1273551 1273568 EXIT_REASON_PAUSE_INSTRUCTION 1 -1273551 1273568 EXIT_REASON_EOI_INDUCED 5 -1273551 1273568 EXIT_REASON_EPT_MISCONFIG 61 -1273551 1273568 EXIT_REASON_PREEMPTION_TIMER 14 - -# ./kvmexit.py -p 1273795 5 -Display kvm exit reasons and statistics for PID 1273795 after sleeping 5 secs. -KVM_EXIT_REASON COUNT -MSR_WRITE 13467 -HLT 5060 -PREEMPTION_TIMER 345 -EPT_MISCONFIG 264 -EXTERNAL_INTERRUPT 169 -EPT_VIOLATION 18 -PAUSE_INSTRUCTION 6 -IO_INSTRUCTION 4 -EOI_INDUCED 2 - -# ./kvmexit.py -p 1273795 5 -a -Display kvm exit reasons and statistics for PID 1273795 and its all threads after sleeping 5 secs. 
-TID KVM_EXIT_REASON COUNT -1273819 EXTERNAL_INTERRUPT 64 -1273819 HLT 2802 -1273819 IO_INSTRUCTION 4 -1273819 MSR_WRITE 7196 -1273819 PAUSE_INSTRUCTION 2 -1273819 EOI_INDUCED 2 -1273819 EPT_VIOLATION 6 -1273819 EPT_MISCONFIG 162 -1273819 PREEMPTION_TIMER 194 -1273820 EXTERNAL_INTERRUPT 78 -1273820 HLT 2054 -1273820 MSR_WRITE 5199 -1273820 EPT_VIOLATION 2 -1273820 EPT_MISCONFIG 77 -1273820 PREEMPTION_TIMER 102 - -# ./kvmexit.py -p 1273795 -v 0 -Display kvm exit reasons and statistics for PID 1273795 VCPU 0... Hit Ctrl-C to end. -KVM_EXIT_REASON COUNT -^CMSR_WRITE 2076 -HLT 795 -PREEMPTION_TIMER 86 -EXTERNAL_INTERRUPT 20 -EPT_MISCONFIG 10 -PAUSE_INSTRUCTION 2 -IO_INSTRUCTION 2 -EPT_VIOLATION 1 -EOI_INDUCED 1 - -# ./kvmexit.py -p 1273795 -v 0 4 -Display kvm exit reasons and statistics for PID 1273795 VCPU 0 after sleeping 4 secs. -KVM_EXIT_REASON COUNT -MSR_WRITE 4726 -HLT 1827 -PREEMPTION_TIMER 78 -EPT_MISCONFIG 67 -EXTERNAL_INTERRUPT 28 -IO_INSTRUCTION 4 -EOI_INDUCED 2 -PAUSE_INSTRUCTION 2 - -# ./kvmexit.py -p 1273795 -v 4 4 -Traceback (most recent call last): - File "tools/kvmexit.py", line 306, in - raise Exception("There's no v%s for PID %d." % (tgt_vcpu, args.pid)) - Exception: There's no vCPU 4 for PID 1273795. - -# ./kvmexit.py -t 1273819 10 -Display kvm exit reasons and statistics for TID 1273819 after sleeping 10 secs. -KVM_EXIT_REASON COUNT -MSR_WRITE 13318 -HLT 5274 -EPT_MISCONFIG 263 -PREEMPTION_TIMER 171 -EXTERNAL_INTERRUPT 109 -IO_INSTRUCTION 8 -PAUSE_INSTRUCTION 5 -EOI_INDUCED 4 -EPT_VIOLATION 2 - -# ./kvmexit.py -T '1273820,1273819' -Display kvm exit reasons and statistics for TIDS ['1273820', '1273819']... Hit Ctrl-C to end. 
-TIDS KVM_EXIT_REASON COUNT -^C1273819 EXTERNAL_INTERRUPT 300 -1273819 HLT 13718 -1273819 IO_INSTRUCTION 26 -1273819 MSR_WRITE 37457 -1273819 PAUSE_INSTRUCTION 13 -1273819 EOI_INDUCED 13 -1273819 EPT_VIOLATION 53 -1273819 EPT_MISCONFIG 654 -1273819 PREEMPTION_TIMER 958 -1273820 EXTERNAL_INTERRUPT 212 -1273820 HLT 9002 -1273820 MSR_WRITE 25495 -1273820 PAUSE_INSTRUCTION 2 -1273820 EPT_VIOLATION 64 -1273820 EPT_MISCONFIG 396 -1273820 PREEMPTION_TIMER 268 - - -Help to understand -================== - -We use a PERCPU_ARRAY: pcpuArrayA and a percpu_hash: hashA to collaboratively -store each kvm exit reason and its count. The reason is there exists a rule when -one vcpu exits and re-enters, it tends to continue to run on the same physical -cpu (pcpu as follows) as the last cycle, which is also called 'cache hit'. Thus -we turn to use a PERCPU_ARRAY to record the 'cache hit' situation to speed -things up; and for other cases, then use a percpu_hash. - -BTW, we originally use a common hash to do this, with a u64(exit_reason) -key and a struct exit_info {tgid_pid, exit_reason} value. But due to -the big lock in bpf_hash, each updating is quite performance consuming. - -Now imagine here is a pid_tgidA (vcpu A) exits and is going to run on -pcpuArrayA, the BPF code flow is as follows: - - pid_tgidA keeps running on the same pcpu - // \\ - // \\ - // Y N \\ - // \\ - a. cache_hit b. cache_miss -(cacheA's pid_tgid matches pid_tgidA) || - | || - | || - "increase percpu exit_ct and return" || - [*Note*] || - pid_tgidA ever been exited on pcpuArrayA? - // \\ - // \\ - // \\ - // Y N \\ - // \\ - b.a load_last_hashA b.b initialize_hashA_with_zero - \ / - \ / - \ / - "increase percpu exit_ct" - || - || - is another pid_tgid been running on pcpuArrayA? 
- // \\ - // Y N \\ - // \\ - b.*.a save_theLastHit_hashB do_nothing - \\ // - \\ // - \\ // - b.* save_to_pcpuArrayA - - -[*Note*] we do not update the table in above "a.", in case the vcpu hit the same -pcpu again when exits next time, instead we only update until this pcpu is not -hitted by the same tgidpid(vcpu) again, which is in "b.*.a" and "b.*". - - -USAGE message: -============== - -# ./kvmexit.py -h -usage: kvmexit.py [-h] [-p PID [-v VCPU | -a] ] [-t TID | -T 'TID1,TID2'] [duration] - -Display kvm_exit_reason and its statistics at a timed interval - -optional arguments: - -h, --help show this help message and exit - -p PID, --pid PID display process with this PID only, collpase all tids with exit reasons sorted in descending order - -v VCPU, --v VCPU display this VCPU only for this PID - -a, --alltids display all TIDS for this PID - -t TID, --tid TID display thread with this TID only with exit reasons sorted in descending order - -T 'TID1,TID2', --tids 'TID1,TID2' - display threads for a union like {395490, 395491} - duration duration of display, after sleeping several seconds - -examples: - ./kvmexit # Display kvm_exit_reason and its statistics in real-time until Ctrl-C - ./kvmexit 5 # Display in real-time after sleeping 5s - ./kvmexit -p 3195281 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order - ./kvmexit -p 3195281 20 # Collpase all tids for pid 3195281 with exit reasons sorted in descending order, and display after sleeping 20s - ./kvmexit -p 3195281 -v 0 # Display only vcpu0 for pid 3195281, descending sort by default - ./kvmexit -p 3195281 -a # Display all tids for pid 3195281 - ./kvmexit -t 395490 # Display only for tid 395490 with exit reasons sorted in descending order - ./kvmexit -t 395490 20 # Display only for tid 395490 with exit reasons sorted in descending order after sleeping 20s - ./kvmexit -T '395490,395491' # Display for a union like {395490, 395491} \ No newline at end of file From 
accb7163d25f0890459b21ff843893209e60b9a9 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 12:22:35 +0800 Subject: [PATCH 35/46] =?UTF-8?q?=E6=9B=B4=E6=96=B0readme?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eBPF_Supermarket/kvm_watcher/README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/README.md b/eBPF_Supermarket/kvm_watcher/README.md index f1d99cbb8..efe76f5be 100755 --- a/eBPF_Supermarket/kvm_watcher/README.md +++ b/eBPF_Supermarket/kvm_watcher/README.md @@ -31,14 +31,13 @@ **安装依赖:** ``` -sudo apt install clang libelf1 libelf-dev zlib1g-dev libbpf-dev linux-tools-$(uname -r) linux-cloud-tools-$(uname -r) -sudo modprobe kvm && sudo modprobe kvm-intel //加载kvm模块 +make deps ``` + **编译运行:** ``` -make deps make bpf sudo ./kvm_watcher [options] make clean @@ -93,6 +92,8 @@ BPF program used for monitoring KVM event `-w`:记录vcpu唤醒时的相关信息 +`-l`:记录kvm相关ioctl系统调用命令字 + `-p`:指定kvm虚拟机进程pid `-t`:监控时间 @@ -114,9 +115,6 @@ BPF program used for monitoring KVM event │ ├── kvm_mmu.h │ ├── kvm_vcpu.h │ └── kvm_watcher.h //公共头文件 -├── kvm_exit_bcc //bcc版本的vm exit实现 -│ ├── kvmexit_example.txt -│ └── kvmexit.py ├── Makefile //编译脚本 ├── README.md ├── src From c75ab4f243556e56a0771c1deaefccc9b0f2920e Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 12:24:30 +0800 Subject: [PATCH 36/46] update --- eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h index d6d82489c..97e4c38fe 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -104,7 +104,7 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { break; } case KVM_RUN: { - bpf_printk("KVM_RUN:fd=%d,fd); + bpf_printk("KVM_RUN:fd=%d",fd); break; } default: 
From 4f3796d0df64e2e51362c6cdf62c10a258c632a1 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 12:25:10 +0800 Subject: [PATCH 37/46] =?UTF-8?q?=E8=B0=83=E8=AF=95=E4=BF=A1=E6=81=AF?= =?UTF-8?q?=E6=89=93=E5=8D=B0=E5=8F=AF=E9=80=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/src/kvm_watcher.c | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 2746149fe..8dcc223ac 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -265,6 +265,7 @@ static struct env { bool execute_irq_inject; bool execute_hypercall; bool execute_ioctl; + bool verbose; int monitoring_time; pid_t vm_pid; enum EventType event_type; @@ -279,6 +280,7 @@ static struct env { .mmio_page_fault = false, .execute_hypercall = false, .execute_ioctl = false, + .verbose = false, .monitoring_time = 0, .vm_pid = -1, .event_type = NONE_TYPE, @@ -309,6 +311,7 @@ static const struct argp_option opts[] = { {"vm_pid", 'p', "PID", 0, "Specify the virtual machine pid to monitor."}, {"monitoring_time", 't', "SEC", 0, "Time for monitoring."}, {"kvm_ioctl", 'l', NULL, 0, "Monitoring the KVM IOCTL."}, + {"verbose", 'v', NULL, 0, "Verbose debug output"}, {NULL, 'H', NULL, OPTION_HIDDEN, "Show the full help"}, {}, }; @@ -318,6 +321,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { case 'H': argp_state_help(state, stderr, ARGP_HELP_STD_HELP); break; + case 'v': + env.verbose = true; + break; case 'w': SET_OPTION_AND_CHECK_USAGE(option_selected, env.execute_vcpu_wakeup); @@ -392,6 +398,8 @@ static const struct argp argp = { static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) { + if (level == LIBBPF_DEBUG && !env.verbose) + return 0; return vfprintf(stderr, format, args); } @@ -471,7 
+479,7 @@ static int handle_event(void *ctx, void *data, size_t data_sz) { } case PAGE_FAULT: { // 使用 e->page_fault_data 访问 PAGE_FAULT 特有成员 - printf("%-18.6f %-15s %-10u %-14llx %-6u %-10.4f ", timestamp_ms, + printf("%-18.6f %-15s %-10u %-12llx %-6u %-10.4f ", timestamp_ms, e->process.comm, e->process.pid, e->page_fault_data.addr, e->page_fault_data.count, NS_TO_US_WITH_DECIMAL(e->page_fault_data.delay)); @@ -643,6 +651,7 @@ static int print_event_head(struct env *env) { "VAILD?"); break; case EXIT: + printf("Waiting vm_exit ... \n"); break; case HALT_POLL: printf("%-18s %-15s %-15s %-10s %-7s %-11s %-10s\n", "TIME(ms)", @@ -655,12 +664,12 @@ static int print_event_head(struct env *env) { break; case PAGE_FAULT: printf("%-18s %-15s %-10s %-12s %-6s %-10s %-20s %-17s %-10s %s\n", - "TIME(ms)", "COMM", "PID", "(f(GPA)m(GFN))", "COUNT", - "DELAY(us)", "HVA", "PFN", "MEM_SLOTID", "ERROR_TYPE"); + "TIME(ms)", "COMM", "PID", "GPA", "COUNT", "DELAY(us)", + "HVA", "PFN", "MEM_SLOTID", "ERROR_TYPE"); break; case IRQCHIP: printf("%-18s %-15s %-10s %-10s %-14s %-10s %-10s\n", "TIME(ms)", - "COMM", "PID", "DELAY", "CHIP/PIN", "DST/VEC", "OTHERS"); + "COMM", "PID", "DELAY", "TYPE/PIN", "DST/VEC", "OTHERS"); break; case IRQ_INJECT: printf("%-18s %-15s %-10s %-10s %-10s %-10s %-10s %-10s\n", @@ -721,9 +730,9 @@ static void set_disable_load(struct kvm_watcher_bpf *skel) { env.execute_irqchip ? true : false); bpf_program__set_autoload(skel->progs.fexit_kvm_ioapic_set_irq, env.execute_irqchip ? true : false); - bpf_program__set_autoload(skel->progs.fentry_kvm_set_msi_irq, + bpf_program__set_autoload(skel->progs.fentry_kvm_set_msi, env.execute_irqchip ? true : false); - bpf_program__set_autoload(skel->progs.fexit_kvm_set_msi_irq, + bpf_program__set_autoload(skel->progs.fexit_kvm_set_msi, env.execute_irqchip ? true : false); bpf_program__set_autoload(skel->progs.fentry_vmx_inject_irq, env.execute_irq_inject ? 
true : false); @@ -890,7 +899,7 @@ int print_exit_map(struct kvm_watcher_bpf *skel) { NS_TO_MS_WITH_DECIMAL(values[i].total_time), NS_TO_MS_WITH_DECIMAL(values[i].max_time), NS_TO_MS_WITH_DECIMAL(values[i].min_time), values[i].count, - getName(keys[i].reason, EXIT)); + getName(keys[i].reason, EXIT)); } } // clear the maps @@ -976,11 +985,6 @@ int main(int argc, char **argv) { goto cleanup; } - // 清屏 - if (option_selected) { - CLEAR_SCREEN(); - } - /*打印信息头*/ err = print_event_head(&env); if (err) { @@ -989,7 +993,6 @@ int main(int argc, char **argv) { } while (!exiting) { err = ring_buffer__poll(rb, RING_BUFFER_TIMEOUT_MS /* timeout, ms */); - if (env.execute_hypercall) { print_map_and_check_error(print_hc_map, skel, "hypercall", err); } From 095fd0cbde8870295b8091ba9526f1b6b115416f Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 13:58:19 +0800 Subject: [PATCH 38/46] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BB=A3=E7=A0=81?= =?UTF-8?q?=EF=BC=8C=E4=BF=AE=E6=94=B9kvm=20msi=E4=B8=AD=E6=96=AD=E6=8C=82?= =?UTF-8?q?=E8=BD=BD=E5=87=BD=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kvm_watcher/include/kvm_exits.h | 3 +- .../kvm_watcher/include/kvm_hypercall.h | 4 +- .../kvm_watcher/include/kvm_irq.h | 27 ++++++---- .../kvm_watcher/include/kvm_mmu.h | 10 ++-- .../kvm_watcher/include/kvm_vcpu.h | 15 +++--- .../kvm_watcher/include/kvm_watcher.h | 12 ++--- .../kvm_watcher/src/kvm_watcher.bpf.c | 53 +++++++++++-------- 7 files changed, 66 insertions(+), 58 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h b/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h index 29fb8043a..5e9bc16de 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_exits.h @@ -54,8 +54,7 @@ struct exit { unsigned int vcpu_id; }; -static int trace_kvm_exit(struct exit *ctx, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int trace_kvm_exit(struct exit *ctx) { u32 
reason; reason = (u32)ctx->exit_reason; // 如果是节能停止退出,就不采集数据 diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h b/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h index dba8bf0d8..ab9b26a52 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_hypercall.h @@ -54,8 +54,8 @@ struct { } hc_count SEC(".maps"); static int entry_emulate_hypercall(struct kvm_vcpu *vcpu, void *rb, - struct common_event *e, pid_t vm_pid) { - CHECK_PID(vm_pid); + struct common_event *e) { + u32 pid = bpf_get_current_pid_tgid() >> 32; u64 nr, a0, a1, a2, a3; nr = kvm_rax_read(vcpu); // 超级调用号 // 超级调用参数 diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h b/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h index 6fa41d5df..90cc6545c 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_irq.h @@ -38,8 +38,7 @@ struct { __type(value, u64); } irq_inject_delay SEC(".maps"); -static int entry_kvm_pic_set_irq(int irq, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int entry_kvm_pic_set_irq(int irq) { if (irq < 0 || irq >= PIC_NUM_PINS) { return 0; } @@ -77,8 +76,7 @@ static int exit_kvm_pic_set_irq(struct kvm_pic *s, int irq, int ret, void *rb, return 0; } -static int entry_kvm_ioapic_set_irq(int irq, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int entry_kvm_ioapic_set_irq(int irq) { if (irq < 0 || irq >= IOAPIC_NUM_PINS) { return 0; } @@ -118,17 +116,25 @@ static int exit_kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int ret, return 0; } -static int entry_kvm_set_msi_irq(struct kvm *kvm, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int entry_kvm_set_msi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *routing_entry, + int level) { + bool x2apic_format; + bpf_probe_read_kernel(&x2apic_format, sizeof(bool), + &kvm->arch.x2apic_format); + if (x2apic_format && (routing_entry->msi.address_hi & 0xff)) + return 0; + if (!level) + return 0; pid_t tid = 
(u32)bpf_get_current_pid_tgid(); u64 ts = bpf_ktime_get_ns(); bpf_map_update_elem(&irq_set_delay, &tid, &ts, BPF_ANY); return 0; } -static int exit_kvm_set_msi_irq( - struct kvm *kvm, struct kvm_kernel_irq_routing_entry *routing_entry, - void *rb, struct common_event *e) { +static int exit_kvm_set_msi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *routing_entry, + void *rb, struct common_event *e) { struct msi_msg msg = {.address_lo = routing_entry->msi.address_lo, .address_hi = routing_entry->msi.address_hi, .data = routing_entry->msi.data}; @@ -156,8 +162,7 @@ static int exit_kvm_set_msi_irq( return 0; } -static int entry_vmx_inject_irq(struct kvm_vcpu *vcpu, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int entry_vmx_inject_irq(struct kvm_vcpu *vcpu) { u32 irq_nr; bool rei; bpf_probe_read_kernel(&irq_nr, sizeof(u32), &vcpu->arch.interrupt.nr); diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h index 52d5b4fa0..ceea6d10d 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_mmu.h @@ -25,8 +25,6 @@ #include #include -#define PAGE_SHIFT 12 - struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 8192); @@ -50,8 +48,7 @@ struct page_fault { char __data[0]; }; -static int trace_page_fault(struct page_fault *ctx, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int trace_page_fault(struct page_fault *ctx) { u64 ts = bpf_ktime_get_ns(); u64 addr = ctx->fault_address; bpf_map_update_elem(&pf_delay, &addr, &ts, BPF_ANY); @@ -101,8 +98,7 @@ static int trace_tdp_page_fault(struct kvm_vcpu *vcpu, } static int trace_kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u64 error_code, pid_t vm_pid) { - CHECK_PID(vm_pid); + u64 error_code) { if (error_code & PFERR_RSVD_MASK) { u64 ts = bpf_ktime_get_ns(); u64 gfn = cr2_or_gpa >> PAGE_SHIFT; @@ -141,7 +137,7 @@ static int trace_handle_mmio_page_fault(struct mmio_page_fault *ctx, void *rb, 
bpf_map_update_elem(&pf_count, &gfn, &new_count, BPF_ANY); } e->page_fault_data.delay = delay; - e->page_fault_data.addr = gfn; + e->page_fault_data.addr = gfn << PAGE_SHIFT; e->page_fault_data.error_code = PFERR_RSVD_MASK; e->process.pid = bpf_get_current_pid_tgid() >> 32; bpf_get_current_comm(&e->process.comm, sizeof(e->process.comm)); diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h b/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h index 7bba6477d..2223c3384 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_vcpu.h @@ -54,8 +54,7 @@ struct { __type(value, u32); } vcpu_tid SEC(".maps"); // 记录vcpu_halt的id信息 -static int trace_kvm_vcpu_halt(struct kvm_vcpu *vcpu, pid_t vm_pid) { - CHECK_PID(vm_pid); +static int trace_kvm_vcpu_halt(struct kvm_vcpu *vcpu) { u32 tid = bpf_get_current_pid_tgid(); u32 vcpu_id; bpf_probe_read_kernel(&vcpu_id, sizeof(vcpu->vcpu_id), &vcpu->vcpu_id); @@ -64,8 +63,8 @@ static int trace_kvm_vcpu_halt(struct kvm_vcpu *vcpu, pid_t vm_pid) { } // 使用kvm_vcpu_halt记录的数据,来获取vcpu的启动信息 static int trace_kvm_vcpu_wakeup(struct vcpu_wakeup *ctx, void *rb, - struct common_event *e, pid_t vm_pid) { - CHECK_PID(vm_pid); + struct common_event *e) { + u32 pid = bpf_get_current_pid_tgid() >> 32; u32 tid = bpf_get_current_pid_tgid(); u32 *vcpu_id = bpf_map_lookup_elem(&vcpu_tid, &tid); if (!vcpu_id) { @@ -86,8 +85,8 @@ static int trace_kvm_vcpu_wakeup(struct vcpu_wakeup *ctx, void *rb, } static int trace_kvm_halt_poll_ns(struct halt_poll_ns *ctx, void *rb, - struct common_event *e, pid_t vm_pid) { - CHECK_PID(vm_pid); + struct common_event *e) { + u32 pid = bpf_get_current_pid_tgid() >> 32; u32 tid = bpf_get_current_pid_tgid(); RESERVE_RINGBUF_ENTRY(rb, e); u64 time = bpf_ktime_get_ns(); @@ -106,8 +105,8 @@ static int trace_kvm_halt_poll_ns(struct halt_poll_ns *ctx, void *rb, static int trace_mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn, void *rb, - 
struct common_event *e, pid_t vm_pid) { - CHECK_PID(vm_pid); + struct common_event *e) { + u32 pid = bpf_get_current_pid_tgid() >> 32; u32 flags; struct kvm_memory_slot *slot; bpf_probe_read_kernel(&slot, sizeof(memslot), &memslot); diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h index 97c77244b..b13071223 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_watcher.h @@ -22,6 +22,8 @@ #define TASK_COMM_LEN 16 #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) +#define PAGE_SHIFT 12 + #define NS_TO_US_FACTOR 1000.0 #define NS_TO_MS_FACTOR 1000000.0 @@ -66,9 +68,6 @@ } \ } while (0) -// 定义清屏宏 -#define CLEAR_SCREEN() printf("\033[2J\033[H\n") - #define RING_BUFFER_TIMEOUT_MS 100 #define RESERVE_RINGBUF_ENTRY(rb, e) \ @@ -79,10 +78,9 @@ e = _tmp; \ } while (0) -#define CHECK_PID(vm_pid) \ - __u32 pid = bpf_get_current_pid_tgid() >> 32; \ - if ((vm_pid) > 0 && pid != (vm_pid)) { \ - return 0; \ +#define CHECK_PID(vm_pid) \ + if ((vm_pid) > 0 && (bpf_get_current_pid_tgid() >> 32) != (vm_pid)) { \ + return 0; \ } struct reason_info { diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c index 406e5d273..6fabe0ba7 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c +++ b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.bpf.c @@ -22,11 +22,11 @@ #include #include "../include/kvm_watcher.h" #include "../include/kvm_exits.h" +#include "../include/kvm_ioctl.h" #include "../include/kvm_vcpu.h" #include "../include/kvm_mmu.h" #include "../include/kvm_irq.h" #include "../include/kvm_hypercall.h" -#include "../include/kvm_ioctl.h" char LICENSE[] SEC("license") = "Dual BSD/GPL"; @@ -42,22 +42,25 @@ struct { // 获取vcpu的id SEC("fentry/kvm_vcpu_halt") int BPF_PROG(fentry_kvm_vcpu_halt, struct kvm_vcpu *vcpu) { - return trace_kvm_vcpu_halt(vcpu, vm_pid); + CHECK_PID(vm_pid); + return 
trace_kvm_vcpu_halt(vcpu); } // 追踪vcpu运行信息 SEC("tp/kvm/kvm_vcpu_wakeup") int tp_vcpu_wakeup(struct vcpu_wakeup *ctx) { - return trace_kvm_vcpu_wakeup(ctx, &rb, e, vm_pid); + return trace_kvm_vcpu_wakeup(ctx, &rb, e); } // 记录vcpu的halt_poll(暂停轮询)时间变化 SEC("tp/kvm/kvm_halt_poll_ns") int tp_kvm_halt_poll_ns(struct halt_poll_ns *ctx) { - return trace_kvm_halt_poll_ns(ctx, &rb, e, vm_pid); + CHECK_PID(vm_pid); + return trace_kvm_halt_poll_ns(ctx, &rb, e); } // 记录vm_exit的时间 SEC("tp/kvm/kvm_exit") int tp_exit(struct exit *ctx) { - return trace_kvm_exit(ctx, vm_pid); + CHECK_PID(vm_pid); + return trace_kvm_exit(ctx); } // 记录vm_entry和vm_exit的时间差 SEC("tp/kvm/kvm_entry") @@ -68,12 +71,14 @@ int tp_entry(struct exit *ctx) { SEC("kprobe/mark_page_dirty_in_slot") int BPF_KPROBE(kp_mark_page_dirty_in_slot, struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn) { - return trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, e, vm_pid); + CHECK_PID(vm_pid); + return trace_mark_page_dirty_in_slot(kvm, memslot, gfn, &rb, e); } SEC("tp/kvm/kvm_page_fault") int tp_page_fault(struct page_fault *ctx) { - return trace_page_fault(ctx, vm_pid); + CHECK_PID(vm_pid); + return trace_page_fault(ctx); } SEC("fexit/kvm_tdp_page_fault") @@ -85,7 +90,8 @@ int BPF_PROG(fexit_tdp_page_fault, struct kvm_vcpu *vcpu, SEC("fentry/kvm_mmu_page_fault") int BPF_PROG(fentry_kvm_mmu_page_fault, struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code) { - return trace_kvm_mmu_page_fault(vcpu, cr2_or_gpa, error_code, vm_pid); + CHECK_PID(vm_pid); + return trace_kvm_mmu_page_fault(vcpu, cr2_or_gpa, error_code); } SEC("tp/kvmmmu/handle_mmio_page_fault") @@ -96,7 +102,8 @@ int tp_handle_mmio_page_fault(struct mmio_page_fault *ctx) { SEC("fentry/kvm_pic_set_irq") int BPF_PROG(fentry_kvm_pic_set_irq, struct kvm_pic *s, int irq, int irq_source_id, int level) { - return entry_kvm_pic_set_irq(irq, vm_pid); + CHECK_PID(vm_pid); + return entry_kvm_pic_set_irq(irq); } SEC("fexit/kvm_pic_set_irq") @@ -108,7 
+115,8 @@ int BPF_PROG(fexit_kvm_pic_set_irq, struct kvm_pic *s, int irq, SEC("fentry/ioapic_set_irq") int BPF_PROG(fentry_kvm_ioapic_set_irq, struct kvm_ioapic *ioapic, int irq, int irq_level, bool line_status) { - return entry_kvm_ioapic_set_irq(irq, vm_pid); + CHECK_PID(vm_pid); + return entry_kvm_ioapic_set_irq(irq); } SEC("fexit/ioapic_set_irq") @@ -117,23 +125,25 @@ int BPF_PROG(fexit_kvm_ioapic_set_irq, struct kvm_ioapic *ioapic, int irq, return exit_kvm_ioapic_set_irq(ioapic, irq, ret, &rb, e); } -SEC("fentry/kvm_set_msi_irq") -int BPF_PROG(fentry_kvm_set_msi_irq, struct kvm *kvm, +SEC("fentry/kvm_set_msi") +int BPF_PROG(fentry_kvm_set_msi, struct kvm_kernel_irq_routing_entry *routing_entry, - struct kvm_lapic_irq *irq) { - return entry_kvm_set_msi_irq(kvm, vm_pid); + struct kvm *kvm, int irq_source_id, int level, bool line_status) { + CHECK_PID(vm_pid); + return entry_kvm_set_msi(kvm, routing_entry, level); } -SEC("fexit/kvm_set_msi_irq") -int BPF_PROG(fexit_kvm_set_msi_irq, struct kvm *kvm, +SEC("fexit/kvm_set_msi") +int BPF_PROG(fexit_kvm_set_msi, struct kvm_kernel_irq_routing_entry *routing_entry, - struct kvm_lapic_irq *irq) { - return exit_kvm_set_msi_irq(kvm, routing_entry, &rb, e); + struct kvm *kvm, int irq_source_id, int level, bool line_status) { + return exit_kvm_set_msi(kvm, routing_entry, &rb, e); } SEC("fentry/vmx_inject_irq") int BPF_PROG(fentry_vmx_inject_irq, struct kvm_vcpu *vcpu, bool reinjected) { - return entry_vmx_inject_irq(vcpu, vm_pid); + CHECK_PID(vm_pid); + return entry_vmx_inject_irq(vcpu); } SEC("fexit/vmx_inject_irq") @@ -143,10 +153,11 @@ int BPF_PROG(fexit_vmx_inject_irq, struct kvm_vcpu *vcpu, bool reinjected) { SEC("fentry/kvm_emulate_hypercall") int BPF_PROG(fentry_emulate_hypercall, struct kvm_vcpu *vcpu) { - return entry_emulate_hypercall(vcpu, &rb, e, vm_pid); + CHECK_PID(vm_pid); + return entry_emulate_hypercall(vcpu, &rb, e); } -SEC("tracepoint/syscalls/sys_enter_ioctl") +SEC("tp/syscalls/sys_enter_ioctl") int 
tp_ioctl(struct trace_event_raw_sys_enter *args) { return trace_kvm_ioctl(args); } \ No newline at end of file From 23021b5809f05312e6305fef1394039be524e4a3 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 14:27:54 +0800 Subject: [PATCH 39/46] modify ioctl --- eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h | 13 ++++--------- eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c | 5 ++++- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h index 97e4c38fe..23f8725fc 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -59,9 +59,9 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { bpf_probe_read(®ion, sizeof(region), (void *)arg); // 打印或处理 region 数据 bpf_printk( - "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, flags=%u, " + "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, " "guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", - fd, region.slot, region.flags, region.guest_phys_addr, + fd, region.slot, region.guest_phys_addr, region.memory_size / 1024, region.userspace_addr); break; } @@ -82,9 +82,8 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { // 此处仅展示部分寄存器值的打印 bpf_printk( "KVM_GET/SET_REGS: fd=%d, rax=%llx, rbx=%llx, rcx=%llx, " - "rdx=%llx, rsi=%llx\n", - fd, regs.rax, regs.rbx, regs.rcx, regs.rdx, regs.rsi); - + "rdx=%llx\n", + fd, regs.rax, regs.rbx, regs.rcx, regs.rdx); break; } case KVM_TRANSLATE: { @@ -103,10 +102,6 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { irq.irq); break; } - case KVM_RUN: { - bpf_printk("KVM_RUN:fd=%d",fd); - break; - } default: break; } diff --git a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c index 8dcc223ac..20d67a82c 100644 --- a/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c +++ 
b/eBPF_Supermarket/kvm_watcher/src/kvm_watcher.c @@ -690,7 +690,10 @@ static int print_event_head(struct env *env) { break; } case IOCTL: { - printf("wait....\n"); + printf( + "Successfully started! Please run `sudo cat " + "/sys/kernel/debug/tracing/trace_pipe` " + "to see output of the BPF programs.\n"); break; } default: From 9e90997e21714c015e51844a0a8968f13c5b8b4d Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 15:25:27 +0800 Subject: [PATCH 40/46] update ioctl --- .../kvm_watcher/include/kvm_ioctl.h | 24 ++++++------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h index 23f8725fc..23ae5257c 100644 --- a/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h +++ b/eBPF_Supermarket/kvm_watcher/include/kvm_ioctl.h @@ -25,6 +25,7 @@ #include #include #include +#include #define KVMIO 0xAE #define KVM_CREATE_VM _IO(KVMIO, 0x01) /* returns a VM fd */ @@ -33,17 +34,15 @@ #define KVM_SET_VCPU_EVENTS _IOW(KVMIO, 0xa0, struct kvm_vcpu_events) #define KVM_SET_USER_MEMORY_REGION \ _IOW(KVMIO, 0x46, struct kvm_userspace_memory_region) -#define KVM_GET_REGS _IOR(KVMIO, 0x81, struct kvm_regs) -#define KVM_SET_REGS _IOW(KVMIO, 0x82, struct kvm_regs) #define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) #define KVM_RUN _IO(KVMIO, 0x80) static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { int fd = (int)args->args[0]; - u64 ts; unsigned int cmd = (unsigned int)args->args[1]; unsigned long arg = (unsigned long)args->args[2]; + char buf[256]; // 创建一个缓冲区来存储格式化后的字符串 switch (cmd) { case KVM_CREATE_VM: bpf_printk("KVM_CREATE_VM: fd=%d\n", fd); @@ -59,9 +58,11 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { bpf_probe_read(®ion, sizeof(region), (void *)arg); // 打印或处理 region 数据 bpf_printk( - "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u, " - 
"guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", - fd, region.slot, region.guest_phys_addr, + "KVM_SET_USER_MEMORY_REGION: fd=%d, slot=%u,flags=%u\n", + fd, region.slot, region.flags); + bpf_printk( + "KVM_SET_USER_MEMORY_REGION: guest_phys_addr=%llx, memory_size=%lluK,userspace_addr=%llx\n", + region.guest_phys_addr, region.memory_size / 1024, region.userspace_addr); break; } @@ -75,17 +76,6 @@ static int trace_kvm_ioctl(struct trace_event_raw_sys_enter *args) { fd, events.exception.nr, events.interrupt.nr); break; } - case KVM_GET_REGS: - case KVM_SET_REGS: { - struct kvm_regs regs; - bpf_probe_read(®s, sizeof(regs), (void *)arg); - // 此处仅展示部分寄存器值的打印 - bpf_printk( - "KVM_GET/SET_REGS: fd=%d, rax=%llx, rbx=%llx, rcx=%llx, " - "rdx=%llx\n", - fd, regs.rax, regs.rbx, regs.rcx, regs.rdx); - break; - } case KVM_TRANSLATE: { struct kvm_translation tr; bpf_probe_read(&tr, sizeof(tr), (void *)arg); From b14d4c40bed98d8a8176f0c53b9632938ba93fd9 Mon Sep 17 00:00:00 2001 From: nanshuaibo Date: Fri, 22 Mar 2024 15:34:32 +0800 Subject: [PATCH 41/46] update mk --- eBPF_Supermarket/kvm_watcher/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eBPF_Supermarket/kvm_watcher/Makefile b/eBPF_Supermarket/kvm_watcher/Makefile index a5d2d2eb0..e1ba47998 100644 --- a/eBPF_Supermarket/kvm_watcher/Makefile +++ b/eBPF_Supermarket/kvm_watcher/Makefile @@ -96,6 +96,6 @@ test: bpf clean: cd src && rm -f *.o *.skel.h *.bpf.o - rm -f $(notdir $(APP)) - rm -rf include/vmlinux.h + rm -rf $(notdir $(APP)) include/vmlinux.h temp + From 860a93dde4b0c1555ebc6ffb20c22226031a3d57 Mon Sep 17 00:00:00 2001 From: zmx Date: Sat, 23 Mar 2024 09:57:11 +0800 Subject: [PATCH 42/46] fix drop reason and add the addr_to_func --- .../net_watcher/dropreason.h | 83 +++++++++++++ .../net_watcher/netwatcher.bpf.c | 73 ++---------- .../net_watcher/netwatcher.c | 112 ++++++++++++------ .../net_watcher/netwatcher.h | 2 + 4 files changed, 174 insertions(+), 96 deletions(-) 
create mode 100644 eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h new file mode 100644 index 000000000..a41d070b7 --- /dev/null +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h @@ -0,0 +1,83 @@ +#ifndef __DROPREASON_H +#define __DROPREASON_H +const char *SKB_Drop_Reason_Strings[] = { + "SKB_NOT_DROPPED_YET", + "SKB_CONSUMED", + "SKB_DROP_REASON_NOT_SPECIFIED", + "SKB_DROP_REASON_NO_SOCKET", + "SKB_DROP_REASON_PKT_TOO_SMALL", + "SKB_DROP_REASON_TCP_CSUM", + "SKB_DROP_REASON_SOCKET_FILTER", + "SKB_DROP_REASON_UDP_CSUM", + "SKB_DROP_REASON_NETFILTER_DROP", + "SKB_DROP_REASON_OTHERHOST", + "SKB_DROP_REASON_IP_CSUM", + "SKB_DROP_REASON_IP_INHDR", + "SKB_DROP_REASON_IP_RPFILTER", + "SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST", + "SKB_DROP_REASON_XFRM_POLICY", + "SKB_DROP_REASON_IP_NOPROTO", + "SKB_DROP_REASON_SOCKET_RCVBUFF", + "SKB_DROP_REASON_PROTO_MEM", + "SKB_DROP_REASON_TCP_MD5NOTFOUND", + "SKB_DROP_REASON_TCP_MD5UNEXPECTED", + "SKB_DROP_REASON_TCP_MD5FAILURE", + "SKB_DROP_REASON_SOCKET_BACKLOG", + "SKB_DROP_REASON_TCP_FLAGS", + "SKB_DROP_REASON_TCP_ZEROWINDOW", + "SKB_DROP_REASON_TCP_OLD_DATA", + "SKB_DROP_REASON_TCP_OVERWINDOW", + "SKB_DROP_REASON_TCP_OFOMERGE", + "SKB_DROP_REASON_TCP_RFC7323_PAWS", + "SKB_DROP_REASON_TCP_INVALID_SEQUENCE", + "SKB_DROP_REASON_TCP_RESET", + "SKB_DROP_REASON_TCP_INVALID_SYN", + "SKB_DROP_REASON_TCP_CLOSE", + "SKB_DROP_REASON_TCP_FASTOPEN", + "SKB_DROP_REASON_TCP_OLD_ACK", + "SKB_DROP_REASON_TCP_TOO_OLD_ACK", + "SKB_DROP_REASON_TCP_ACK_UNSENT_DATA", + "SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE", + "SKB_DROP_REASON_TCP_OFO_DROP", + "SKB_DROP_REASON_IP_OUTNOROUTES", + "SKB_DROP_REASON_BPF_CGROUP_EGRESS", + "SKB_DROP_REASON_IPV6DISABLED", + "SKB_DROP_REASON_NEIGH_CREATEFAIL", + "SKB_DROP_REASON_NEIGH_FAILED", + "SKB_DROP_REASON_NEIGH_QUEUEFULL", + "SKB_DROP_REASON_NEIGH_DEAD", + 
"SKB_DROP_REASON_TC_EGRESS", + "SKB_DROP_REASON_QDISC_DROP", + "SKB_DROP_REASON_CPU_BACKLOG", + "SKB_DROP_REASON_XDP", + "SKB_DROP_REASON_TC_INGRESS", + "SKB_DROP_REASON_UNHANDLED_PROTO", + "SKB_DROP_REASON_SKB_CSUM", + "SKB_DROP_REASON_SKB_GSO_SEG", + "SKB_DROP_REASON_SKB_UCOPY_FAULT", + "SKB_DROP_REASON_DEV_HDR", + "SKB_DROP_REASON_DEV_READY", + "SKB_DROP_REASON_FULL_RING", + "SKB_DROP_REASON_NOMEM", + "SKB_DROP_REASON_HDR_TRUNC", + "SKB_DROP_REASON_TAP_FILTER", + "SKB_DROP_REASON_TAP_TXFILTER", + "SKB_DROP_REASON_ICMP_CSUM", + "SKB_DROP_REASON_INVALID_PROTO", + "SKB_DROP_REASON_IP_INADDRERRORS", + "SKB_DROP_REASON_IP_INNOROUTES", + "SKB_DROP_REASON_PKT_TOO_BIG", + "SKB_DROP_REASON_DUP_FRAG", + "SKB_DROP_REASON_FRAG_REASM_TIMEOUT", + "SKB_DROP_REASON_FRAG_TOO_FAR", + "SKB_DROP_REASON_TCP_MINTTL", + "SKB_DROP_REASON_IPV6_BAD_EXTHDR", + "SKB_DROP_REASON_IPV6_NDISC_FRAG", + "SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT", + "SKB_DROP_REASON_IPV6_NDISC_BAD_CODE", + "SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS", + "SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST", + "SKB_DROP_REASON_MAX", + "SKB_DROP_REASON_SUBSYS_MASK", +}; +#endif \ No newline at end of file diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c index 1a13bb87f..352bcaf09 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.bpf.c @@ -1508,17 +1508,20 @@ int BPF_KPROBE(ip_finish_output) { return 0; } + //drop -SEC("kprobe/kfree_skb_reason") -int BPF_KPROBE(kfree_skb_reason,struct sk_buff *skb, enum skb_drop_reason reason) { +SEC("tp/skb/kfree_skb") +int tp_kfree(struct trace_event_raw_kfree_skb *ctx) { if(!kfree_info) return 0; + struct sk_buff *skb=ctx->skbaddr; if (skb == NULL) // 判断是否为空 - return 0; + return 0; struct iphdr *ip = skb_to_iphdr(skb); - struct udphdr *udp = skb_to_udphdr(skb); - struct packet_tuple pkt_tuple = {0}; - 
get_udp_pkt_tuple(&pkt_tuple, ip, udp); + struct tcphdr *tcp = skb_to_tcphdr(skb); + struct packet_tuple pkt_tuple = {0}; + get_pkt_tuple(&pkt_tuple, ip, tcp); + struct reasonissue *message; message = bpf_ringbuf_reserve(&kfree_rb, sizeof(*message), 0); if(!message){ @@ -1528,59 +1531,9 @@ int BPF_KPROBE(kfree_skb_reason,struct sk_buff *skb, enum skb_drop_reason reason message->daddr = pkt_tuple.daddr; message->sport = pkt_tuple.sport; message->dport = pkt_tuple.dport; - message->drop_reason = reason; + message->protocol = ctx->protocol; + message->location = (long)ctx->location; + message->drop_reason = ctx->reason; bpf_ringbuf_submit(message,0); return 0; -} -//icmp -// SEC("kprobe/__icmp_send") -// int BPF_KPROBE(__icmp_send,struct sk_buff *skb_in){ -// bpf_printk("111111"); -// if (skb_in== NULL) // 判断是否为空 -// return 0; -// struct iphdr *ip = skb_to_iphdr(skb_in); -// struct udphdr *udp = skb_to_udphdr(skb_in); -// struct packet_tuple pkt_tuple = {0}; -// get_udp_pkt_tuple(&pkt_tuple, ip, udp); -// bpf_printk("%d %d",pkt_tuple.saddr,pkt_tuple.daddr); -// struct time_icmp *tinfo; -// //tinfo = (struct time_icmp *)bpf_map_lookup_elem(&icmp_time,&pkt_tuple); -// //tinfo->icmp_start_time = bpf_ktime_get_ns() / 1000; -// return 0; -// } - -// SEC("kprobe/icmp_rcv") -// int BPF_KPROBE(icmp_rcv,struct sk_buff *skb){ -// bpf_printk("2222222 pid:%d ",bpf_get_current_pid_tgid()); -// if (skb== NULL) // 判断是否为空 -// return 0; -// struct iphdr *ip = skb_to_iphdr(skb); -// struct udphdr *udp = skb_to_udphdr(skb); -// struct packet_tuple pkt_tuple = {0}; -// get_udp_pkt_tuple(&pkt_tuple, ip, udp); -// //bpf_printk("%s %s",inet_ntop(AF_INET, &saddr, s_str, sizeof(s_str)),inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str))); - -// //struct time_icmp *tinfo; -// // tinfo = (struct time_icmp *)bpf_map_lookup_elem(&icmp_time,&pkt_tuple); -// // if (tinfo == NULL) { -// // return 0; -// // } -// // tinfo->icmp_end_time = bpf_ktime_get_ns() / 1000; -// // struct icmptime *message; -// 
// message = bpf_ringbuf_reserve(&netfilter_rb, sizeof(*message), 0); -// // if(!message){ -// // return 0; -// // } -// // message->saddr = pkt_tuple.saddr; -// // message->daddr =pkt_tuple.daddr; -// // message->sport =pkt_tuple.sport; -// // message->dport = pkt_tuple.dport; -// // message->icmp_tran_time = tinfo->icmp_end_time-tinfo->icmp_start_time; -// // bpf_ringbuf_submit(message,0); -// return 0; -// } -// SEC("kretprobe/icmp_rcv") -// int BPF_KPROBE(icmp_rcv_ret){ -// bpf_printk("33333 pid:%d ",bpf_get_current_pid_tgid()); -// return 0; -// } \ No newline at end of file +} \ No newline at end of file diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c index 812a02897..e99c96761 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.c @@ -29,6 +29,7 @@ #include #include #include +#include "dropreason.h" static volatile bool exiting = false; @@ -39,7 +40,7 @@ static char udp_file_path[1024]; static int sport = 0, dport = 0; // for filter static int all_conn = 0, err_packet = 0, extra_conn_info = 0, layer_time = 0, - http_info = 0, retrans_info = 0, udp_info = 0,net_filter = 0,kfree_info = 0; // flag + http_info = 0, retrans_info = 0, udp_info = 0,net_filter = 0,kfree_info = 0,addr_to_func=0; // flag static const char argp_program_doc[] = "Watch tcp/ip in network subsystem \n"; @@ -55,6 +56,7 @@ static const struct argp_option opts[] = { {"udp", 'u', 0, 0, "trace the udp message"}, {"net_filter",'n',0,0,"trace ipv4 packget filter "}, {"kfree_info",'k',0,0,"trace kfree "}, + {"addr_to_func",'T',0,0,"translation addr to func and offset"}, {}}; static error_t parse_arg(int key, char *arg, struct argp_state *state) { @@ -93,6 +95,9 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) { case 'k': kfree_info = 1; break; + case 'T': + addr_to_func = 1; + break; default: return 
ARGP_ERR_UNKNOWN; } @@ -105,6 +110,50 @@ static const struct argp argp = { .doc = argp_program_doc, }; +struct SymbolEntry{ + unsigned long addr; + char name[30]; +}; +struct SymbolEntry symbols[300000]; +int num_symbols = 0; +struct SymbolEntry findfunc(unsigned long int addr) +{ + int low = 0, high = num_symbols - 1; + int result = -1; + + while (low <= high) { + int mid = low + (high - low) / 2; + if (symbols[mid].addr < addr) { + result = mid; + low = mid + 1; + } else { + high = mid - 1; + } + } + + return symbols[result]; +}; +void readallsym() +{ + FILE *file = fopen("/proc/kallsyms", "r"); + if (!file) { + perror("Error opening file"); + exit(EXIT_FAILURE); + } + char line[256]; + while (fgets(line, sizeof(line), file)) { + unsigned long addr; + char type, name[30]; + int ret = sscanf(line, "%lx %c %s", &addr, &type, name); + if (ret == 3) { + symbols[num_symbols].addr = addr; + strncpy(symbols[num_symbols].name, name, 30); + num_symbols++; + } + } + + fclose(file); +} static void sig_handler(int signo) { exiting = true; } static void bytes_to_str(char *str, unsigned long long num) { @@ -335,41 +384,29 @@ static int print_kfree(void *ctx, void *packet_info, size_t size) { { return 0; } - printf("%-25s %-25s %-25u %-25u", + char prot[6]; + if(pack_info->protocol==2048) + { + strcpy(prot, "ipv4"); + } + else if(pack_info->protocol==34525) + { + strcpy(prot, "ipv6"); + } + else { + // 其他协议 + strcpy(prot, "other"); + } + printf("%-20s %-20s %-10u %-10u %-10s", inet_ntop(AF_INET, &saddr, s_str, sizeof(s_str)), - inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str)), pack_info->sport,pack_info->dport); - switch (pack_info->drop_reason) { - case 0: - printf("SKB_NOT_DROPPED_YET"); - break; - case 1: - printf("SKB_CONSUMED"); - break; - case 2: - printf("SKB_DROP_REASON_NOT_SPECIFIED"); - break; - case 3: - printf("SKB_DROP_REASON_NO_SOCKET"); - break; - case 4: - printf("SKB_DROP_REASON_PKT_TOO_SMALL"); - break; - case 5: - printf("SKB_DROP_REASON_TCP_CSUM"); - break; 
- case 6: - printf("SKB_DROP_REASON_SOCKET_FILTER"); - break; - case 7: - printf("SKB_DROP_REASON_UDP_CSUM"); - break; - case 8: - printf("SKB_DROP_REASON_NETFILTER_DROP"); - break; - default: - printf("Unknown SKB Drop Reason"); - } - printf("\n"); + inet_ntop(AF_INET, &daddr, d_str, sizeof(d_str)), pack_info->sport,pack_info->dport,prot); + if(!addr_to_func) + printf("%-20lx",pack_info->location); + else { + struct SymbolEntry data= findfunc(pack_info->location); + printf("%s+0x%-10lx",data.name,pack_info->location-data.addr); + } + printf("%s\n", SKB_Drop_Reason_Strings[pack_info->drop_reason]); return 0; } int main(int argc, char **argv) { @@ -422,6 +459,9 @@ int main(int argc, char **argv) { skel->rodata->net_filter = net_filter; skel->rodata->kfree_info = kfree_info; + if(addr_to_func) + readallsym(); + err = netwatcher_bpf__load(skel); if (err) { fprintf(stderr, "Failed to load and verify BPF skeleton\n"); @@ -444,7 +484,7 @@ int main(int argc, char **argv) { } else if(kfree_info) { - printf("%-25s %-25s %-25s %-25s %-25s\n", "saddr", "daddr","sprot", "dprot","reason"); + printf("%-20s %-20s %-10s %-10s %-9s %-24s %-25s\n", "saddr", "daddr","sprot", "dprot","prot","addr","reason"); } else{ printf("%-22s %-10s %-10s %-10s %-10s %-10s %-5s %s\n", "SOCK", "SEQ", diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h index 353cf94e3..ded67af39 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/netwatcher.h @@ -113,6 +113,8 @@ struct reasonissue unsigned int daddr; unsigned short sport; unsigned short dport; + long location; + unsigned short protocol; int drop_reason; }; From 494faec4b87a19aea1fa6410e0168fd67ecf1481 Mon Sep 17 00:00:00 2001 From: zmx Date: Sat, 23 Mar 2024 10:26:58 +0800 Subject: [PATCH 43/46] 111 --- eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h index a41d070b7..b59917cdb 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h @@ -55,7 +55,7 @@ const char *SKB_Drop_Reason_Strings[] = { "SKB_DROP_REASON_SKB_CSUM", "SKB_DROP_REASON_SKB_GSO_SEG", "SKB_DROP_REASON_SKB_UCOPY_FAULT", - "SKB_DROP_REASON_DEV_HDR", + "SKB_DROP_REASON_DEV_HDR", "SKB_DROP_REASON_DEV_READY", "SKB_DROP_REASON_FULL_RING", "SKB_DROP_REASON_NOMEM", From 9440e0ade19098a2e6d7169b235b79ddda49105d Mon Sep 17 00:00:00 2001 From: zmx Date: Sat, 23 Mar 2024 10:28:18 +0800 Subject: [PATCH 44/46] 222 --- .../net_watcher/dropreason.h | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h index b59917cdb..92e5a8cb8 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h @@ -56,28 +56,28 @@ const char *SKB_Drop_Reason_Strings[] = { "SKB_DROP_REASON_SKB_GSO_SEG", "SKB_DROP_REASON_SKB_UCOPY_FAULT", "SKB_DROP_REASON_DEV_HDR", - "SKB_DROP_REASON_DEV_READY", - "SKB_DROP_REASON_FULL_RING", - "SKB_DROP_REASON_NOMEM", - "SKB_DROP_REASON_HDR_TRUNC", - "SKB_DROP_REASON_TAP_FILTER", - "SKB_DROP_REASON_TAP_TXFILTER", - "SKB_DROP_REASON_ICMP_CSUM", - "SKB_DROP_REASON_INVALID_PROTO", - "SKB_DROP_REASON_IP_INADDRERRORS", - "SKB_DROP_REASON_IP_INNOROUTES", - "SKB_DROP_REASON_PKT_TOO_BIG", - "SKB_DROP_REASON_DUP_FRAG", - "SKB_DROP_REASON_FRAG_REASM_TIMEOUT", - "SKB_DROP_REASON_FRAG_TOO_FAR", - "SKB_DROP_REASON_TCP_MINTTL", - "SKB_DROP_REASON_IPV6_BAD_EXTHDR", - "SKB_DROP_REASON_IPV6_NDISC_FRAG", - "SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT", - "SKB_DROP_REASON_IPV6_NDISC_BAD_CODE", - 
"SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS", - "SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST", - "SKB_DROP_REASON_MAX", - "SKB_DROP_REASON_SUBSYS_MASK", -}; + "SKB_DROP_REASON_DEV_READY", + "SKB_DROP_REASON_FULL_RING", + "SKB_DROP_REASON_NOMEM", + "SKB_DROP_REASON_HDR_TRUNC", + "SKB_DROP_REASON_TAP_FILTER", + "SKB_DROP_REASON_TAP_TXFILTER", + "SKB_DROP_REASON_ICMP_CSUM", + "SKB_DROP_REASON_INVALID_PROTO", + "SKB_DROP_REASON_IP_INADDRERRORS", + "SKB_DROP_REASON_IP_INNOROUTES", + "SKB_DROP_REASON_PKT_TOO_BIG", + "SKB_DROP_REASON_DUP_FRAG", + "SKB_DROP_REASON_FRAG_REASM_TIMEOUT", + "SKB_DROP_REASON_FRAG_TOO_FAR", + "SKB_DROP_REASON_TCP_MINTTL", + "SKB_DROP_REASON_IPV6_BAD_EXTHDR", + "SKB_DROP_REASON_IPV6_NDISC_FRAG", + "SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT", + "SKB_DROP_REASON_IPV6_NDISC_BAD_CODE", + "SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS", + "SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST", + "SKB_DROP_REASON_MAX", + "SKB_DROP_REASON_SUBSYS_MASK", +} #endif \ No newline at end of file From faba22faba9f4cd6711a30d82f7ab66c9b58cc84 Mon Sep 17 00:00:00 2001 From: zmx Date: Sat, 23 Mar 2024 10:40:39 +0800 Subject: [PATCH 45/46] fix the buf --- eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h index 92e5a8cb8..68ff87eaa 100644 --- a/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h +++ b/eBPF_Supermarket/Network_Subsystem/net_watcher/dropreason.h @@ -79,5 +79,5 @@ const char *SKB_Drop_Reason_Strings[] = { "SKB_DROP_REASON_IPV6_NDISC_NS_OTHERHOST", "SKB_DROP_REASON_MAX", "SKB_DROP_REASON_SUBSYS_MASK", -} +}; #endif \ No newline at end of file From 6bf863138545ced7c2283c0b2bc3828dde091a2d Mon Sep 17 00:00:00 2001 From: zhangxianyu777 <93031728+zhangxianyu777@users.noreply.github.com> Date: Sat, 23 Mar 2024 10:43:42 +0800 Subject: [PATCH 46/46] Update net_watcher.yml --- 
.github/workflows/net_watcher.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/net_watcher.yml b/.github/workflows/net_watcher.yml index 851647ecb..5dc1a6d95 100644 --- a/.github/workflows/net_watcher.yml +++ b/.github/workflows/net_watcher.yml @@ -41,4 +41,7 @@ jobs: sudo timeout -s SIGINT 5 ./netwatcher -r || if [[ $? != 124 && $? != 0 ]];then exit $?;fi sudo timeout -s SIGINT 5 ./netwatcher -t || if [[ $? != 124 && $? != 0 ]];then exit $?;fi sudo timeout -s SIGINT 5 ./netwatcher -u || if [[ $? != 124 && $? != 0 ]];then exit $?;fi + sudo timeout -s SIGINT 5 ./netwatcher -n || if [[ $? != 124 && $? != 0 ]];then exit $?;fi + sudo timeout -s SIGINT 5 ./netwatcher -k || if [[ $? != 124 && $? != 0 ]];then exit $?;fi + sudo timeout -s SIGINT 5 ./netwatcher -k -T || if [[ $? != 124 && $? != 0 ]];then exit $?;fi timeout-minutes: 5