From ee79f77a3a1f49c310b2c2ea51fa6ac9af8a0d62 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 6 Feb 2025 23:32:46 +0000 Subject: [PATCH 01/88] add gitignore for build directory --- module/.gitignore | 1 + module/memory_collector.c | 102 +++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 module/.gitignore diff --git a/module/.gitignore b/module/.gitignore new file mode 100644 index 0000000..567609b --- /dev/null +++ b/module/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/module/memory_collector.c b/module/memory_collector.c index 4d0fb02..157ee5c 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -1,21 +1,119 @@ #include #include #include +#include +#include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); MODULE_DESCRIPTION("Memory subsystem monitoring for Kubernetes"); MODULE_VERSION("1.0"); +// Data structure for PMU samples +struct memory_collector_data { + u64 timestamp; // Current timestamp + u32 core_id; // CPU core number + char comm[16]; // Current task comm (name) +} __packed; + +// PMU type for our custom events +static struct pmu memory_collector_pmu; + +// PMU event attributes +static DEFINE_PER_CPU(struct perf_event *, sampling_event); + +// PMU callback functions +static void memory_collector_start(struct perf_event *event, int flags) +{ + // Enable sampling + event->hw.state = 0; +} + +static void memory_collector_stop(struct perf_event *event, int flags) +{ + // Disable sampling + event->hw.state = PERF_HES_STOPPED; +} + +static int memory_collector_add(struct perf_event *event, int flags) +{ + // Store event in per-CPU variable + __this_cpu_write(sampling_event, event); + return 0; +} + +static void memory_collector_del(struct perf_event *event, int flags) +{ + // Clear per-CPU event + __this_cpu_write(sampling_event, NULL); +} + +static void memory_collector_read(struct perf_event *event) +{ + struct memory_collector_data data; + struct perf_sample_data sample_data; + struct pt_regs *regs = get_irq_regs(); + + // Initialize sample data + perf_sample_data_init(&sample_data, 0, 0); + + // Fill data structure + data.timestamp = ktime_get_ns(); + data.core_id = smp_processor_id(); + strncpy(data.comm, current->comm, sizeof(data.comm) - 1); + data.comm[sizeof(data.comm) - 1] = '\0'; + + // Output data through perf buffer + if (regs) { + perf_event_output(event, &sample_data, regs); + } +} + +// PMU configuration +static int memory_collector_event_init(struct perf_event *event) +{ + if (event->attr.type != memory_collector_pmu.type) + return -ENOENT; + + // Set callbacks + event->destroy = NULL; + event->hw.config = 0; + + return 0; +} + +// PMU structure +static struct pmu memory_collector_pmu = { + .task_ctx_nr = perf_sw_context, + .event_init = memory_collector_event_init, + .add = memory_collector_add, + .del = memory_collector_del, + .start = memory_collector_start, + .stop = memory_collector_stop, + .read = memory_collector_read, +}; + static int __init memory_collector_init(void) { - printk(KERN_INFO "Memory Collector: module loaded\n"); + int ret; + + printk(KERN_INFO "Memory Collector: initializing PMU module\n"); + + // Register PMU + ret = perf_pmu_register(&memory_collector_pmu, "memory_collector", -1); + if (ret) { + printk(KERN_ERR "Memory Collector: failed to register PMU: %d\n", ret); + return ret; + } + return 0; } static void __exit memory_collector_exit(void) { - printk(KERN_INFO "Memory Collector: module unloaded\n"); + printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); + perf_pmu_unregister(&memory_collector_pmu); } module_init(memory_collector_init); From 98dc100f47c9c09da6ed3a83d042b12b4d0b27b9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 6 Feb 2025 23:33:29 +0000 Subject: [PATCH 02/88] add skeleton perf pmu that uses printk --- module/memory_collector.c | 87 +++++++++++++++++++++++++++------------ module/test_pmu.sh | 55 +++++++++++++++++++++++++ 2 files changed, 116 insertions(+), 26 deletions(-) create mode 100644 module/test_pmu.sh diff --git a/module/memory_collector.c b/module/memory_collector.c index 157ee5c..9763f6b 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -10,63 +10,68 @@ MODULE_AUTHOR("Memory Collector Project"); MODULE_DESCRIPTION("Memory subsystem monitoring for Kubernetes"); MODULE_VERSION("1.0"); +#ifndef CONFIG_PERF_EVENTS +#error "This module requires CONFIG_PERF_EVENTS" +#endif + // Data structure for PMU samples struct memory_collector_data { u64 timestamp; // Current timestamp - u32 core_id; // CPU core number - char comm[16]; // Current task comm (name) + u32 core_id; // CPU core number + char comm[16]; // Current task comm (name) } __packed; // PMU type for our custom events static struct pmu memory_collector_pmu; -// PMU event attributes -static DEFINE_PER_CPU(struct perf_event *, sampling_event); +// Custom overflow handler +static void memory_collector_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct memory_collector_data sample = { + .timestamp = ktime_get_ns(), + .core_id = smp_processor_id() + }; + + strncpy(sample.comm, current->comm, sizeof(sample.comm) - 1); + sample.comm[sizeof(sample.comm) - 1] = '\0'; + + // For now, just print the data + printk(KERN_DEBUG "Memory sample - CPU: %u, Time: %llu, Task: %s\n", + sample.core_id, sample.timestamp, sample.comm); +} // PMU callback functions static void memory_collector_start(struct perf_event *event, int flags) { - // Enable sampling event->hw.state = 0; } static void memory_collector_stop(struct perf_event *event, int flags) { - // Disable sampling event->hw.state = PERF_HES_STOPPED; } static int memory_collector_add(struct perf_event *event, int flags) { - // Store event in per-CPU variable - __this_cpu_write(sampling_event, event); return 0; } static void memory_collector_del(struct perf_event *event, int flags) { - // Clear per-CPU event - __this_cpu_write(sampling_event, NULL); } static void memory_collector_read(struct perf_event *event) { - struct memory_collector_data data; - struct perf_sample_data sample_data; + // Trigger the overflow handler directly + struct perf_sample_data data; struct pt_regs *regs = get_irq_regs(); - - // Initialize sample data - perf_sample_data_init(&sample_data, 0, 0); - - // Fill data structure - data.timestamp = ktime_get_ns(); - data.core_id = smp_processor_id(); - strncpy(data.comm, current->comm, sizeof(data.comm) - 1); - data.comm[sizeof(data.comm) - 1] = '\0'; - - // Output data through perf buffer + + perf_sample_data_init(&data, 0, 1); + if (regs) { - perf_event_output(event, &sample_data, regs); + memory_collector_overflow_handler(event, &data, regs); } } @@ -76,7 +81,6 @@ static int memory_collector_event_init(struct perf_event *event) if (event->attr.type != memory_collector_pmu.type) return -ENOENT; - // Set callbacks event->destroy = NULL; event->hw.config = 0; @@ -94,9 +98,18 @@ static struct pmu memory_collector_pmu = { .read = memory_collector_read, }; +// Keep track of our event +static struct perf_event *sampling_event; + static int __init memory_collector_init(void) { int ret; + struct perf_event_attr attr = { + .type = PERF_TYPE_SOFTWARE, + .size = sizeof(struct perf_event_attr), + .sample_period = 1000000, // 1ms + .config = PERF_COUNT_SW_CPU_CLOCK, + }; printk(KERN_INFO "Memory Collector: initializing PMU module\n"); @@ -107,14 +120,36 @@ static int __init memory_collector_init(void) return ret; } + // Create a kernel counter that will drive our sampling + sampling_event = perf_event_create_kernel_counter(&attr, + -1, // any CPU + NULL, // all threads + memory_collector_overflow_handler, + NULL); + if (IS_ERR(sampling_event)) { + ret = PTR_ERR(sampling_event); + printk(KERN_ERR "Memory Collector: failed to create sampling event: %d\n", ret); + perf_pmu_unregister(&memory_collector_pmu); + return ret; + } + + // Enable the event + perf_event_enable(sampling_event); + return 0; } static void __exit memory_collector_exit(void) { printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); + + if (sampling_event) { + perf_event_disable(sampling_event); + perf_event_release_kernel(sampling_event); + } + perf_pmu_unregister(&memory_collector_pmu); } module_init(memory_collector_init); -module_exit(memory_collector_exit); \ No newline at end of file +module_exit(memory_collector_exit); \ No newline at end of file diff --git a/module/test_pmu.sh b/module/test_pmu.sh new file mode 100644 index 0000000..510f9ab --- /dev/null +++ b/module/test_pmu.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + +echo "Building kernel module..." +make clean +make + +echo "Loading kernel module..." +sudo insmod build/memory_collector.ko + +echo "Verifying module is loaded..." +lsmod | grep memory_collector || { + echo "Error: Module failed to load" + exit 1 +} + +echo "Starting perf recording..." +sudo perf record -e memory_collector/sampling=1/ -a -o perf.data sleep 1 + +echo "Generating perf report..." +sudo perf script -i perf.data > perf_output.txt + +echo "Validating output..." +# Check sample count (expect ~1000 samples per CPU) +CPU_COUNT=$(nproc) +EXPECTED_MIN=$((900 * CPU_COUNT)) +SAMPLE_COUNT=$(wc -l < perf_output.txt) + +if [ $SAMPLE_COUNT -lt $EXPECTED_MIN ]; then + echo "Error: Got $SAMPLE_COUNT samples, expected at least $EXPECTED_MIN" + exit 1 +fi + +# Validate data format +if ! grep -q "timestamp:" perf_output.txt; then + echo "Error: Missing timestamp data" + exit 1 +fi + +if ! grep -q "core_id:" perf_output.txt; then + echo "Error: Missing core_id data" + exit 1 +fi + +if ! grep -q "comm:" perf_output.txt; then + echo "Error: Missing comm data" + exit 1 +fi + +echo "Unloading module..." +sudo rmmod memory_collector + +echo "Test completed successfully!" +echo "Sample count: $SAMPLE_COUNT" +echo "Output saved to perf_output.txt" \ No newline at end of file From 3ca9d4565e9948e15ebd7384a0a528c9668eae32 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 6 Feb 2025 17:40:20 -0600 Subject: [PATCH 03/88] remove get_irq_regs(), unused --- module/memory_collector.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 9763f6b..8958abb 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -26,8 +26,7 @@ static struct pmu memory_collector_pmu; // Custom overflow handler static void memory_collector_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data) { struct memory_collector_data sample = { .timestamp = ktime_get_ns(), @@ -66,13 +65,10 @@ static void memory_collector_read(struct perf_event *event) { // Trigger the overflow handler directly struct perf_sample_data data; - struct pt_regs *regs = get_irq_regs(); perf_sample_data_init(&data, 0, 1); - if (regs) { - memory_collector_overflow_handler(event, &data, regs); - } + memory_collector_overflow_handler(event, &data); } // PMU configuration From 202088569ce0b6438b14dbadabe2d10c1d474d1b Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 6 Feb 2025 17:44:41 -0600 Subject: [PATCH 04/88] revert the removal of regs --- module/memory_collector.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 8958abb..9763f6b 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -26,7 +26,8 @@ static struct pmu memory_collector_pmu; // Custom overflow handler static void memory_collector_overflow_handler(struct perf_event *event, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { struct memory_collector_data sample = { .timestamp = ktime_get_ns(), @@ -65,10 +66,13 @@ static void memory_collector_read(struct perf_event *event) { // Trigger the overflow handler directly struct perf_sample_data data; + struct pt_regs *regs = get_irq_regs(); perf_sample_data_init(&data, 0, 1); - memory_collector_overflow_handler(event, &data); + if (regs) { + memory_collector_overflow_handler(event, &data, regs); + } } // PMU configuration From 8463b1382cad7e46895f2cad70f7bc5b1d9aeeaa Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Thu, 6 Feb 2025 17:51:06 -0600 Subject: [PATCH 05/88] fix include for get_irq_flags() --- module/memory_collector.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/memory_collector.c b/module/memory_collector.c index 9763f6b..41ab389 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -4,6 +4,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); From 5fd581c55679e06f947c1e2cf21df30c3daa5e16 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:05:19 -0600 Subject: [PATCH 06/88] use tracepoints to output events --- module/memory_collector.c | 31 ++++++++++++++++++++++++--- module/test_pmu.sh | 44 ++++++++++++++++++--------------------- 2 files changed, 48 insertions(+), 27 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 41ab389..cc160f5 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); @@ -25,6 +27,30 @@ struct memory_collector_data { // PMU type for our custom events static struct pmu memory_collector_pmu; +// Define the tracepoint +TRACE_EVENT(memory_collector_sample, + TP_PROTO(u32 core_id, u64 timestamp, const char *comm), + + TP_ARGS(core_id, timestamp, comm), + + TP_STRUCT__entry( + __field(u32, core_id) + __field(u64, timestamp) + __array(char, comm, 16) + ), + + TP_fast_assign( + __entry->core_id = core_id; + __entry->timestamp = timestamp; + memcpy(__entry->comm, comm, 16); + ), + + TP_printk("cpu=%u timestamp=%llu comm=%s", + __entry->core_id, + __entry->timestamp, + __entry->comm) +); + // Custom overflow handler static void memory_collector_overflow_handler(struct perf_event *event, struct perf_sample_data *data, @@ -38,9 +64,8 @@ static void memory_collector_overflow_handler(struct perf_event *event, strncpy(sample.comm, current->comm, sizeof(sample.comm) - 1); sample.comm[sizeof(sample.comm) - 1] = '\0'; - // For now, just print the data - printk(KERN_DEBUG "Memory sample - CPU: %u, Time: %llu, Task: %s\n", - sample.core_id, sample.timestamp, sample.comm); + // Replace printk with tracepoint + trace_memory_collector_sample(sample.core_id, sample.timestamp, sample.comm); } // PMU callback functions diff --git a/module/test_pmu.sh b/module/test_pmu.sh index 510f9ab..995d6b6 100644 --- a/module/test_pmu.sh +++ b/module/test_pmu.sh @@ -14,42 +14,38 @@ lsmod | grep memory_collector || { exit 1 } -echo "Starting perf recording..." -sudo perf record -e memory_collector/sampling=1/ -a -o perf.data sleep 1 +echo "Setting up tracing..." +sudo trace-cmd start -e memory_collector_sample -echo "Generating perf report..." -sudo perf script -i perf.data > perf_output.txt +echo "Collecting samples for 1 second..." +sleep 1 + +echo "Stopping trace..." +sudo trace-cmd stop + +echo "Extracting trace data..." +sudo trace-cmd extract + +echo "Reading trace report..." +sudo trace-cmd report > trace_output.txt echo "Validating output..." -# Check sample count (expect ~1000 samples per CPU) +# Check if we have any trace entries +SAMPLE_COUNT=$(grep "memory_collector_sample:" trace_output.txt | wc -l) CPU_COUNT=$(nproc) EXPECTED_MIN=$((900 * CPU_COUNT)) -SAMPLE_COUNT=$(wc -l < perf_output.txt) if [ $SAMPLE_COUNT -lt $EXPECTED_MIN ]; then - echo "Error: Got $SAMPLE_COUNT samples, expected at least $EXPECTED_MIN" - exit 1 -fi - -# Validate data format -if ! grep -q "timestamp:" perf_output.txt; then - echo "Error: Missing timestamp data" - exit 1 -fi - -if ! grep -q "core_id:" perf_output.txt; then - echo "Error: Missing core_id data" - exit 1 -fi - -if ! grep -q "comm:" perf_output.txt; then - echo "Error: Missing comm data" + echo "Error: Got $SAMPLE_COUNT trace entries, expected at least $EXPECTED_MIN" exit 1 fi echo "Unloading module..." sudo rmmod memory_collector +echo "Cleaning up trace..." +sudo trace-cmd reset + echo "Test completed successfully!" echo "Sample count: $SAMPLE_COUNT" -echo "Output saved to perf_output.txt" \ No newline at end of file +echo "Output saved to trace_output.txt" \ No newline at end of file From 953f41dd96747463684f7d9fa0d49e62b3247ea4 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:06:40 -0600 Subject: [PATCH 07/88] fix creating kernel counter (needs CPU number) --- module/memory_collector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index cc160f5..8370c41 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -148,7 +148,7 @@ static int __init memory_collector_init(void) // Create a kernel counter that will drive our sampling sampling_event = perf_event_create_kernel_counter(&attr, - -1, // any CPU + 0, // any CPU NULL, // all threads memory_collector_overflow_handler, NULL); From c07e2d89a6e3644f6160b1810cf28695378ff1cd Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:10:26 -0600 Subject: [PATCH 08/88] declare tracepoint --- module/Makefile | 2 ++ module/memory_collector.c | 30 +++++++++++++++++++++--------- module/memory_collector_trace.h | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 module/memory_collector_trace.h diff --git a/module/Makefile b/module/Makefile index 8215291..9d87ed4 100644 --- a/module/Makefile +++ b/module/Makefile @@ -1,3 +1,4 @@ +EXTRA_CFLAGS += -I$(src) obj-m += memory_collector.o # Check if KVERSION is provided on command line @@ -29,5 +30,6 @@ $(BUILD_DIR): clean: $(MAKE) -C $(KDIR) M=$(BUILD_DIR) src=$(PWD) clean rm -rf $(BUILD_DIR) + rm -f Module.symvers Module.markers modules.order .PHONY: all clean \ No newline at end of file diff --git a/module/memory_collector.c b/module/memory_collector.c index 8370c41..d6f0a5d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -17,17 +17,13 @@ MODULE_VERSION("1.0"); #error "This module requires CONFIG_PERF_EVENTS" #endif -// Data structure for PMU samples -struct memory_collector_data { - u64 timestamp; // Current timestamp - u32 core_id; // CPU core number - char comm[16]; // Current task comm (name) -} __packed; +// First, declare the tracepoint +DECLARE_TRACEPOINT(memory_collector_sample); -// PMU type for our custom events -static struct pmu memory_collector_pmu; +// Define the tracepoint event +DEFINE_TRACEPOINT_CONDITION(memory_collector_sample); -// Define the tracepoint +// Define the tracepoint format TRACE_EVENT(memory_collector_sample, TP_PROTO(u32 core_id, u64 timestamp, const char *comm), @@ -51,6 +47,22 @@ TRACE_EVENT(memory_collector_sample, __entry->comm) ); +// Create a header file for the tracepoint +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE memory_collector_trace +#include + +// Data structure for PMU samples +struct memory_collector_data { + u64 timestamp; // Current timestamp + u32 core_id; // CPU core number + char comm[16]; // Current task comm (name) +} __packed; + +// PMU type for our custom events +static struct pmu memory_collector_pmu; + // Custom overflow handler static void memory_collector_overflow_handler(struct perf_event *event, struct perf_sample_data *data, diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h new file mode 100644 index 0000000..f71f59c --- /dev/null +++ b/module/memory_collector_trace.h @@ -0,0 +1,17 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memory_collector + +#if !defined(_MEMORY_COLLECTOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _MEMORY_COLLECTOR_TRACE_H + +#include + +DECLARE_TRACEPOINT(memory_collector_sample); + +#endif /* _MEMORY_COLLECTOR_TRACE_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE memory_collector_trace +#include \ No newline at end of file From 92cc151befe5361eee730d37d26caad28df94639 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:13:52 -0600 Subject: [PATCH 09/88] fix tracepoint definition --- module/memory_collector.c | 41 ++++----------------------------- module/memory_collector_trace.h | 27 ++++++++++++++++++++-- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index d6f0a5d..970571e 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -6,7 +6,6 @@ #include #include #include -#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); @@ -17,41 +16,9 @@ MODULE_VERSION("1.0"); #error "This module requires CONFIG_PERF_EVENTS" #endif -// First, declare the tracepoint -DECLARE_TRACEPOINT(memory_collector_sample); - -// Define the tracepoint event -DEFINE_TRACEPOINT_CONDITION(memory_collector_sample); - -// Define the tracepoint format -TRACE_EVENT(memory_collector_sample, - TP_PROTO(u32 core_id, u64 timestamp, const char *comm), - - TP_ARGS(core_id, timestamp, comm), - - TP_STRUCT__entry( - __field(u32, core_id) - __field(u64, timestamp) - __array(char, comm, 16) - ), - - TP_fast_assign( - __entry->core_id = core_id; - __entry->timestamp = timestamp; - memcpy(__entry->comm, comm, 16); - ), - - TP_printk("cpu=%u timestamp=%llu comm=%s", - __entry->core_id, - __entry->timestamp, - __entry->comm) -); - -// Create a header file for the tracepoint -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE memory_collector_trace -#include +// Define the tracepoint +#define CREATE_TRACE_POINTS +#include "memory_collector_trace.h" // Data structure for PMU samples struct memory_collector_data { @@ -76,7 +43,7 @@ static void memory_collector_overflow_handler(struct perf_event *event, strncpy(sample.comm, current->comm, sizeof(sample.comm) - 1); sample.comm[sizeof(sample.comm) - 1] = '\0'; - // Replace printk with tracepoint + // Trace the event trace_memory_collector_sample(sample.core_id, sample.timestamp, sample.comm); } diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index f71f59c..02778e8 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -6,12 +6,35 @@ #include -DECLARE_TRACEPOINT(memory_collector_sample); +TRACE_EVENT(memory_collector_sample, + TP_PROTO(u32 core_id, u64 timestamp, const char *comm), + + TP_ARGS(core_id, timestamp, comm), + + TP_STRUCT__entry( + __field(u32, core_id) + __field(u64, timestamp) + __array(char, comm, 16) + ), + + TP_fast_assign( + __entry->core_id = core_id; + __entry->timestamp = timestamp; + memcpy(__entry->comm, comm, 16); + ), + + TP_printk("cpu=%u timestamp=%llu comm=%s", + __entry->core_id, + __entry->timestamp, + __entry->comm) +); #endif /* _MEMORY_COLLECTOR_TRACE_H */ -/* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE memory_collector_trace + +/* This part must be outside protection */ #include \ No newline at end of file From 59234461ab566c35cbfeadec3356c31c8d9d2b05 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:17:18 -0600 Subject: [PATCH 10/88] make test_pmu.sh executable --- module/test_pmu.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 module/test_pmu.sh diff --git a/module/test_pmu.sh b/module/test_pmu.sh old mode 100644 new mode 100755 From 3af7eeb0f13096b349cd3c987d7caaabbb796578 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:22:43 -0600 Subject: [PATCH 11/88] use random run IDs for testing the PMU --- module/test_pmu.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/module/test_pmu.sh b/module/test_pmu.sh index 995d6b6..7af5eb3 100755 --- a/module/test_pmu.sh +++ b/module/test_pmu.sh @@ -1,6 +1,11 @@ #!/bin/bash set -e +# Generate random temporary filenames +RUN_ID=$(openssl rand -hex 8) +TRACE_DATA="/tmp/trace_data_$RUN_ID" +TRACE_OUTPUT="/tmp/trace_output_$RUN_ID.txt" + echo "Building kernel module..." make clean make @@ -23,15 +28,15 @@ sleep 1 echo "Stopping trace..." sudo trace-cmd stop -echo "Extracting trace data..." -sudo trace-cmd extract +echo "Extracting trace data to $TRACE_DATA..." +sudo trace-cmd extract -o "$TRACE_DATA" echo "Reading trace report..." -sudo trace-cmd report > trace_output.txt +sudo trace-cmd report -i "$TRACE_DATA" > "$TRACE_OUTPUT" echo "Validating output..." # Check if we have any trace entries -SAMPLE_COUNT=$(grep "memory_collector_sample:" trace_output.txt | wc -l) +SAMPLE_COUNT=$(grep "memory_collector_sample:" "$TRACE_OUTPUT" | wc -l) CPU_COUNT=$(nproc) EXPECTED_MIN=$((900 * CPU_COUNT)) @@ -48,4 +53,5 @@ sudo trace-cmd reset echo "Test completed successfully!" echo "Sample count: $SAMPLE_COUNT" -echo "Output saved to trace_output.txt" \ No newline at end of file +echo "Trace data: $TRACE_DATA" +echo "Trace output: $TRACE_OUTPUT" \ No newline at end of file From fb627897557634826fef0876df558be9ef0a8753 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 12:27:26 -0600 Subject: [PATCH 12/88] use IPI to collect from all CPUs --- module/memory_collector.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 970571e..16ba53d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -20,31 +20,32 @@ MODULE_VERSION("1.0"); #define CREATE_TRACE_POINTS #include "memory_collector_trace.h" -// Data structure for PMU samples -struct memory_collector_data { - u64 timestamp; // Current timestamp - u32 core_id; // CPU core number - char comm[16]; // Current task comm (name) -} __packed; - // PMU type for our custom events static struct pmu memory_collector_pmu; -// Custom overflow handler + +// New IPI handler function that will run on each CPU +static void memory_collector_ipi_handler(void *info) +{ + u32 cpu = smp_processor_id(); + + // Trace the event for this CPU + trace_memory_collector_sample(cpu, ktime_get_ns(), current->comm); +} + +// Modified overflow handler static void memory_collector_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - struct memory_collector_data sample = { - .timestamp = ktime_get_ns(), - .core_id = smp_processor_id() - }; + const struct cpumask *mask = cpu_online_mask; - strncpy(sample.comm, current->comm, sizeof(sample.comm) - 1); - sample.comm[sizeof(sample.comm) - 1] = '\0'; - // Trace the event - trace_memory_collector_sample(sample.core_id, sample.timestamp, sample.comm); + // Send IPI to all other CPUs + smp_call_function_many(mask, memory_collector_ipi_handler, NULL, 1); + + // Run the trace on this CPU + memory_collector_ipi_handler(NULL); } // PMU callback functions From 77685ca5972e255e063cff2b53ea6e33d0b7712f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 21:55:03 -0600 Subject: [PATCH 13/88] add llc miss events --- module/memory_collector.c | 147 ++++++++++++++++---------------- module/memory_collector_trace.h | 11 ++- 2 files changed, 80 insertions(+), 78 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 16ba53d..3826f66 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -20,20 +20,33 @@ MODULE_VERSION("1.0"); #define CREATE_TRACE_POINTS #include "memory_collector_trace.h" -// PMU type for our custom events -static struct pmu memory_collector_pmu; +// Keep track of the LLC miss events for each CPU +static struct perf_event **llc_miss_events; +// Keep track of the time sampling +static struct perf_event *sampling_event; -// New IPI handler function that will run on each CPU +// IPI handler function that will run on each CPU static void memory_collector_ipi_handler(void *info) { - u32 cpu = smp_processor_id(); + u64 timestamp; + u32 cpu; + u64 llc_misses, enabled, running; + + timestamp = ktime_get_ns(); + cpu = smp_processor_id(); - // Trace the event for this CPU - trace_memory_collector_sample(cpu, ktime_get_ns(), current->comm); + // Read LLC misses + if (llc_miss_events[cpu]) { + llc_misses = perf_event_read_value(llc_miss_events[cpu], &enabled, &running); + } else { + llc_misses = 0; + } + + trace_memory_collector_sample(timestamp, cpu, current->comm, llc_misses); } -// Modified overflow handler +// Overflow handler for the time sampling event static void memory_collector_overflow_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) @@ -48,68 +61,11 @@ static void memory_collector_overflow_handler(struct perf_event *event, memory_collector_ipi_handler(NULL); } -// PMU callback functions -static void memory_collector_start(struct perf_event *event, int flags) -{ - event->hw.state = 0; -} - -static void memory_collector_stop(struct perf_event *event, int flags) -{ - event->hw.state = PERF_HES_STOPPED; -} - -static int memory_collector_add(struct perf_event *event, int flags) -{ - return 0; -} - -static void memory_collector_del(struct perf_event *event, int flags) -{ -} - -static void memory_collector_read(struct perf_event *event) -{ - // Trigger the overflow handler directly - struct perf_sample_data data; - struct pt_regs *regs = get_irq_regs(); - - perf_sample_data_init(&data, 0, 1); - - if (regs) { - memory_collector_overflow_handler(event, &data, regs); - } -} - -// PMU configuration -static int memory_collector_event_init(struct perf_event *event) -{ - if (event->attr.type != memory_collector_pmu.type) - return -ENOENT; - - event->destroy = NULL; - event->hw.config = 0; - - return 0; -} - -// PMU structure -static struct pmu memory_collector_pmu = { - .task_ctx_nr = perf_sw_context, - .event_init = memory_collector_event_init, - .add = memory_collector_add, - .del = memory_collector_del, - .start = memory_collector_start, - .stop = memory_collector_stop, - .read = memory_collector_read, -}; - -// Keep track of our event -static struct perf_event *sampling_event; +// Modify init function: static int __init memory_collector_init(void) { - int ret; + int cpu, ret; struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, .size = sizeof(struct perf_event_attr), @@ -119,12 +75,6 @@ static int __init memory_collector_init(void) printk(KERN_INFO "Memory Collector: initializing PMU module\n"); - // Register PMU - ret = perf_pmu_register(&memory_collector_pmu, "memory_collector", -1); - if (ret) { - printk(KERN_ERR "Memory Collector: failed to register PMU: %d\n", ret); - return ret; - } // Create a kernel counter that will drive our sampling sampling_event = perf_event_create_kernel_counter(&attr, @@ -135,18 +85,60 @@ static int __init memory_collector_init(void) if (IS_ERR(sampling_event)) { ret = PTR_ERR(sampling_event); printk(KERN_ERR "Memory Collector: failed to create sampling event: %d\n", ret); - perf_pmu_unregister(&memory_collector_pmu); return ret; } // Enable the event perf_event_enable(sampling_event); + // Allocate array for LLC miss events + llc_miss_events = kcalloc(num_possible_cpus(), sizeof(*llc_miss_events), GFP_KERNEL); + if (!llc_miss_events) { + ret = -ENOMEM; + goto error_alloc; + } + + // Setup LLC miss event attributes + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HW_CACHE; + attr.config = PERF_COUNT_HW_CACHE_MISSES; + attr.size = sizeof(attr); + attr.disabled = 0; + attr.exclude_kernel = 0; + attr.exclude_hv = 0; + attr.exclude_idle = 0; + + // Create LLC miss counter for each CPU + for_each_possible_cpu(cpu) { + llc_miss_events[cpu] = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(llc_miss_events[cpu])) { + pr_err("Failed to create LLC miss event for CPU %d\n", cpu); + llc_miss_events[cpu] = NULL; + ret = PTR_ERR(llc_miss_events[cpu]); + goto error_events; + } + } + return 0; + +error_events: + // Cleanup LLC miss events + for_each_possible_cpu(cpu) { + if (llc_miss_events[cpu]) { + perf_event_release_kernel(llc_miss_events[cpu]); + } + } + kfree(llc_miss_events); +error_alloc: + perf_event_disable(sampling_event); + perf_event_release_kernel(sampling_event); + return ret; } static void __exit memory_collector_exit(void) { + int cpu; + printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); if (sampling_event) { @@ -154,7 +146,14 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } - perf_pmu_unregister(&memory_collector_pmu); + + // Cleanup LLC miss events + for_each_possible_cpu(cpu) { + if (llc_miss_events[cpu]) { + perf_event_release_kernel(llc_miss_events[cpu]); + } + } + kfree(llc_miss_events); } module_init(memory_collector_init); diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index 02778e8..6f1dc7a 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -7,26 +7,29 @@ #include TRACE_EVENT(memory_collector_sample, - TP_PROTO(u32 core_id, u64 timestamp, const char *comm), + TP_PROTO(u32 core_id, u64 timestamp, const char *comm, u64 llc_misses), - TP_ARGS(core_id, timestamp, comm), + TP_ARGS(core_id, timestamp, comm, llc_misses), TP_STRUCT__entry( __field(u32, core_id) __field(u64, timestamp) __array(char, comm, 16) + __field(u64, llc_misses) ), TP_fast_assign( __entry->core_id = core_id; __entry->timestamp = timestamp; memcpy(__entry->comm, comm, 16); + __entry->llc_misses = llc_misses; ), - TP_printk("cpu=%u timestamp=%llu comm=%s", + TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu", __entry->core_id, __entry->timestamp, - __entry->comm) + __entry->comm, + __entry->llc_misses) ); #endif /* _MEMORY_COLLECTOR_TRACE_H */ From 679cdced095fc3edea72ba78cfe583fac1bea2c5 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Fri, 7 Feb 2025 21:56:58 -0600 Subject: [PATCH 14/88] fix trace parameter order --- module/memory_collector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 3826f66..d30b5c8 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -43,7 +43,7 @@ static void memory_collector_ipi_handler(void *info) llc_misses = 0; } - trace_memory_collector_sample(timestamp, cpu, current->comm, llc_misses); + trace_memory_collector_sample(cpu, timestamp, current->comm, llc_misses); } // Overflow handler for the time sampling event From 5597881d407f7f35a61dbe97366b77d6e9d8586b Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 8 Feb 2025 10:32:46 -0600 Subject: [PATCH 15/88] add cycle and instruction monitoring --- module/memory_collector.c | 170 +++++++++++++++++++++++--------- module/memory_collector_trace.h | 24 ++--- 2 files changed, 137 insertions(+), 57 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index d30b5c8..36a013b 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -20,30 +20,49 @@ MODULE_VERSION("1.0"); #define CREATE_TRACE_POINTS #include "memory_collector_trace.h" -// Keep track of the LLC miss events for each CPU -static struct perf_event **llc_miss_events; - -// Keep track of the time sampling +// Add the cpu_state struct definition after the includes +struct cpu_state { + struct perf_event *llc_miss; + struct perf_event *cycles; + struct perf_event *instructions; +}; + +// Replace the llc_miss_events global with cpu_states +static struct cpu_state *cpu_states; static struct perf_event *sampling_event; -// IPI handler function that will run on each CPU +// Add the init_cpu and cleanup_cpu function declarations +static int init_cpu(int cpu); +static void cleanup_cpu(int cpu); + +// Update the IPI handler to collect all metrics static void memory_collector_ipi_handler(void *info) { u64 timestamp; u32 cpu; - u64 llc_misses, enabled, running; + u64 llc_misses = 0, cycles = 0, instructions = 0; + u64 enabled, running; timestamp = ktime_get_ns(); cpu = smp_processor_id(); // Read LLC misses - if (llc_miss_events[cpu]) { - llc_misses = perf_event_read_value(llc_miss_events[cpu], &enabled, &running); - } else { - llc_misses = 0; + if (cpu_states[cpu].llc_miss) { + llc_misses = perf_event_read_value(cpu_states[cpu].llc_miss, &enabled, &running); + } + + // Read cycles + if (cpu_states[cpu].cycles) { + cycles = perf_event_read_value(cpu_states[cpu].cycles, &enabled, &running); } - trace_memory_collector_sample(cpu, timestamp, current->comm, llc_misses); + // Read instructions + if (cpu_states[cpu].instructions) { + instructions = perf_event_read_value(cpu_states[cpu].instructions, &enabled, &running); + } + + trace_memory_collector_sample(cpu, timestamp, current->comm, + llc_misses, cycles, instructions); } // Overflow handler for the time sampling event @@ -61,8 +80,85 @@ static void memory_collector_overflow_handler(struct perf_event *event, memory_collector_ipi_handler(NULL); } +// Add the init_cpu function +static int init_cpu(int cpu) +{ + struct perf_event_attr attr; + int ret; + + // Setup LLC miss event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HW_CACHE; + attr.config = PERF_COUNT_HW_CACHE_MISSES; + attr.size = sizeof(attr); + attr.disabled = 0; + attr.exclude_kernel = 0; + attr.exclude_hv = 0; + attr.exclude_idle = 0; + + cpu_states[cpu].llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(cpu_states[cpu].llc_miss)) { + ret = PTR_ERR(cpu_states[cpu].llc_miss); + pr_err("Failed to create LLC miss event for CPU %d\n", cpu); + cpu_states[cpu].llc_miss = NULL; + goto error; + } + + // Setup cycles event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.size = sizeof(attr); + attr.disabled = 0; + + cpu_states[cpu].cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(cpu_states[cpu].cycles)) { + ret = PTR_ERR(cpu_states[cpu].cycles); + pr_err("Failed to create cycles event for CPU %d\n", cpu); + cpu_states[cpu].cycles = NULL; + goto error; + } + + // Setup instructions event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.size = sizeof(attr); + attr.disabled = 0; + + cpu_states[cpu].instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(cpu_states[cpu].instructions)) { + ret = PTR_ERR(cpu_states[cpu].instructions); + pr_err("Failed to create instructions event for CPU %d\n", cpu); + cpu_states[cpu].instructions = NULL; + goto error; + } + + return 0; + +error: + cleanup_cpu(cpu); + return ret; +} + +// Add the cleanup_cpu function +static void cleanup_cpu(int cpu) +{ + if (cpu_states[cpu].llc_miss) { + perf_event_release_kernel(cpu_states[cpu].llc_miss); + cpu_states[cpu].llc_miss = NULL; + } + if (cpu_states[cpu].cycles) { + perf_event_release_kernel(cpu_states[cpu].cycles); + cpu_states[cpu].cycles = NULL; + } + if (cpu_states[cpu].instructions) { + perf_event_release_kernel(cpu_states[cpu].instructions); + cpu_states[cpu].instructions = NULL; + } +} -// Modify init function: +// Update the init function static int __init memory_collector_init(void) { int cpu, ret; @@ -75,8 +171,7 @@ static int __init memory_collector_init(void) printk(KERN_INFO "Memory Collector: initializing PMU module\n"); - - // Create a kernel counter that will drive our sampling + // Create sampling event sampling_event = perf_event_create_kernel_counter(&attr, 0, // any CPU NULL, // all threads @@ -91,50 +186,36 @@ static int __init memory_collector_init(void) // Enable the event perf_event_enable(sampling_event); - // Allocate array for LLC miss events - llc_miss_events = kcalloc(num_possible_cpus(), sizeof(*llc_miss_events), GFP_KERNEL); - if (!llc_miss_events) { + // Allocate array for CPU states + cpu_states = kcalloc(num_possible_cpus(), sizeof(*cpu_states), GFP_KERNEL); + if (!cpu_states) { ret = -ENOMEM; goto error_alloc; } - // Setup LLC miss event attributes - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HW_CACHE; - attr.config = PERF_COUNT_HW_CACHE_MISSES; - attr.size = sizeof(attr); - attr.disabled = 0; - attr.exclude_kernel = 0; - attr.exclude_hv = 0; - attr.exclude_idle = 0; - - // Create LLC miss counter for each CPU + // Initialize each CPU for_each_possible_cpu(cpu) { - llc_miss_events[cpu] = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(llc_miss_events[cpu])) { - pr_err("Failed to create LLC miss event for CPU %d\n", cpu); - llc_miss_events[cpu] = NULL; - ret = PTR_ERR(llc_miss_events[cpu]); - goto error_events; + ret = init_cpu(cpu); + if (ret < 0) { + goto error_init; } } return 0; -error_events: - // Cleanup LLC miss events +error_init: + // Cleanup CPUs that were initialized for_each_possible_cpu(cpu) { - if (llc_miss_events[cpu]) { - perf_event_release_kernel(llc_miss_events[cpu]); - } + cleanup_cpu(cpu); } - kfree(llc_miss_events); + kfree(cpu_states); error_alloc: perf_event_disable(sampling_event); perf_event_release_kernel(sampling_event); return ret; } +// Update the exit function static void __exit memory_collector_exit(void) { int cpu; @@ -145,15 +226,12 @@ static void __exit memory_collector_exit(void) perf_event_disable(sampling_event); perf_event_release_kernel(sampling_event); } - - // Cleanup LLC miss events + // Cleanup all CPUs for_each_possible_cpu(cpu) { - if (llc_miss_events[cpu]) { - perf_event_release_kernel(llc_miss_events[cpu]); - } + cleanup_cpu(cpu); } - kfree(llc_miss_events); + kfree(cpu_states); } module_init(memory_collector_init); diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index 6f1dc7a..ae3ad5d 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -7,29 +7,31 @@ #include TRACE_EVENT(memory_collector_sample, - TP_PROTO(u32 core_id, u64 timestamp, const char *comm, u64 llc_misses), + TP_PROTO(u32 cpu, u64 timestamp, const char *comm, u64 llc_misses, u64 cycles, u64 instructions), - TP_ARGS(core_id, timestamp, comm, llc_misses), + TP_ARGS(cpu, timestamp, comm, llc_misses, cycles, instructions), TP_STRUCT__entry( - __field(u32, core_id) + __field(u32, cpu) __field(u64, timestamp) - __array(char, comm, 16) + __array(char, comm, TASK_COMM_LEN) __field(u64, llc_misses) + __field(u64, cycles) + __field(u64, instructions) ), TP_fast_assign( - __entry->core_id = core_id; + __entry->cpu = cpu; __entry->timestamp = timestamp; - memcpy(__entry->comm, comm, 16); + memcpy(__entry->comm, comm, TASK_COMM_LEN); __entry->llc_misses = llc_misses; + __entry->cycles = cycles; + __entry->instructions = instructions; ), - TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu", - __entry->core_id, - __entry->timestamp, - __entry->comm, - __entry->llc_misses) + TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu cycles=%llu instructions=%llu", + __entry->cpu, __entry->timestamp, __entry->comm, + __entry->llc_misses, __entry->cycles, __entry->instructions) ); #endif /* _MEMORY_COLLECTOR_TRACE_H */ From 38067b133afb522917fbae8356e63966aeb711dd Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 8 Feb 2025 12:28:27 -0600 Subject: [PATCH 16/88] rename sample collection function --- module/memory_collector.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 36a013b..f27c837 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -36,7 +36,7 @@ static int init_cpu(int cpu); static void cleanup_cpu(int cpu); // Update the IPI handler to collect all metrics -static void memory_collector_ipi_handler(void *info) +static void collect_sample_on_current_cpu(void *info) { u64 timestamp; u32 cpu; @@ -74,10 +74,10 @@ static void memory_collector_overflow_handler(struct perf_event *event, // Send IPI to all other CPUs - smp_call_function_many(mask, memory_collector_ipi_handler, NULL, 1); + smp_call_function_many(mask, collect_sample_on_current_cpu, NULL, 1); // Run the trace on this CPU - memory_collector_ipi_handler(NULL); + collect_sample_on_current_cpu(NULL); } // Add the init_cpu function From 26f4996791c91e527facb713889c580f4b9b8288 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 8 Feb 2025 12:31:52 -0600 Subject: [PATCH 17/88] add context switch event handler --- module/memory_collector.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index f27c837..06ee73c 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -25,6 +25,7 @@ struct cpu_state { struct perf_event *llc_miss; struct perf_event *cycles; struct perf_event *instructions; + struct perf_event *ctx_switch; // New field for context switch event }; // Replace the llc_miss_events global with cpu_states @@ -65,6 +66,15 @@ static void collect_sample_on_current_cpu(void *info) llc_misses, cycles, instructions); } +// Add context switch handler +static void context_switch_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + // Call the existing sample collection function + collect_sample_on_current_cpu(NULL); +} + // Overflow handler for the time sampling event static void memory_collector_overflow_handler(struct perf_event *event, struct perf_sample_data *data, @@ -80,7 +90,7 @@ static void memory_collector_overflow_handler(struct perf_event *event, collect_sample_on_current_cpu(NULL); } -// Add the init_cpu function +// Update init_cpu to include context switch event setup static int init_cpu(int cpu) { struct perf_event_attr attr; @@ -134,6 +144,24 @@ static int init_cpu(int cpu) goto error; } + // After setting up instructions event, add context switch event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + attr.size = sizeof(attr); + attr.disabled = 0; + attr.sample_period = 1; // Sample every context switch + + cpu_states[cpu].ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, + context_switch_handler, + NULL); + if (IS_ERR(cpu_states[cpu].ctx_switch)) { + ret = PTR_ERR(cpu_states[cpu].ctx_switch); + pr_err("Failed to create context switch event for CPU %d\n", cpu); + cpu_states[cpu].ctx_switch = NULL; + goto error; + } + return 0; error: @@ -141,7 +169,7 @@ static int init_cpu(int cpu) return ret; } -// Add the cleanup_cpu function +// Update cleanup_cpu to clean up context switch event static void cleanup_cpu(int cpu) { if (cpu_states[cpu].llc_miss) { @@ -156,6 +184,10 @@ static void cleanup_cpu(int cpu) perf_event_release_kernel(cpu_states[cpu].instructions); cpu_states[cpu].instructions = NULL; } + if (cpu_states[cpu].ctx_switch) { + perf_event_release_kernel(cpu_states[cpu].ctx_switch); + cpu_states[cpu].ctx_switch = NULL; + } } // Update the init function From 760b735c9d9ffca41da5ae263adbb8c8e918ee01 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Sat, 8 Feb 2025 12:47:40 -0600 Subject: [PATCH 18/88] add indication of whether a trace was due to a context switch --- module/memory_collector.c | 15 +++++++++------ module/memory_collector_trace.h | 10 ++++++---- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 06ee73c..47b0588 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -36,8 +36,7 @@ static struct perf_event *sampling_event; static int init_cpu(int cpu); static void cleanup_cpu(int cpu); -// Update the IPI handler to collect all metrics -static void collect_sample_on_current_cpu(void *info) +static void collect_sample_on_current_cpu(bool is_context_switch) { u64 timestamp; u32 cpu; @@ -63,7 +62,7 @@ static void collect_sample_on_current_cpu(void *info) } trace_memory_collector_sample(cpu, timestamp, current->comm, - llc_misses, cycles, instructions); + llc_misses, cycles, instructions, is_context_switch); } // Add context switch handler @@ -72,7 +71,11 @@ static void context_switch_handler(struct perf_event *event, struct pt_regs *regs) { // Call the existing sample collection function - collect_sample_on_current_cpu(NULL); + collect_sample_on_current_cpu(true); +} + +static void ipi_handler(void *info) { + collect_sample_on_current_cpu(false); } // Overflow handler for the time sampling event @@ -84,10 +87,10 @@ static void memory_collector_overflow_handler(struct perf_event *event, // Send IPI to all other CPUs - smp_call_function_many(mask, collect_sample_on_current_cpu, NULL, 1); + smp_call_function_many(mask, ipi_handler, NULL, 1); // Run the trace on this CPU - collect_sample_on_current_cpu(NULL); + ipi_handler(NULL); } // Update init_cpu to include context switch event setup diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index ae3ad5d..044d38e 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -7,9 +7,9 @@ #include TRACE_EVENT(memory_collector_sample, - TP_PROTO(u32 cpu, u64 timestamp, const char *comm, u64 llc_misses, u64 cycles, u64 instructions), + TP_PROTO(u32 cpu, u64 timestamp, const char *comm, u64 llc_misses, u64 cycles, u64 instructions, bool is_context_switch), - TP_ARGS(cpu, timestamp, comm, llc_misses, cycles, instructions), + TP_ARGS(cpu, timestamp, comm, llc_misses, cycles, instructions, is_context_switch), TP_STRUCT__entry( __field(u32, cpu) @@ -18,6 +18,7 @@ TRACE_EVENT(memory_collector_sample, __field(u64, llc_misses) __field(u64, cycles) __field(u64, instructions) + __field(bool, is_context_switch) ), TP_fast_assign( @@ -27,11 +28,12 @@ TRACE_EVENT(memory_collector_sample, __entry->llc_misses = llc_misses; __entry->cycles = cycles; __entry->instructions = instructions; + __entry->is_context_switch = is_context_switch; ), - TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu cycles=%llu instructions=%llu", + TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu cycles=%llu instructions=%llu is_context_switch=%d", __entry->cpu, __entry->timestamp, __entry->comm, - __entry->llc_misses, __entry->cycles, __entry->instructions) + __entry->llc_misses, __entry->cycles, __entry->instructions, __entry->is_context_switch) ); #endif /* _MEMORY_COLLECTOR_TRACE_H */ From f9573fd5033353151e9ab32c850317396cd0af4a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:01:31 +0000 Subject: [PATCH 19/88] add skeleton initialization of resctrl --- Dockerfile.devcontainer | 22 ++++- module/Makefile | 28 +++++- module/memory_collector.c | 11 +++ module/resctrl.c | 120 +++++++++++++++++++++++++ module/resctrl.h | 27 ++++++ module/{test_pmu.sh => test_module.sh} | 6 +- 6 files changed, 205 insertions(+), 9 deletions(-) create mode 100644 module/resctrl.c create mode 100644 module/resctrl.h rename module/{test_pmu.sh => test_module.sh} (92%) diff --git a/Dockerfile.devcontainer b/Dockerfile.devcontainer index 9f9513c..0a93a4e 100644 --- a/Dockerfile.devcontainer +++ b/Dockerfile.devcontainer @@ -3,11 +3,27 @@ FROM ubuntu:latest # Avoid prompts during package installation ENV DEBIAN_FRONTEND=noninteractive +# Add amd64 architecture +RUN dpkg --add-architecture amd64 + +# Set up repositories for both arm64 and amd64 +RUN echo "deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble main restricted universe multiverse\n\ +deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble-updates main restricted universe multiverse\n\ +deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble-backports main restricted universe multiverse\n\ +deb [arch=arm64] http://ports.ubuntu.com/ubuntu-ports noble-security main restricted universe multiverse\n\ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu noble main restricted universe multiverse\n\ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu noble-updates main restricted universe multiverse\n\ +deb [arch=amd64] http://archive.ubuntu.com/ubuntu noble-backports main restricted universe multiverse\n\ +deb [arch=amd64] http://security.ubuntu.com/ubuntu noble-security main restricted universe multiverse" > /etc/apt/sources.list + +RUN rm /etc/apt/sources.list.d/ubuntu.sources + # Update and install essential build tools and kernel headers RUN apt-get update && apt-get install -y \ build-essential \ - linux-headers-6.8.0-52-generic \ - linux-image-6.8.0-52-generic \ + crossbuild-essential-amd64 \ + linux-headers-6.8.0-52-generic:amd64 \ + linux-image-6.8.0-52-generic:amd64 \ git \ vim \ curl \ @@ -18,4 +34,4 @@ RUN apt-get update && apt-get install -y \ WORKDIR /workspace # Keep container running -CMD ["sleep", "infinity"] \ No newline at end of file +CMD ["sleep", "infinity"] \ No newline at end of file diff --git a/module/Makefile b/module/Makefile index 9d87ed4..6146f7f 100644 --- a/module/Makefile +++ b/module/Makefile @@ -1,5 +1,11 @@ EXTRA_CFLAGS += -I$(src) -obj-m += memory_collector.o + +# Define the module name and its source files +obj-m := collector.o +collector-objs := memory_collector.o resctrl.o + +# Always set architecture to x86_64 (not just x86) +ARCH := x86_64 # Check if KVERSION is provided on command line ifdef KVERSION @@ -20,15 +26,31 @@ endif KDIR := /lib/modules/$(KERNEL_VERSION)/build BUILD_DIR := $(PWD)/build +# Set cross-compilation by default in container +ifneq (,$(wildcard /.dockerenv)) + CROSS_COMPILE := x86_64-linux-gnu- +endif + +# Also set cross-compilation on ARM machines +ifeq ($(shell uname -m),arm64) + CROSS_COMPILE := x86_64-linux-gnu- +endif + all: | $(BUILD_DIR) @echo "Building for kernel version: $(KERNEL_VERSION)" - $(MAKE) -C $(KDIR) M=$(BUILD_DIR) src=$(PWD) modules + $(MAKE) -C $(KDIR) M=$(BUILD_DIR) src=$(PWD) \ + ARCH=$(ARCH) \ + CROSS_COMPILE=$(CROSS_COMPILE) \ + modules $(BUILD_DIR): mkdir -p $(BUILD_DIR) clean: - $(MAKE) -C $(KDIR) M=$(BUILD_DIR) src=$(PWD) clean + $(MAKE) -C $(KDIR) M=$(BUILD_DIR) src=$(PWD) \ + ARCH=$(ARCH) \ + CROSS_COMPILE=$(CROSS_COMPILE) \ + clean rm -rf $(BUILD_DIR) rm -f Module.symvers Module.markers modules.order diff --git a/module/memory_collector.c b/module/memory_collector.c index 47b0588..26d1145 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -6,6 +6,7 @@ #include #include #include +#include "resctrl.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); @@ -236,6 +237,13 @@ static int __init memory_collector_init(void) } } + ret = resctrl_init(); + if (ret < 0) { + pr_err("Failed to initialize resctrl: %d\n", ret); + // Add cleanup code here if needed + return ret; + } + return 0; error_init: @@ -262,6 +270,9 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } + // Call resctrl exit first + resctrl_exit(); + // Cleanup all CPUs for_each_possible_cpu(cpu) { cleanup_cpu(cpu); diff --git a/module/resctrl.c b/module/resctrl.c new file mode 100644 index 0000000..07d1952 --- /dev/null +++ b/module/resctrl.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "resctrl.h" + +MODULE_LICENSE("GPL"); + +#ifndef RESCTRL_RESERVED_RMID +#define RESCTRL_RESERVED_RMID 0 +#endif + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +/* Structure to pass RMID to IPI function */ +struct ipi_rmid_args { + u32 rmid; + int status; +}; + +/* + * IPI function to write RMID to MSR + * Called on each CPU by on_each_cpu() + */ +static void ipi_write_rmid(void *info) +{ + struct ipi_rmid_args *args = info; + u32 closid; + u64 val; + + rdmsrl(MSR_IA32_PQR_ASSOC, val); + closid = val >> 32; + + if (wrmsr_safe(MSR_IA32_PQR_ASSOC, args->rmid, closid) != 0) { + args->status = -EIO; + } else { + args->status = 0; + } +} + +int resctrl_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + struct ipi_rmid_args args = { + .rmid = cpu + 1, + .status = 0 + }; + + on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); + + if (args.status) { + pr_err("Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + ret = args.status; + break; + } + pr_info("Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + } + + return ret; +} + +int resctrl_exit(void) +{ + int failure_count = 0; + int cpu; + + struct ipi_rmid_args args = { + .rmid = RESCTRL_RESERVED_RMID, + .status = 0 + }; + + for_each_online_cpu(cpu) { + on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); + + if (args.status) { + pr_err("Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + failure_count++; + continue; + } + pr_info("Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + } + + if (failure_count > 0) { + pr_err("Failed to reset RMIDs to default on %d CPUs\n", failure_count); + return -EIO; + } + + pr_info("Successfully reset all RMIDs to default\n"); + return 0; +} + +int read_rmid_mbm(u32 rmid, u64 *val) +{ + int err; + + err = wrmsr_safe(MSR_IA32_QM_EVTSEL, + rmid, + QOS_L3_MBM_TOTAL_EVENT_ID); + if (err) + return err; + + err = rdmsrl_safe(MSR_IA32_QM_CTR, val); + if (err) + return err; + + if (*val & RMID_VAL_ERROR) + return -EIO; + if (*val & RMID_VAL_UNAVAIL) + return -EINVAL; + + return 0; +} \ No newline at end of file diff --git a/module/resctrl.h b/module/resctrl.h new file mode 100644 index 0000000..484f01b --- /dev/null +++ b/module/resctrl.h @@ -0,0 +1,27 @@ +#ifndef _COLLECTOR_RESCTRL_H_ +#define _COLLECTOR_RESCTRL_H_ + + +/* Function declarations */ + +/* + * Initialize RMIDs per CPU via IPI + * Returns 0 on success, negative error code on failure + */ +int resctrl_init(void); + +/* + * Reset all CPU RMIDs to default via IPI + * Returns 0 on success, negative error code on failure + */ +int resctrl_exit(void); + +/* + * Read memory bandwidth counter for given RMID + * val is set to the bandwidth value on success + * Returns -EIO if error occurred + * Returns -EINVAL if data unavailable + */ +int read_rmid_mbm(u32 rmid, u64 *val); + +#endif /* _COLLECTOR_RESCTRL_H_ */ \ No newline at end of file diff --git a/module/test_pmu.sh b/module/test_module.sh similarity index 92% rename from module/test_pmu.sh rename to module/test_module.sh index 7af5eb3..3672c4a 100755 --- a/module/test_pmu.sh +++ b/module/test_module.sh @@ -11,10 +11,10 @@ make clean make echo "Loading kernel module..." -sudo insmod build/memory_collector.ko +sudo insmod build/collector.ko echo "Verifying module is loaded..." -lsmod | grep memory_collector || { +lsmod | grep collector || { echo "Error: Module failed to load" exit 1 } @@ -46,7 +46,7 @@ if [ $SAMPLE_COUNT -lt $EXPECTED_MIN ]; then fi echo "Unloading module..." -sudo rmmod memory_collector +sudo rmmod collector echo "Cleaning up trace..." sudo trace-cmd reset From 119bdd3878b856feea4a9d6a07ab345673ef6794 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:13:43 +0000 Subject: [PATCH 20/88] update github workflow for kernel module --- .github/workflows/test-kernel-module.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index e257712..2002461 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -105,25 +105,25 @@ jobs: # Now try the actual build make - ls -l build/memory_collector.ko + ls -l build/collector.ko - name: Load and test module working-directory: module run: | # Load module - sudo insmod build/memory_collector.ko + sudo insmod build/collector.ko # Verify module is loaded - lsmod | grep memory_collector + lsmod | grep collector # Check kernel logs for module initialization dmesg | grep "Memory Collector" || true # Unload module - sudo rmmod memory_collector + sudo rmmod collector # Verify module unloaded successfully - if lsmod | grep -q memory_collector; then + if lsmod | grep -q collector; then echo "Error: Module still loaded" exit 1 fi From cf7ed75e9a103efad98cbb25a5c6d1a278e7d6ba Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:14:32 +0000 Subject: [PATCH 21/88] qualify kernel logs with Memory Collector --- module/resctrl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 07d1952..93b29d5 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -57,11 +57,11 @@ int resctrl_init(void) on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); if (args.status) { - pr_err("Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); ret = args.status; break; } - pr_info("Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); } return ret; @@ -81,19 +81,19 @@ int resctrl_exit(void) on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); if (args.status) { - pr_err("Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); failure_count++; continue; } - pr_info("Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); } if (failure_count > 0) { - pr_err("Failed to reset RMIDs to default on %d CPUs\n", failure_count); + pr_err("Memory Collector: Failed to reset RMIDs to default on %d CPUs\n", failure_count); return -EIO; } - pr_info("Successfully reset all RMIDs to default\n"); + pr_info("Memory Collector: Successfully reset all RMIDs to default\n"); return 0; } From 5649aecb468184e4994c14a7fa4718c6832d6e0b Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:23:31 +0000 Subject: [PATCH 22/88] add more debugging to workflow, on failure --- .github/workflows/test-kernel-module.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 2002461..52085cb 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -108,8 +108,14 @@ jobs: ls -l build/collector.ko - name: Load and test module + id: load-and-test-module working-directory: module run: | + + # Check undefined symbols + sudo modinfo -F depends build/collector.ko + sudo objdump -d build/collector.ko | grep undefined + # Load module sudo insmod build/collector.ko @@ -130,6 +136,13 @@ jobs: # Check kernel logs for cleanup message dmesg | grep "Memory Collector" || true + + - name: Check dmesg on failure + if: steps.load-and-test-module.outcome == 'failure' + run: | + echo "load and test module failed, showing last kernel messages:" + sudo dmesg | tail -n 100 + exit 1 stop-runner: name: Stop EC2 runner From ea14af5760f4c1f97e044346e39b114fc5a38b87 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:32:45 +0000 Subject: [PATCH 23/88] fix error handling in kernel module test --- .github/workflows/test-kernel-module.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 52085cb..dddd7b2 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -109,12 +109,13 @@ jobs: - name: Load and test module id: load-and-test-module + continue-on-error: true working-directory: module run: | # Check undefined symbols sudo modinfo -F depends build/collector.ko - sudo objdump -d build/collector.ko | grep undefined + sudo objdump -d build/collector.ko | grep undefined || true # Load module sudo insmod build/collector.ko From 127eb4251cdf2a778cc37ad19deeb40aa1b0459c Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 18:46:06 +0000 Subject: [PATCH 24/88] update kernel test to bare-metal machine for resctrl tests --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index dddd7b2..3dd30c2 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -26,7 +26,7 @@ jobs: mode: start github-token: ${{ secrets.REPO_ADMIN_TOKEN }} ec2-image-id: ami-0884d2865dbe9de4b # Ubuntu 22.04 LTS in us-east-2 - ec2-instance-type: t3.large + ec2-instance-type: c7i.metal-24xl market-type: spot subnet-id: ${{ secrets.AWS_SUBNET_ID }} security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }} From a987bcd4ab0505354ddea8a8b16d3fa6933a2994 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 19:06:15 +0000 Subject: [PATCH 25/88] disable resctrl_{init,exit} to see if they cause panic on bare metal --- module/memory_collector.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 26d1145..5de9a58 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -237,12 +237,12 @@ static int __init memory_collector_init(void) } } - ret = resctrl_init(); - if (ret < 0) { - pr_err("Failed to initialize resctrl: %d\n", ret); - // Add cleanup code here if needed - return ret; - } + // ret = resctrl_init(); + // if (ret < 0) { + // pr_err("Failed to initialize resctrl: %d\n", ret); + // // Add cleanup code here if needed + // return ret; + // } return 0; @@ -270,8 +270,8 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } - // Call resctrl exit first - resctrl_exit(); + // // Call resctrl exit first + // resctrl_exit(); // Cleanup all CPUs for_each_possible_cpu(cpu) { From 37f0d77f9ed9d22791be3f52d8e9faf17d4fddc3 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 19:42:31 +0000 Subject: [PATCH 26/88] try a more limited configuration of RDT MSRs --- module/memory_collector.c | 16 ++++++++-------- module/resctrl.c | 9 ++++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 5de9a58..26d1145 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -237,12 +237,12 @@ static int __init memory_collector_init(void) } } - // ret = resctrl_init(); - // if (ret < 0) { - // pr_err("Failed to initialize resctrl: %d\n", ret); - // // Add cleanup code here if needed - // return ret; - // } + ret = resctrl_init(); + if (ret < 0) { + pr_err("Failed to initialize resctrl: %d\n", ret); + // Add cleanup code here if needed + return ret; + } return 0; @@ -270,8 +270,8 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } - // // Call resctrl exit first - // resctrl_exit(); + // Call resctrl exit first + resctrl_exit(); // Cleanup all CPUs for_each_possible_cpu(cpu) { diff --git a/module/resctrl.c b/module/resctrl.c index 93b29d5..e66f911 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -30,11 +30,14 @@ struct ipi_rmid_args { static void ipi_write_rmid(void *info) { struct ipi_rmid_args *args = info; - u32 closid; + u32 closid = 0; u64 val; - rdmsrl(MSR_IA32_PQR_ASSOC, val); - closid = val >> 32; + // if we're not on CPU 2, don't do anything + if (smp_processor_id() != 2) { + args->status = 0; + return; + } if (wrmsr_safe(MSR_IA32_PQR_ASSOC, args->rmid, closid) != 0) { args->status = -EIO; From 7a85d13ac30aacf9c2d840f0bc888bc82d8d0d03 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 19:43:10 +0000 Subject: [PATCH 27/88] reduce the timeout for kernel test --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 3dd30c2..1c9532b 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -45,7 +45,7 @@ jobs: test-module: needs: start-runner runs-on: ${{ needs.start-runner.outputs.label }} - timeout-minutes: 10 # Add timeout in case system hangs + timeout-minutes: 2 # Add timeout in case system hangs steps: - name: Checkout code uses: actions/checkout@v4 From 3691c30be6a2303bc0d8e2d30b46b8c59e71555f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 19:49:46 +0000 Subject: [PATCH 28/88] make resctrl initialization almost a no-op to test kernel freeze --- module/resctrl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/resctrl.c b/module/resctrl.c index e66f911..5c0e90e 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -33,6 +33,10 @@ static void ipi_write_rmid(void *info) u32 closid = 0; u64 val; + // just return, this is to check if on_each_cpu_mask() causes CPUs to freeze + args->status = 0; + return; + // if we're not on CPU 2, don't do anything if (smp_processor_id() != 2) { args->status = 0; From c21d58128cc3b2e5a15e956ecc955ee200046419 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 20:30:13 +0000 Subject: [PATCH 29/88] try writing to the MSR (without reading the MSR) --- module/resctrl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 5c0e90e..e66f911 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -33,10 +33,6 @@ static void ipi_write_rmid(void *info) u32 closid = 0; u64 val; - // just return, this is to check if on_each_cpu_mask() causes CPUs to freeze - args->status = 0; - return; - // if we're not on CPU 2, don't do anything if (smp_processor_id() != 2) { args->status = 0; From 3793a7682ac66182764f5a6071f62110d8ee6437 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 20:58:30 +0000 Subject: [PATCH 30/88] try the wrmsr in context switch rather than IPI --- module/memory_collector.c | 21 +++++++++++++-------- module/resctrl.c | 1 - 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 26d1145..236ecff 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -73,6 +73,11 @@ static void context_switch_handler(struct perf_event *event, { // Call the existing sample collection function collect_sample_on_current_cpu(true); + + if (wrmsr_safe(MSR_IA32_PQR_ASSOC, 1, 0) != 0) { + WARN_ONCE(1, "Memory Collector: Failed to set RMID 1 on CPU %d\n", smp_processor_id()); + } + } static void ipi_handler(void *info) { @@ -237,12 +242,12 @@ static int __init memory_collector_init(void) } } - ret = resctrl_init(); - if (ret < 0) { - pr_err("Failed to initialize resctrl: %d\n", ret); - // Add cleanup code here if needed - return ret; - } + // ret = resctrl_init(); + // if (ret < 0) { + // pr_err("Failed to initialize resctrl: %d\n", ret); + // // Add cleanup code here if needed + // return ret; + // } return 0; @@ -270,8 +275,8 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } - // Call resctrl exit first - resctrl_exit(); + // // Call resctrl exit first + // resctrl_exit(); // Cleanup all CPUs for_each_possible_cpu(cpu) { diff --git a/module/resctrl.c b/module/resctrl.c index e66f911..7da6e35 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -31,7 +31,6 @@ static void ipi_write_rmid(void *info) { struct ipi_rmid_args *args = info; u32 closid = 0; - u64 val; // if we're not on CPU 2, don't do anything if (smp_processor_id() != 2) { From 2a5866d75995a319e4c3aea00d145d6ce09c74ad Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:03:10 +0000 Subject: [PATCH 31/88] run test script --- .github/workflows/test-kernel-module.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 1c9532b..e522376 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -137,7 +137,13 @@ jobs: # Check kernel logs for cleanup message dmesg | grep "Memory Collector" || true - + + - name: Run module test script + working-directory: module + run: | + + ./test_module.sh + - name: Check dmesg on failure if: steps.load-and-test-module.outcome == 'failure' run: | From baabdab39f592d986a5ed408491c5c201b703bbc Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:03:29 +0000 Subject: [PATCH 32/88] expose a few lines at the beginning and end of trace --- module/test_module.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/module/test_module.sh b/module/test_module.sh index 3672c4a..3bdbb5d 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -34,6 +34,12 @@ sudo trace-cmd extract -o "$TRACE_DATA" echo "Reading trace report..." sudo trace-cmd report -i "$TRACE_DATA" > "$TRACE_OUTPUT" +echo "Head of trace report:" +head "$TRACE_OUTPUT" + +echo "Tail of trace report:" +tail "$TRACE_OUTPUT" + echo "Validating output..." # Check if we have any trace entries SAMPLE_COUNT=$(grep "memory_collector_sample:" "$TRACE_OUTPUT" | wc -l) From 7566b7eebd3eec30f3ddf5e6330213dcc2087db8 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:04:54 +0000 Subject: [PATCH 33/88] fix indentation in github action --- .github/workflows/test-kernel-module.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index e522376..87be9bf 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -138,11 +138,11 @@ jobs: # Check kernel logs for cleanup message dmesg | grep "Memory Collector" || true - - name: Run module test script - working-directory: module - run: | + - name: Run module test script + working-directory: module + run: | - ./test_module.sh + ./test_module.sh - name: Check dmesg on failure if: steps.load-and-test-module.outcome == 'failure' From 84c4f9635667a93126c5d90ba87ebb0f85c6d652 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:17:56 +0000 Subject: [PATCH 34/88] mount resctrl and check capabilities before loading kernel module --- .github/workflows/test-kernel-module.yaml | 29 +++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 87be9bf..184d917 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -107,6 +107,35 @@ jobs: ls -l build/collector.ko + - name: Check RDT Capabilities + run: | + sudo mkdir -p /sys/fs/resctrl + sudo mount -t resctrl resctrl /sys/fs/resctrl || true + + echo "Mounting resctrl filesystem" + mount | grep resctrl || true + + echo "Checking RDT capabilities" + ls /sys/fs/resctrl/info || true + + echo "Monitoring features:" + cat /sys/fs/resctrl/info/L3_MON/mon_features || true + + echo "Number of available RMIDs:" + cat /sys/fs/resctrl/info/L3_MON/num_rmids || true + + echo "Number of CAT classes:" + cat /sys/fs/resctrl/info/L3/num_closids || true + + echo "head -n 35 /proc/cpuinfo:" + head -n 35 /proc/cpuinfo || true + + echo "CPU RDT features (head):" + grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo || head + + # we do not unmount, maybe mounting affects the intel_cqm checks below + #sudo umount /sys/fs/resctrl || true + - name: Load and test module id: load-and-test-module continue-on-error: true From aea64f16a16cc174952e7c38d798a96463b06d3d Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:26:35 +0000 Subject: [PATCH 35/88] try disabling preemption around the MSR write --- module/memory_collector.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 236ecff..5f7aeef 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -71,13 +71,17 @@ static void context_switch_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + int res; // Call the existing sample collection function collect_sample_on_current_cpu(true); - if (wrmsr_safe(MSR_IA32_PQR_ASSOC, 1, 0) != 0) { - WARN_ONCE(1, "Memory Collector: Failed to set RMID 1 on CPU %d\n", smp_processor_id()); - } + preempt_disable(); + res = wrmsr_safe(MSR_IA32_PQR_ASSOC, 1, 0); + preempt_enable(); + if (res != 0) { + WARN_ONCE(1, "Memory Collector: Failed to set RMID 1 on CPU %d, error %d\n", smp_processor_id(), res); + } } static void ipi_handler(void *info) { From ec71c7ca45f528aabd15ca424a123835f16a1c60 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 21:43:39 +0000 Subject: [PATCH 36/88] call wrmsr rather than wrmsr_safe --- module/memory_collector.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 5f7aeef..635e96d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -71,17 +71,12 @@ static void context_switch_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - int res; // Call the existing sample collection function collect_sample_on_current_cpu(true); preempt_disable(); - res = wrmsr_safe(MSR_IA32_PQR_ASSOC, 1, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 1, 0); preempt_enable(); - - if (res != 0) { - WARN_ONCE(1, "Memory Collector: Failed to set RMID 1 on CPU %d, error %d\n", smp_processor_id(), res); - } } static void ipi_handler(void *info) { From bd171c1639b28ba9097e95d71322947e1250641f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 22:01:21 +0000 Subject: [PATCH 37/88] show CPU features for only 10 cores --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 184d917..5546d14 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -131,7 +131,7 @@ jobs: head -n 35 /proc/cpuinfo || true echo "CPU RDT features (head):" - grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo || head + grep -E "cat_l3|cdp_l3|cqm_occup_llc|cqm_mbm_total|cqm_mbm_local" /proc/cpuinfo | head || true # we do not unmount, maybe mounting affects the intel_cqm checks below #sudo umount /sys/fs/resctrl || true From 796d28a279c40ab61b83dd03edf222d5765ceb9d Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 22:01:35 +0000 Subject: [PATCH 38/88] write 0,0 to PQR_ASSOC --- module/memory_collector.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 635e96d..1f08d1a 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -71,11 +71,14 @@ static void context_switch_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + u32 rmid = 0; + u32 clos = 0; + // Call the existing sample collection function collect_sample_on_current_cpu(true); preempt_disable(); - wrmsr(MSR_IA32_PQR_ASSOC, 1, 0); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, clos); preempt_enable(); } From d81b1c7305b71bb9fe8e1a22e87e2207ad3c3925 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 22:41:27 +0000 Subject: [PATCH 39/88] add cpuid enumeration as specified in the Intel SDM vol 3 (19.18.3) --- module/memory_collector.c | 24 ++++------- module/resctrl.c | 89 +++++++++++++++++++++++++++++---------- 2 files changed, 75 insertions(+), 38 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 1f08d1a..73ad9d3 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -71,15 +71,8 @@ static void context_switch_handler(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { - u32 rmid = 0; - u32 clos = 0; - // Call the existing sample collection function collect_sample_on_current_cpu(true); - - preempt_disable(); - wrmsr(MSR_IA32_PQR_ASSOC, rmid, clos); - preempt_enable(); } static void ipi_handler(void *info) { @@ -244,15 +237,16 @@ static int __init memory_collector_init(void) } } - // ret = resctrl_init(); - // if (ret < 0) { - // pr_err("Failed to initialize resctrl: %d\n", ret); - // // Add cleanup code here if needed - // return ret; - // } + ret = resctrl_init(); + if (ret < 0) { + pr_err("Failed to initialize resctrl: %d\n", ret); + goto error_resctrl; + } return 0; +error_resctrl: + resctrl_exit(); error_init: // Cleanup CPUs that were initialized for_each_possible_cpu(cpu) { @@ -277,8 +271,8 @@ static void __exit memory_collector_exit(void) perf_event_release_kernel(sampling_event); } - // // Call resctrl exit first - // resctrl_exit(); + // Call resctrl exit first + resctrl_exit(); // Cleanup all CPUs for_each_possible_cpu(cpu) { diff --git a/module/resctrl.c b/module/resctrl.c index 7da6e35..96a995f 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -45,26 +45,69 @@ static void ipi_write_rmid(void *info) } } +static int enumerate_cpuid(void) +{ + u32 max_leaf = 0; + u32 has_rdt = 0; + u32 has_cmt = 0; + u32 highest_rmid = 0; + + max_leaf = cpuid_eax(0); + pr_info("Memory Collector: max_leaf: %u\n", max_leaf); + + if (max_leaf < 7) { + pr_err("Memory Collector: max_leaf is less than 7\n"); + return -EINVAL; + } + + has_rdt = (cpuid_ebx(0x7) >> 12) & 0x1; + pr_info("Memory Collector: has_rdt: %u\n", has_rdt); + + if (!has_rdt) { + pr_err("Memory Collector: has_rdt is 0\n"); + return -EINVAL; + } + + has_cmt = (cpuid_edx(0xf) >> 1) & 0x1; + pr_info("Memory Collector: has_cmt: %u\n", has_cmt); + + if (!has_cmt) { + pr_err("Memory Collector: has_cmt is 0\n"); + return -EINVAL; + } + + highest_rmid = cpuid_ebx(0xf); + pr_info("Memory Collector: highest_rmid: %u\n", highest_rmid); + + return 0; +} + int resctrl_init(void) { int cpu; int ret = 0; - - for_each_online_cpu(cpu) { - struct ipi_rmid_args args = { - .rmid = cpu + 1, - .status = 0 - }; + + ret = enumerate_cpuid(); + if (ret) { + pr_err("Memory Collector: Failed to enumerate CPUID\n"); + return ret; + } + + // for_each_online_cpu(cpu) { + // struct ipi_rmid_args args = { + // .rmid = cpu + 1, + // .status = 0 + // }; - on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); + // on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); - if (args.status) { - pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); - ret = args.status; - break; - } - pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); - } + // if (args.status) { + // pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + // ret = args.status; + // break; + // } + // pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + // } return ret; } @@ -79,16 +122,16 @@ int resctrl_exit(void) .status = 0 }; - for_each_online_cpu(cpu) { - on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); + // for_each_online_cpu(cpu) { + // on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); - if (args.status) { - pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); - failure_count++; - continue; - } - pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); - } + // if (args.status) { + // pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); + // failure_count++; + // continue; + // } + // pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); + // } if (failure_count > 0) { pr_err("Memory Collector: Failed to reset RMIDs to default on %d CPUs\n", failure_count); From 560bb4d8e21461705dea075325b016a2a50faf9e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 17:11:57 -0600 Subject: [PATCH 40/88] add more debug prints in enumerate_cpuid --- module/resctrl.c | 50 ++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 96a995f..a48ce5b 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -47,39 +47,39 @@ static void ipi_write_rmid(void *info) static int enumerate_cpuid(void) { - u32 max_leaf = 0; - u32 has_rdt = 0; - u32 has_cmt = 0; - u32 highest_rmid = 0; - - max_leaf = cpuid_eax(0); - pr_info("Memory Collector: max_leaf: %u\n", max_leaf); + pr_info("memory_collector: Starting enumerate_cpuid\n"); + + unsigned int eax, ebx, ecx, edx; + int ret = 0; - if (max_leaf < 7) { - pr_err("Memory Collector: max_leaf is less than 7\n"); - return -EINVAL; + if (!boot_cpu_has(X86_FEATURE_RESCTRL_QOSMON)) { + pr_err("memory_collector: CPU does not support QoS monitoring\n"); + return -ENODEV; } - has_rdt = (cpuid_ebx(0x7) >> 12) & 0x1; - pr_info("Memory Collector: has_rdt: %u\n", has_rdt); - - if (!has_rdt) { - pr_err("Memory Collector: has_rdt is 0\n"); - return -EINVAL; + pr_debug("memory_collector: Checking CPUID.0x7.0 for RDT support\n"); + cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); + if (!(ebx & (1 << 12))) { + pr_err("memory_collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + return -ENODEV; } - has_cmt = (cpuid_edx(0xf) >> 1) & 0x1; - pr_info("Memory Collector: has_cmt: %u\n", has_cmt); + pr_debug("memory_collector: Checking CPUID.0xF.0 for L3 monitoring\n"); + cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); + if (!(edx & (1 << 1))) { + pr_err("memory_collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + return -ENODEV; + } - if (!has_cmt) { - pr_err("Memory Collector: has_cmt is 0\n"); - return -EINVAL; + pr_debug("memory_collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); + cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); + if (!(edx & (1 << 0))) { + pr_err("memory_collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); + return -ENODEV; } - - highest_rmid = cpuid_ebx(0xf); - pr_info("Memory Collector: highest_rmid: %u\n", highest_rmid); - return 0; + pr_info("memory_collector: enumerate_cpuid completed successfully\n"); + return ret; } int resctrl_init(void) From e2323253bbbaae41a1f76f8f20e23d40fdec2f5a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:22:31 +0000 Subject: [PATCH 41/88] add incremental checking of cpuid --- .github/workflows/test-kernel-module.yaml | 5 ++ module/resctrl.c | 71 +++++++++++++++++++++-- run_module.sh | 23 ++++++++ 3 files changed, 93 insertions(+), 6 deletions(-) create mode 100644 run_module.sh diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 5546d14..ccc9b1d 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -135,6 +135,11 @@ jobs: # we do not unmount, maybe mounting affects the intel_cqm checks below #sudo umount /sys/fs/resctrl || true + + - name: Run the interval dmesg reader + working-directory: module + run: | + ./run_module.sh - name: Load and test module id: load-and-test-module diff --git a/module/resctrl.c b/module/resctrl.c index a48ce5b..41e1675 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "resctrl.h" @@ -23,6 +25,11 @@ struct ipi_rmid_args { int status; }; +/* Add these declarations at the top after the includes */ +static struct timer_list cpuid_check_timer; +static atomic_t check_counter = ATOMIC_INIT(0); +#define MAX_CHECKS 6 + /* * IPI function to write RMID to MSR * Called on each CPU by on_each_cpu() @@ -52,7 +59,7 @@ static int enumerate_cpuid(void) unsigned int eax, ebx, ecx, edx; int ret = 0; - if (!boot_cpu_has(X86_FEATURE_RESCTRL_QOSMON)) { + if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { pr_err("memory_collector: CPU does not support QoS monitoring\n"); return -ENODEV; } @@ -82,16 +89,61 @@ static int enumerate_cpuid(void) return ret; } +static void cpuid_check_timer_callback(struct timer_list *t) +{ + int check_num = atomic_inc_return(&check_counter); + unsigned int eax, ebx, ecx, edx; + + pr_info("memory_collector: Running CPUID check %d of %d\n", check_num, MAX_CHECKS); + + switch (check_num) { + case 1: + if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { + pr_err("memory_collector: CPU does not support QoS monitoring\n"); + } + break; + case 2: + pr_debug("memory_collector: Checking CPUID.0x7.0 for RDT support\n"); + cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); + if (!(ebx & (1 << 12))) { + pr_err("memory_collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + } + break; + case 3: + pr_debug("memory_collector: Checking CPUID.0xF.0 for L3 monitoring\n"); + cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); + if (!(edx & (1 << 1))) { + pr_err("memory_collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + } + break; + case 4: + pr_debug("memory_collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); + cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); + if (!(edx & (1 << 0))) { + pr_err("memory_collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); + } + break; + default: + pr_info("memory_collector: Completed all %d CPUID checks\n", MAX_CHECKS); + break; + } + + +} + int resctrl_init(void) { int cpu; int ret = 0; - ret = enumerate_cpuid(); - if (ret) { - pr_err("Memory Collector: Failed to enumerate CPUID\n"); - return ret; - } + // Initialize the timer + timer_setup(&cpuid_check_timer, cpuid_check_timer_callback, 0); + + // ret = enumerate_cpuid(); + // if (ret) { + // pr_err("Memory Collector: Failed to enumerate CPUID\n"); + // return ret; + // } // for_each_online_cpu(cpu) { // struct ipi_rmid_args args = { @@ -108,12 +160,19 @@ int resctrl_init(void) // } // pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); // } + + // Start the timer for first check in 5 seconds + mod_timer(&cpuid_check_timer, jiffies + (5 * HZ)); + pr_info("memory_collector: Started periodic CPUID checks\n"); return ret; } int resctrl_exit(void) { + // Delete the timer first + del_timer_sync(&cpuid_check_timer); + int failure_count = 0; int cpu; diff --git a/run_module.sh b/run_module.sh new file mode 100644 index 0000000..e35ddf1 --- /dev/null +++ b/run_module.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Clear dmesg buffer +sudo dmesg -c > /dev/null + +# Load the module +sudo insmod module/collector.ko + +# Store the timestamp when we started +start_time=$(date +%s) + +echo "Module loaded, monitoring dmesg output..." +echo "Press Ctrl+C to stop monitoring" + +while true; do + # Get new dmesg entries and timestamp them + sudo dmesg -c | while read line; do + current_time=$(($(date +%s) - start_time)) + echo "[$current_time sec] $line" + done + + sleep 1 +done \ No newline at end of file From 1525612ceae76d2a4422757bb2698b0603855a1c Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:28:02 +0000 Subject: [PATCH 42/88] move run_module.sh, fix permissions and path --- run_module.sh => module/run_module.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename run_module.sh => module/run_module.sh (93%) mode change 100644 => 100755 diff --git a/run_module.sh b/module/run_module.sh old mode 100644 new mode 100755 similarity index 93% rename from run_module.sh rename to module/run_module.sh index e35ddf1..dc46f34 --- a/run_module.sh +++ b/module/run_module.sh @@ -4,7 +4,7 @@ sudo dmesg -c > /dev/null # Load the module -sudo insmod module/collector.ko +sudo insmod build/collector.ko # Store the timestamp when we started start_time=$(date +%s) From 2cab06a488a19b10044f1f637b50e654de265a7d Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:29:52 +0000 Subject: [PATCH 43/88] fix race where timer fires before the per-cpu structs are initialized --- module/memory_collector.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 73ad9d3..2f7dbc3 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -205,7 +205,7 @@ static int __init memory_collector_init(void) .config = PERF_COUNT_SW_CPU_CLOCK, }; - printk(KERN_INFO "Memory Collector: initializing PMU module\n"); + printk(KERN_INFO "Memory Collector: initializing\n"); // Create sampling event sampling_event = perf_event_create_kernel_counter(&attr, @@ -219,9 +219,6 @@ static int __init memory_collector_init(void) return ret; } - // Enable the event - perf_event_enable(sampling_event); - // Allocate array for CPU states cpu_states = kcalloc(num_possible_cpus(), sizeof(*cpu_states), GFP_KERNEL); if (!cpu_states) { @@ -243,9 +240,13 @@ static int __init memory_collector_init(void) goto error_resctrl; } + // Enable the samplingevent + perf_event_enable(sampling_event); + return 0; error_resctrl: + perf_event_disable(sampling_event); resctrl_exit(); error_init: // Cleanup CPUs that were initialized @@ -254,7 +255,6 @@ static int __init memory_collector_init(void) } kfree(cpu_states); error_alloc: - perf_event_disable(sampling_event); perf_event_release_kernel(sampling_event); return ret; } From 33996a295083ee37b554dd2ab84a465daccb07f4 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:43:42 +0000 Subject: [PATCH 44/88] reorder initialization to avoid timer firing before all CPUs are ready --- module/memory_collector.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 2f7dbc3..262b6e6 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -207,18 +207,6 @@ static int __init memory_collector_init(void) printk(KERN_INFO "Memory Collector: initializing\n"); - // Create sampling event - sampling_event = perf_event_create_kernel_counter(&attr, - 0, // any CPU - NULL, // all threads - memory_collector_overflow_handler, - NULL); - if (IS_ERR(sampling_event)) { - ret = PTR_ERR(sampling_event); - printk(KERN_ERR "Memory Collector: failed to create sampling event: %d\n", ret); - return ret; - } - // Allocate array for CPU states cpu_states = kcalloc(num_possible_cpus(), sizeof(*cpu_states), GFP_KERNEL); if (!cpu_states) { @@ -230,7 +218,7 @@ static int __init memory_collector_init(void) for_each_possible_cpu(cpu) { ret = init_cpu(cpu); if (ret < 0) { - goto error_init; + goto error_cpu_init; } } @@ -240,22 +228,35 @@ static int __init memory_collector_init(void) goto error_resctrl; } + // Create sampling event + sampling_event = perf_event_create_kernel_counter(&attr, + 0, // any CPU + NULL, // all threads + memory_collector_overflow_handler, + NULL); + if (IS_ERR(sampling_event)) { + ret = PTR_ERR(sampling_event); + printk(KERN_ERR "Memory Collector: failed to create sampling event: %d\n", ret); + goto error_sampling; + } + // Enable the samplingevent perf_event_enable(sampling_event); return 0; -error_resctrl: +error_sampling: perf_event_disable(sampling_event); + perf_event_release_kernel(sampling_event); +error_resctrl: resctrl_exit(); -error_init: +error_cpu_init: // Cleanup CPUs that were initialized for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } kfree(cpu_states); error_alloc: - perf_event_release_kernel(sampling_event); return ret; } From da36ceeb309e82fed6f526b1a070e1765922f98f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:58:36 +0000 Subject: [PATCH 45/88] disable context switch monitoring to elminate nested handling --- module/memory_collector.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 262b6e6..aac227e 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -156,15 +156,16 @@ static int init_cpu(int cpu) attr.disabled = 0; attr.sample_period = 1; // Sample every context switch - cpu_states[cpu].ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, - context_switch_handler, - NULL); - if (IS_ERR(cpu_states[cpu].ctx_switch)) { - ret = PTR_ERR(cpu_states[cpu].ctx_switch); - pr_err("Failed to create context switch event for CPU %d\n", cpu); - cpu_states[cpu].ctx_switch = NULL; - goto error; - } + // cpu_states[cpu].ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, + // context_switch_handler, + // NULL); + // if (IS_ERR(cpu_states[cpu].ctx_switch)) { + // ret = PTR_ERR(cpu_states[cpu].ctx_switch); + // pr_err("Failed to create context switch event for CPU %d\n", cpu); + // cpu_states[cpu].ctx_switch = NULL; + // goto error; + // } + cpu_states[cpu].ctx_switch = NULL; return 0; From 50146a88d45ae3194d0483db1d2cbe93c7cb3700 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Mon, 10 Feb 2025 23:58:49 +0000 Subject: [PATCH 46/88] reset the timer to see the next results --- module/resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/resctrl.c b/module/resctrl.c index 41e1675..c0edfe5 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -128,7 +128,7 @@ static void cpuid_check_timer_callback(struct timer_list *t) break; } - + mod_timer(&cpuid_check_timer, jiffies + (5 * HZ)); } int resctrl_init(void) From d8bf94fb8e267e0bbc77cb029a1d564c30111cd2 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 00:01:57 +0000 Subject: [PATCH 47/88] add workflow parameter for instance type --- .github/workflows/test-kernel-module.yaml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index ccc9b1d..773151a 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -1,5 +1,12 @@ name: Test Kernel Module -on: workflow_dispatch # Manual trigger for testing +on: + workflow_dispatch: # Manual trigger for testing + inputs: + instance-type: + description: 'EC2 instance type to use' + required: false + default: 'c7i.metal-24xl' + type: string permissions: id-token: write # Required for requesting the JWT @@ -26,7 +33,7 @@ jobs: mode: start github-token: ${{ secrets.REPO_ADMIN_TOKEN }} ec2-image-id: ami-0884d2865dbe9de4b # Ubuntu 22.04 LTS in us-east-2 - ec2-instance-type: c7i.metal-24xl + ec2-instance-type: ${{ inputs.instance-type }} market-type: spot subnet-id: ${{ secrets.AWS_SUBNET_ID }} security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }} From ebc67cedadbab2ac152d4af1ae30158f60d43cf8 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 00:28:14 +0000 Subject: [PATCH 48/88] defer initialization of the sampling timer --- module/memory_collector.c | 52 +++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index aac227e..81c411a 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -195,10 +195,8 @@ static void cleanup_cpu(int cpu) } } -// Update the init function -static int __init memory_collector_init(void) +static void enable_sampling_work(struct work_struct *work) { - int cpu, ret; struct perf_event_attr attr = { .type = PERF_TYPE_SOFTWARE, .size = sizeof(struct perf_event_attr), @@ -206,6 +204,32 @@ static int __init memory_collector_init(void) .config = PERF_COUNT_SW_CPU_CLOCK, }; + pr_info("Memory Collector: enabling time-based sampling\n"); + + // Create sampling event + sampling_event = perf_event_create_kernel_counter(&attr, + 0, // any CPU + NULL, // all threads + memory_collector_overflow_handler, + NULL); + if (IS_ERR(sampling_event)) { + pr_err("Memory Collector: failed to create sampling event: %ld\n", PTR_ERR(sampling_event)); + sampling_event = NULL; + return; + } + + // Enable the samplingevent + perf_event_enable(sampling_event); +} + +static DECLARE_DELAYED_WORK(enable_sampling_delayed, enable_sampling_work); + + +// Update the init function +static int __init memory_collector_init(void) +{ + int cpu, ret; + printk(KERN_INFO "Memory Collector: initializing\n"); // Allocate array for CPU states @@ -229,26 +253,16 @@ static int __init memory_collector_init(void) goto error_resctrl; } - // Create sampling event - sampling_event = perf_event_create_kernel_counter(&attr, - 0, // any CPU - NULL, // all threads - memory_collector_overflow_handler, - NULL); - if (IS_ERR(sampling_event)) { - ret = PTR_ERR(sampling_event); - printk(KERN_ERR "Memory Collector: failed to create sampling event: %d\n", ret); - goto error_sampling; + ret = schedule_delayed_work(&enable_sampling_delayed, msecs_to_jiffies(1000)); // 1 second delay + if (!ret) { + // unexpected since the work should not already be on a queue + pr_err("Memory Collector: failed to schedule sampling work\n"); + goto error_schedule; } - // Enable the samplingevent - perf_event_enable(sampling_event); - return 0; -error_sampling: - perf_event_disable(sampling_event); - perf_event_release_kernel(sampling_event); +error_schedule: error_resctrl: resctrl_exit(); error_cpu_init: From a50d1f64cce3602e2bc68c1c295bbaac40a91a96 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 01:29:51 +0000 Subject: [PATCH 49/88] disable LLC misses reading, to check if they are causing the kernel trace warning --- module/memory_collector.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 81c411a..301b0e2 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -47,10 +47,10 @@ static void collect_sample_on_current_cpu(bool is_context_switch) timestamp = ktime_get_ns(); cpu = smp_processor_id(); - // Read LLC misses - if (cpu_states[cpu].llc_miss) { - llc_misses = perf_event_read_value(cpu_states[cpu].llc_miss, &enabled, &running); - } + // // Read LLC misses + // if (cpu_states[cpu].llc_miss) { + // llc_misses = perf_event_read_value(cpu_states[cpu].llc_miss, &enabled, &running); + // } // Read cycles if (cpu_states[cpu].cycles) { From faabf6659937cf7f75cfbc407ccf4233ee806dbd Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 01:31:18 +0000 Subject: [PATCH 50/88] loop for 30 seconds, then unload module --- module/run_module.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/run_module.sh b/module/run_module.sh index dc46f34..8b4547c 100755 --- a/module/run_module.sh +++ b/module/run_module.sh @@ -12,7 +12,7 @@ start_time=$(date +%s) echo "Module loaded, monitoring dmesg output..." echo "Press Ctrl+C to stop monitoring" -while true; do +for i in {1..30}; do # Get new dmesg entries and timestamp them sudo dmesg -c | while read line; do current_time=$(($(date +%s) - start_time)) @@ -20,4 +20,6 @@ while true; do done sleep 1 -done \ No newline at end of file +done + +sudo rmmod collector From ff205bd6ad99f96bc075796e98413873242ade3f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 04:01:25 +0000 Subject: [PATCH 51/88] use an hrtimer on each cpu to output measurements to avoid IPI problems --- module/memory_collector.c | 186 ++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 99 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 301b0e2..9cbddc9 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -21,17 +21,17 @@ MODULE_VERSION("1.0"); #define CREATE_TRACE_POINTS #include "memory_collector_trace.h" -// Add the cpu_state struct definition after the includes +// Replace the cpu_state struct and global variable definitions struct cpu_state { struct perf_event *llc_miss; struct perf_event *cycles; struct perf_event *instructions; - struct perf_event *ctx_switch; // New field for context switch event + struct perf_event *ctx_switch; + struct hrtimer timer; + ktime_t next_expected; }; -// Replace the llc_miss_events global with cpu_states -static struct cpu_state *cpu_states; -static struct perf_event *sampling_event; +static struct cpu_state __percpu *cpu_states; // Add the init_cpu and cleanup_cpu function declarations static int init_cpu(int cpu); @@ -75,30 +75,12 @@ static void context_switch_handler(struct perf_event *event, collect_sample_on_current_cpu(true); } -static void ipi_handler(void *info) { - collect_sample_on_current_cpu(false); -} - -// Overflow handler for the time sampling event -static void memory_collector_overflow_handler(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - const struct cpumask *mask = cpu_online_mask; - - - // Send IPI to all other CPUs - smp_call_function_many(mask, ipi_handler, NULL, 1); - - // Run the trace on this CPU - ipi_handler(NULL); -} - // Update init_cpu to include context switch event setup static int init_cpu(int cpu) { struct perf_event_attr attr; int ret; + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); // Setup LLC miss event memset(&attr, 0, sizeof(attr)); @@ -110,11 +92,11 @@ static int init_cpu(int cpu) attr.exclude_hv = 0; attr.exclude_idle = 0; - cpu_states[cpu].llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(cpu_states[cpu].llc_miss)) { - ret = PTR_ERR(cpu_states[cpu].llc_miss); + state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->llc_miss)) { + ret = PTR_ERR(state->llc_miss); pr_err("Failed to create LLC miss event for CPU %d\n", cpu); - cpu_states[cpu].llc_miss = NULL; + state->llc_miss = NULL; goto error; } @@ -125,11 +107,11 @@ static int init_cpu(int cpu) attr.size = sizeof(attr); attr.disabled = 0; - cpu_states[cpu].cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(cpu_states[cpu].cycles)) { - ret = PTR_ERR(cpu_states[cpu].cycles); + state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->cycles)) { + ret = PTR_ERR(state->cycles); pr_err("Failed to create cycles event for CPU %d\n", cpu); - cpu_states[cpu].cycles = NULL; + state->cycles = NULL; goto error; } @@ -140,11 +122,11 @@ static int init_cpu(int cpu) attr.size = sizeof(attr); attr.disabled = 0; - cpu_states[cpu].instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(cpu_states[cpu].instructions)) { - ret = PTR_ERR(cpu_states[cpu].instructions); + state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->instructions)) { + ret = PTR_ERR(state->instructions); pr_err("Failed to create instructions event for CPU %d\n", cpu); - cpu_states[cpu].instructions = NULL; + state->instructions = NULL; goto error; } @@ -156,16 +138,15 @@ static int init_cpu(int cpu) attr.disabled = 0; attr.sample_period = 1; // Sample every context switch - // cpu_states[cpu].ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, - // context_switch_handler, - // NULL); - // if (IS_ERR(cpu_states[cpu].ctx_switch)) { - // ret = PTR_ERR(cpu_states[cpu].ctx_switch); - // pr_err("Failed to create context switch event for CPU %d\n", cpu); - // cpu_states[cpu].ctx_switch = NULL; - // goto error; - // } - cpu_states[cpu].ctx_switch = NULL; + state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, + context_switch_handler, + NULL); + if (IS_ERR(state->ctx_switch)) { + ret = PTR_ERR(state->ctx_switch); + pr_err("Failed to create context switch event for CPU %d\n", cpu); + state->ctx_switch = NULL; + goto error; + } return 0; @@ -177,53 +158,62 @@ static int init_cpu(int cpu) // Update cleanup_cpu to clean up context switch event static void cleanup_cpu(int cpu) { - if (cpu_states[cpu].llc_miss) { - perf_event_release_kernel(cpu_states[cpu].llc_miss); - cpu_states[cpu].llc_miss = NULL; + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + if (state->ctx_switch) { + perf_event_release_kernel(state->ctx_switch); + state->ctx_switch = NULL; } - if (cpu_states[cpu].cycles) { - perf_event_release_kernel(cpu_states[cpu].cycles); - cpu_states[cpu].cycles = NULL; + if (state->llc_miss) { + perf_event_release_kernel(state->llc_miss); + state->llc_miss = NULL; } - if (cpu_states[cpu].instructions) { - perf_event_release_kernel(cpu_states[cpu].instructions); - cpu_states[cpu].instructions = NULL; + if (state->cycles) { + perf_event_release_kernel(state->cycles); + state->cycles = NULL; } - if (cpu_states[cpu].ctx_switch) { - perf_event_release_kernel(cpu_states[cpu].ctx_switch); - cpu_states[cpu].ctx_switch = NULL; + if (state->instructions) { + perf_event_release_kernel(state->instructions); + state->instructions = NULL; } } -static void enable_sampling_work(struct work_struct *work) +static enum hrtimer_restart timer_fn(struct hrtimer *timer) { - struct perf_event_attr attr = { - .type = PERF_TYPE_SOFTWARE, - .size = sizeof(struct perf_event_attr), - .sample_period = 1000000, // 1ms - .config = PERF_COUNT_SW_CPU_CLOCK, - }; - - pr_info("Memory Collector: enabling time-based sampling\n"); + struct cpu_state *state = container_of(timer, struct cpu_state, timer); + ktime_t now = ktime_get(); - // Create sampling event - sampling_event = perf_event_create_kernel_counter(&attr, - 0, // any CPU - NULL, // all threads - memory_collector_overflow_handler, - NULL); - if (IS_ERR(sampling_event)) { - pr_err("Memory Collector: failed to create sampling event: %ld\n", PTR_ERR(sampling_event)); - sampling_event = NULL; - return; - } - - // Enable the samplingevent - perf_event_enable(sampling_event); + // Collect the sample + collect_sample_on_current_cpu(false); + + // Schedule next timer + state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); + state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, + (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / + NSEC_PER_MSEC * NSEC_PER_MSEC); + + hrtimer_forward_now(timer, state->next_expected); + return HRTIMER_RESTART; } -static DECLARE_DELAYED_WORK(enable_sampling_delayed, enable_sampling_work); - +static void start_cpu_timer(void *info) +{ + struct cpu_state *state; + ktime_t now; + + state = this_cpu_ptr(cpu_states); + + hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + state->timer.function = timer_fn; + + // Start timer at next millisecond boundary + now = ktime_get(); + state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); + state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, + (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / + NSEC_PER_MSEC * NSEC_PER_MSEC); + + hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); +} // Update the init function static int __init memory_collector_init(void) @@ -232,8 +222,8 @@ static int __init memory_collector_init(void) printk(KERN_INFO "Memory Collector: initializing\n"); - // Allocate array for CPU states - cpu_states = kcalloc(num_possible_cpus(), sizeof(*cpu_states), GFP_KERNEL); + // Allocate percpu array + cpu_states = alloc_percpu(struct cpu_state); if (!cpu_states) { ret = -ENOMEM; goto error_alloc; @@ -253,24 +243,19 @@ static int __init memory_collector_init(void) goto error_resctrl; } - ret = schedule_delayed_work(&enable_sampling_delayed, msecs_to_jiffies(1000)); // 1 second delay - if (!ret) { - // unexpected since the work should not already be on a queue - pr_err("Memory Collector: failed to schedule sampling work\n"); - goto error_schedule; - } + + // Start timers on all CPUs + on_each_cpu(start_cpu_timer, NULL, 1); return 0; -error_schedule: error_resctrl: resctrl_exit(); error_cpu_init: - // Cleanup CPUs that were initialized for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } - kfree(cpu_states); + free_percpu(cpu_states); error_alloc: return ret; } @@ -282,19 +267,22 @@ static void __exit memory_collector_exit(void) printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); - if (sampling_event) { - perf_event_disable(sampling_event); - perf_event_release_kernel(sampling_event); + + // Cancel timers on all CPUs + for_each_possible_cpu(cpu) { + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + hrtimer_cancel(&state->timer); } - // Call resctrl exit first + // Call resctrl exit resctrl_exit(); // Cleanup all CPUs for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } - kfree(cpu_states); + + free_percpu(cpu_states); } module_init(memory_collector_init); From 6558b6c3e28fc547c6a81f95e6e351991217560f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 04:11:08 +0000 Subject: [PATCH 52/88] fix access to percpu array --- module/memory_collector.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 9cbddc9..6176c1d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -46,20 +46,21 @@ static void collect_sample_on_current_cpu(bool is_context_switch) timestamp = ktime_get_ns(); cpu = smp_processor_id(); + struct cpu_state *state = this_cpu_ptr(cpu_states); - // // Read LLC misses - // if (cpu_states[cpu].llc_miss) { - // llc_misses = perf_event_read_value(cpu_states[cpu].llc_miss, &enabled, &running); - // } + // Read LLC misses + if (state->llc_miss) { + llc_misses = perf_event_read_value(state->llc_miss, &enabled, &running); + } // Read cycles - if (cpu_states[cpu].cycles) { - cycles = perf_event_read_value(cpu_states[cpu].cycles, &enabled, &running); + if (state->cycles) { + cycles = perf_event_read_value(state->cycles, &enabled, &running); } // Read instructions - if (cpu_states[cpu].instructions) { - instructions = perf_event_read_value(cpu_states[cpu].instructions, &enabled, &running); + if (state->instructions) { + instructions = perf_event_read_value(state->instructions, &enabled, &running); } trace_memory_collector_sample(cpu, timestamp, current->comm, From 535a6f55e5b7a3ad5e968a68f53a790ecd421bd1 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 04:26:28 +0000 Subject: [PATCH 53/88] add debug prints --- module/memory_collector.c | 5 +++-- module/resctrl.c | 36 +++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 6176c1d..d62c422 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -231,6 +231,7 @@ static int __init memory_collector_init(void) } // Initialize each CPU + pr_info("Memory Collector: initializing per-cpu perf events\n"); for_each_possible_cpu(cpu) { ret = init_cpu(cpu); if (ret < 0) { @@ -238,14 +239,14 @@ static int __init memory_collector_init(void) } } + pr_info("Memory Collector: initializing resctrl\n"); ret = resctrl_init(); if (ret < 0) { pr_err("Failed to initialize resctrl: %d\n", ret); goto error_resctrl; } - - // Start timers on all CPUs + pr_info("Memory Collector: starting timers on all CPUs\n"); on_each_cpu(start_cpu_timer, NULL, 1); return 0; diff --git a/module/resctrl.c b/module/resctrl.c index c0edfe5..ed64c67 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -54,38 +54,38 @@ static void ipi_write_rmid(void *info) static int enumerate_cpuid(void) { - pr_info("memory_collector: Starting enumerate_cpuid\n"); + pr_info("Memory Collector: Starting enumerate_cpuid\n"); unsigned int eax, ebx, ecx, edx; int ret = 0; if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { - pr_err("memory_collector: CPU does not support QoS monitoring\n"); + pr_err("Memory Collector: CPU does not support QoS monitoring\n"); return -ENODEV; } pr_debug("memory_collector: Checking CPUID.0x7.0 for RDT support\n"); cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); if (!(ebx & (1 << 12))) { - pr_err("memory_collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + pr_err("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); return -ENODEV; } pr_debug("memory_collector: Checking CPUID.0xF.0 for L3 monitoring\n"); cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 1))) { - pr_err("memory_collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + pr_err("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); return -ENODEV; } - pr_debug("memory_collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); + pr_debug("Memory Collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 0))) { - pr_err("memory_collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); + pr_err("Memory Collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); return -ENODEV; } - pr_info("memory_collector: enumerate_cpuid completed successfully\n"); + pr_info("Memory Collector: enumerate_cpuid completed successfully\n"); return ret; } @@ -94,37 +94,37 @@ static void cpuid_check_timer_callback(struct timer_list *t) int check_num = atomic_inc_return(&check_counter); unsigned int eax, ebx, ecx, edx; - pr_info("memory_collector: Running CPUID check %d of %d\n", check_num, MAX_CHECKS); + pr_info("Memory Collector: Running CPUID check %d of %d\n", check_num, MAX_CHECKS); switch (check_num) { case 1: if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { - pr_err("memory_collector: CPU does not support QoS monitoring\n"); + pr_err("Memory Collector: CPU does not support QoS monitoring\n"); } break; case 2: - pr_debug("memory_collector: Checking CPUID.0x7.0 for RDT support\n"); + pr_debug("Memory Collector: Checking CPUID.0x7.0 for RDT support\n"); cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); if (!(ebx & (1 << 12))) { - pr_err("memory_collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + pr_err("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); } break; case 3: - pr_debug("memory_collector: Checking CPUID.0xF.0 for L3 monitoring\n"); + pr_debug("Memory Collector: Checking CPUID.0xF.0 for L3 monitoring\n"); cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 1))) { - pr_err("memory_collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + pr_err("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); } break; case 4: - pr_debug("memory_collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); + pr_debug("Memory Collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 0))) { - pr_err("memory_collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); + pr_err("Memory Collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); } break; default: - pr_info("memory_collector: Completed all %d CPUID checks\n", MAX_CHECKS); + pr_info("Memory Collector: Completed all %d CPUID checks\n", MAX_CHECKS); break; } @@ -136,6 +136,8 @@ int resctrl_init(void) int cpu; int ret = 0; + pr_info("Memory Collector: Initializing resctrl\n"); + // Initialize the timer timer_setup(&cpuid_check_timer, cpuid_check_timer_callback, 0); @@ -163,7 +165,7 @@ int resctrl_init(void) // Start the timer for first check in 5 seconds mod_timer(&cpuid_check_timer, jiffies + (5 * HZ)); - pr_info("memory_collector: Started periodic CPUID checks\n"); + pr_info("Memory Collector: Started periodic CPUID checks\n"); return ret; } From 368d397314139b89c37e8a753234e9912d6b4c1c Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 04:46:18 +0000 Subject: [PATCH 54/88] initialize the perf events on the CPU where they will be used --- module/memory_collector.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index d62c422..ff6462b 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -34,7 +34,6 @@ struct cpu_state { static struct cpu_state __percpu *cpu_states; // Add the init_cpu and cleanup_cpu function declarations -static int init_cpu(int cpu); static void cleanup_cpu(int cpu); static void collect_sample_on_current_cpu(bool is_context_switch) @@ -77,11 +76,17 @@ static void context_switch_handler(struct perf_event *event, } // Update init_cpu to include context switch event setup -static int init_cpu(int cpu) +static void init_cpu(void *info) { struct perf_event_attr attr; int ret; - struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + int cpu = smp_processor_id(); + struct cpu_state *state = this_cpu_ptr(cpu_states); + + state->llc_miss = NULL; + state->cycles = NULL; + state->instructions = NULL; + state->ctx_switch = NULL; // Setup LLC miss event memset(&attr, 0, sizeof(attr)); @@ -149,11 +154,9 @@ static int init_cpu(int cpu) goto error; } - return 0; - + return; error: cleanup_cpu(cpu); - return ret; } // Update cleanup_cpu to clean up context switch event @@ -232,9 +235,13 @@ static int __init memory_collector_init(void) // Initialize each CPU pr_info("Memory Collector: initializing per-cpu perf events\n"); + on_each_cpu(init_cpu, NULL, 1); + + pr_info("Memory Collector: checking per-cpu perf events\n"); for_each_possible_cpu(cpu) { - ret = init_cpu(cpu); - if (ret < 0) { + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + if (state->ctx_switch == NULL) { + // there was an error during one of the init_cpu calls goto error_cpu_init; } } From 1ab3ed244eac06f7c7626072bfdb8d93861dc6ea Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 05:05:04 +0000 Subject: [PATCH 55/88] use work queues to set up monitoring --- module/memory_collector.c | 92 ++++++++++++++++++++++++--------------- 1 file changed, 56 insertions(+), 36 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index ff6462b..96355d4 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -7,6 +7,7 @@ #include #include #include "resctrl.h" +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Memory Collector Project"); @@ -33,8 +34,11 @@ struct cpu_state { static struct cpu_state __percpu *cpu_states; -// Add the init_cpu and cleanup_cpu function declarations static void cleanup_cpu(int cpu); +static enum hrtimer_restart timer_fn(struct hrtimer *timer); + +// Add workqueue declaration at the top with other global variables +static struct workqueue_struct *collector_wq; static void collect_sample_on_current_cpu(bool is_context_switch) { @@ -75,13 +79,14 @@ static void context_switch_handler(struct perf_event *event, collect_sample_on_current_cpu(true); } -// Update init_cpu to include context switch event setup -static void init_cpu(void *info) +// Merge init_cpu and start_cpu_timer into a single function +static void init_cpu_state(struct work_struct *work) { struct perf_event_attr attr; int ret; int cpu = smp_processor_id(); struct cpu_state *state = this_cpu_ptr(cpu_states); + ktime_t now; state->llc_miss = NULL; state->cycles = NULL; @@ -136,17 +141,17 @@ static void init_cpu(void *info) goto error; } - // After setting up instructions event, add context switch event + // Setup context switch event memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; attr.size = sizeof(attr); attr.disabled = 0; - attr.sample_period = 1; // Sample every context switch + attr.sample_period = 1; state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, - context_switch_handler, - NULL); + context_switch_handler, + NULL); if (IS_ERR(state->ctx_switch)) { ret = PTR_ERR(state->ctx_switch); pr_err("Failed to create context switch event for CPU %d\n", cpu); @@ -154,7 +159,19 @@ static void init_cpu(void *info) goto error; } + // Initialize and start the timer (moved from start_cpu_timer) + hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + state->timer.function = timer_fn; + + now = ktime_get(); + state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); + state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, + (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / + NSEC_PER_MSEC * NSEC_PER_MSEC); + + hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); return; + error: cleanup_cpu(cpu); } @@ -199,30 +216,11 @@ static enum hrtimer_restart timer_fn(struct hrtimer *timer) return HRTIMER_RESTART; } -static void start_cpu_timer(void *info) -{ - struct cpu_state *state; - ktime_t now; - - state = this_cpu_ptr(cpu_states); - - hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - state->timer.function = timer_fn; - - // Start timer at next millisecond boundary - now = ktime_get(); - state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); - state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, - (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / - NSEC_PER_MSEC * NSEC_PER_MSEC); - - hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); -} - -// Update the init function +// Update the init function to use workqueue static int __init memory_collector_init(void) { int cpu, ret; + struct work_struct __percpu *cpu_works; printk(KERN_INFO "Memory Collector: initializing\n"); @@ -233,15 +231,37 @@ static int __init memory_collector_init(void) goto error_alloc; } - // Initialize each CPU + // Create workqueue + collector_wq = alloc_workqueue("collector_wq", WQ_UNBOUND, 0); + if (!collector_wq) { + ret = -ENOMEM; + goto error_wq; + } + + // Allocate per-CPU work structures + cpu_works = alloc_percpu(struct work_struct); + if (!cpu_works) { + ret = -ENOMEM; + goto error_work_alloc; + } + + // Initialize and queue work for each CPU pr_info("Memory Collector: initializing per-cpu perf events\n"); - on_each_cpu(init_cpu, NULL, 1); + for_each_possible_cpu(cpu) { + struct work_struct *work = per_cpu_ptr(cpu_works, cpu); + INIT_WORK(work, init_cpu_state); + queue_work_on(cpu, collector_wq, work); + } + + // Wait for all work to complete + flush_workqueue(collector_wq); + free_percpu(cpu_works); + // Check initialization results pr_info("Memory Collector: checking per-cpu perf events\n"); for_each_possible_cpu(cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); if (state->ctx_switch == NULL) { - // there was an error during one of the init_cpu calls goto error_cpu_init; } } @@ -253,9 +273,6 @@ static int __init memory_collector_init(void) goto error_resctrl; } - pr_info("Memory Collector: starting timers on all CPUs\n"); - on_each_cpu(start_cpu_timer, NULL, 1); - return 0; error_resctrl: @@ -264,19 +281,21 @@ static int __init memory_collector_init(void) for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } +error_work_alloc: + destroy_workqueue(collector_wq); +error_wq: free_percpu(cpu_states); error_alloc: return ret; } -// Update the exit function +// Update the exit function to destroy workqueue static void __exit memory_collector_exit(void) { int cpu; printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); - // Cancel timers on all CPUs for_each_possible_cpu(cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); @@ -291,6 +310,7 @@ static void __exit memory_collector_exit(void) cleanup_cpu(cpu); } + destroy_workqueue(collector_wq); free_percpu(cpu_states); } From 88700243246f0089b411cc1b1a9f1f80cd7da74e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 14:14:20 +0000 Subject: [PATCH 56/88] fortify hrtimer init and cleanup --- module/memory_collector.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 96355d4..dd5809a 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -180,6 +180,7 @@ static void init_cpu_state(struct work_struct *work) static void cleanup_cpu(int cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + hrtimer_cancel(&state->timer); if (state->ctx_switch) { perf_event_release_kernel(state->ctx_switch); state->ctx_switch = NULL; @@ -230,6 +231,16 @@ static int __init memory_collector_init(void) ret = -ENOMEM; goto error_alloc; } + // initialize the cpu_states, so we can query whether each variable is initialized + for_each_possible_cpu(cpu) { + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + state->llc_miss = NULL; + state->cycles = NULL; + state->instructions = NULL; + state->ctx_switch = NULL; + hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + state->timer.function = timer_fn; + } // Create workqueue collector_wq = alloc_workqueue("collector_wq", WQ_UNBOUND, 0); @@ -296,12 +307,6 @@ static void __exit memory_collector_exit(void) printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); - // Cancel timers on all CPUs - for_each_possible_cpu(cpu) { - struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); - hrtimer_cancel(&state->timer); - } - // Call resctrl exit resctrl_exit(); From e3856e61d3036fa01ab89894b254047c27ae2765 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 14:14:42 +0000 Subject: [PATCH 57/88] disable all perf events to check stability without perf --- module/memory_collector.c | 104 +++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index dd5809a..d4163d6 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -103,61 +103,61 @@ static void init_cpu_state(struct work_struct *work) attr.exclude_hv = 0; attr.exclude_idle = 0; - state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->llc_miss)) { - ret = PTR_ERR(state->llc_miss); - pr_err("Failed to create LLC miss event for CPU %d\n", cpu); - state->llc_miss = NULL; - goto error; - } - - // Setup cycles event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HARDWARE; - attr.config = PERF_COUNT_HW_CPU_CYCLES; - attr.size = sizeof(attr); - attr.disabled = 0; + // state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + // if (IS_ERR(state->llc_miss)) { + // ret = PTR_ERR(state->llc_miss); + // pr_err("Failed to create LLC miss event for CPU %d\n", cpu); + // state->llc_miss = NULL; + // goto error; + // } + + // // Setup cycles event + // memset(&attr, 0, sizeof(attr)); + // attr.type = PERF_TYPE_HARDWARE; + // attr.config = PERF_COUNT_HW_CPU_CYCLES; + // attr.size = sizeof(attr); + // attr.disabled = 0; - state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->cycles)) { - ret = PTR_ERR(state->cycles); - pr_err("Failed to create cycles event for CPU %d\n", cpu); - state->cycles = NULL; - goto error; - } - - // Setup instructions event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HARDWARE; - attr.config = PERF_COUNT_HW_INSTRUCTIONS; - attr.size = sizeof(attr); - attr.disabled = 0; + // state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + // if (IS_ERR(state->cycles)) { + // ret = PTR_ERR(state->cycles); + // pr_err("Failed to create cycles event for CPU %d\n", cpu); + // state->cycles = NULL; + // goto error; + // } + + // // Setup instructions event + // memset(&attr, 0, sizeof(attr)); + // attr.type = PERF_TYPE_HARDWARE; + // attr.config = PERF_COUNT_HW_INSTRUCTIONS; + // attr.size = sizeof(attr); + // attr.disabled = 0; - state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->instructions)) { - ret = PTR_ERR(state->instructions); - pr_err("Failed to create instructions event for CPU %d\n", cpu); - state->instructions = NULL; - goto error; - } - - // Setup context switch event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_SOFTWARE; - attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; - attr.size = sizeof(attr); - attr.disabled = 0; - attr.sample_period = 1; + // state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + // if (IS_ERR(state->instructions)) { + // ret = PTR_ERR(state->instructions); + // pr_err("Failed to create instructions event for CPU %d\n", cpu); + // state->instructions = NULL; + // goto error; + // } + + // // Setup context switch event + // memset(&attr, 0, sizeof(attr)); + // attr.type = PERF_TYPE_SOFTWARE; + // attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + // attr.size = sizeof(attr); + // attr.disabled = 0; + // attr.sample_period = 1; - state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, - context_switch_handler, - NULL); - if (IS_ERR(state->ctx_switch)) { - ret = PTR_ERR(state->ctx_switch); - pr_err("Failed to create context switch event for CPU %d\n", cpu); - state->ctx_switch = NULL; - goto error; - } + // state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, + // context_switch_handler, + // NULL); + // if (IS_ERR(state->ctx_switch)) { + // ret = PTR_ERR(state->ctx_switch); + // pr_err("Failed to create context switch event for CPU %d\n", cpu); + // state->ctx_switch = NULL; + // goto error; + // } // Initialize and start the timer (moved from start_cpu_timer) hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); From 721c81f4704211e3413842ebe1f8b2d60f625fea Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 14:19:00 +0000 Subject: [PATCH 58/88] continue test even if cannot create resctrl directory --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 773151a..3f2f8c9 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -116,7 +116,7 @@ jobs: - name: Check RDT Capabilities run: | - sudo mkdir -p /sys/fs/resctrl + sudo mkdir -p /sys/fs/resctrl || true sudo mount -t resctrl resctrl /sys/fs/resctrl || true echo "Mounting resctrl filesystem" From fb16a6ce6e6ad5fb131a33d88d5872166b7fcfd9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 14:44:52 +0000 Subject: [PATCH 59/88] disable hrtimer --- module/memory_collector.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index d4163d6..a649dfb 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -159,17 +159,17 @@ static void init_cpu_state(struct work_struct *work) // goto error; // } - // Initialize and start the timer (moved from start_cpu_timer) - hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - state->timer.function = timer_fn; + // // Initialize and start the timer (moved from start_cpu_timer) + // hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + // state->timer.function = timer_fn; - now = ktime_get(); - state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); - state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, - (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / - NSEC_PER_MSEC * NSEC_PER_MSEC); + // now = ktime_get(); + // state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); + // state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, + // (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / + // NSEC_PER_MSEC * NSEC_PER_MSEC); - hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); + // hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); return; error: From 07501ac489c98efb62804289a24f08e33223dba0 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 14:45:41 +0000 Subject: [PATCH 60/88] enforce that init_cpu_state runs only on the intended CPU --- module/memory_collector.c | 43 +++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index a649dfb..682810d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -37,7 +37,8 @@ static struct cpu_state __percpu *cpu_states; static void cleanup_cpu(int cpu); static enum hrtimer_restart timer_fn(struct hrtimer *timer); -// Add workqueue declaration at the top with other global variables +// Add global cpu_works declaration after other global variables +static struct work_struct __percpu *cpu_works; static struct workqueue_struct *collector_wq; static void collect_sample_on_current_cpu(bool is_context_switch) @@ -79,14 +80,24 @@ static void context_switch_handler(struct perf_event *event, collect_sample_on_current_cpu(true); } -// Merge init_cpu and start_cpu_timer into a single function +// Modify init_cpu_state to verify the work struct matches the current CPU static void init_cpu_state(struct work_struct *work) { struct perf_event_attr attr; int ret; int cpu = smp_processor_id(); - struct cpu_state *state = this_cpu_ptr(cpu_states); - ktime_t now; + struct work_struct *expected_work = per_cpu_ptr(cpu_works, cpu); + struct cpu_state *state; + + // Verify this work matches the expected work for this CPU + if (work != expected_work) { + pr_err("CPU mismatch in init_cpu_state. Expected work %px for CPU %d, got %px\n", + expected_work, cpu, work); + return; + } + + state = this_cpu_ptr(cpu_states); + pr_info("init_cpu_state for CPU %d\n", cpu); state->llc_miss = NULL; state->cycles = NULL; @@ -180,6 +191,9 @@ static void init_cpu_state(struct work_struct *work) static void cleanup_cpu(int cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + + pr_debug("cleanup_cpu for CPU %d\n", cpu); + hrtimer_cancel(&state->timer); if (state->ctx_switch) { perf_event_release_kernel(state->ctx_switch); @@ -221,17 +235,19 @@ static enum hrtimer_restart timer_fn(struct hrtimer *timer) static int __init memory_collector_init(void) { int cpu, ret; - struct work_struct __percpu *cpu_works; printk(KERN_INFO "Memory Collector: initializing\n"); + cpu_works = NULL; + // Allocate percpu array cpu_states = alloc_percpu(struct cpu_state); if (!cpu_states) { ret = -ENOMEM; goto error_alloc; } - // initialize the cpu_states, so we can query whether each variable is initialized + + // Initialize the cpu_states for_each_possible_cpu(cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); state->llc_miss = NULL; @@ -249,7 +265,7 @@ static int __init memory_collector_init(void) goto error_wq; } - // Allocate per-CPU work structures + // Allocate per-CPU work structures (now global) cpu_works = alloc_percpu(struct work_struct); if (!cpu_works) { ret = -ENOMEM; @@ -267,6 +283,8 @@ static int __init memory_collector_init(void) // Wait for all work to complete flush_workqueue(collector_wq); free_percpu(cpu_works); + cpu_works = NULL; + pr_info("Memory Collector: workqueue flushed\n"); // Check initialization results pr_info("Memory Collector: checking per-cpu perf events\n"); @@ -292,6 +310,10 @@ static int __init memory_collector_init(void) for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } + if (cpu_works) { + free_percpu(cpu_works); + cpu_works = NULL; + } error_work_alloc: destroy_workqueue(collector_wq); error_wq: @@ -307,16 +329,19 @@ static void __exit memory_collector_exit(void) printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); - // Call resctrl exit resctrl_exit(); - // Cleanup all CPUs for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } destroy_workqueue(collector_wq); free_percpu(cpu_states); + if (cpu_works) { + // should be NULL already in any execution outcome of init, but adding for clarity + free_percpu(cpu_works); + cpu_works = NULL; + } } module_init(memory_collector_init); From 863e00ba3ed5e5b3eb790c344f0f4d463cbadd10 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 15:02:08 +0000 Subject: [PATCH 61/88] stop failing the insert when context switches are not set up --- module/memory_collector.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 682810d..c75ccbb 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -286,14 +286,15 @@ static int __init memory_collector_init(void) cpu_works = NULL; pr_info("Memory Collector: workqueue flushed\n"); - // Check initialization results - pr_info("Memory Collector: checking per-cpu perf events\n"); - for_each_possible_cpu(cpu) { - struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); - if (state->ctx_switch == NULL) { - goto error_cpu_init; - } - } + // // Check initialization results + // pr_info("Memory Collector: checking per-cpu perf events\n"); + // for_each_possible_cpu(cpu) { + // struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + // if (state->ctx_switch == NULL) { + // res = -ENODEV; + // goto error_cpu_init; + // } + // } pr_info("Memory Collector: initializing resctrl\n"); ret = resctrl_init(); @@ -302,6 +303,7 @@ static int __init memory_collector_init(void) goto error_resctrl; } + pr_info("Memory Collector: initialization completed\n"); return 0; error_resctrl: @@ -319,6 +321,7 @@ static int __init memory_collector_init(void) error_wq: free_percpu(cpu_states); error_alloc: + pr_err("Memory Collector: initialization failed, ret = %d\n", ret); return ret; } @@ -334,14 +337,14 @@ static void __exit memory_collector_exit(void) for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } - - destroy_workqueue(collector_wq); - free_percpu(cpu_states); if (cpu_works) { // should be NULL already in any execution outcome of init, but adding for clarity free_percpu(cpu_works); cpu_works = NULL; } + + destroy_workqueue(collector_wq); + free_percpu(cpu_states); } module_init(memory_collector_init); From 6d83546911f4177b115cb701f2ef2b2b08891c97 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 09:38:39 -0600 Subject: [PATCH 62/88] have workqueue enforce locality of work items --- module/memory_collector.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index c75ccbb..ed9f377 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -91,13 +91,13 @@ static void init_cpu_state(struct work_struct *work) // Verify this work matches the expected work for this CPU if (work != expected_work) { - pr_err("CPU mismatch in init_cpu_state. Expected work %px for CPU %d, got %px\n", - expected_work, cpu, work); + pr_err("CPU mismatch in init_cpu_state. On CPU %d, expected work %px, got %px\n", + cpu, expected_work, work); return; } state = this_cpu_ptr(cpu_states); - pr_info("init_cpu_state for CPU %d\n", cpu); + pr_info("init_cpu_state for CPU %d, got work %px\n", cpu, work); state->llc_miss = NULL; state->cycles = NULL; @@ -259,7 +259,7 @@ static int __init memory_collector_init(void) } // Create workqueue - collector_wq = alloc_workqueue("collector_wq", WQ_UNBOUND, 0); + collector_wq = alloc_workqueue("collector_wq", 0, 0); if (!collector_wq) { ret = -ENOMEM; goto error_wq; From aec0cb600be8cf62d5ea5e7ccfa0406bd57eccd1 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 15:40:48 +0000 Subject: [PATCH 63/88] only try to enqueue work on online CPUs. when enqueueing on CPUs that have never been online, `queue_work_on` says users get a splat. --- module/memory_collector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index ed9f377..5ef6f2e 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -274,7 +274,7 @@ static int __init memory_collector_init(void) // Initialize and queue work for each CPU pr_info("Memory Collector: initializing per-cpu perf events\n"); - for_each_possible_cpu(cpu) { + for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(cpu_works, cpu); INIT_WORK(work, init_cpu_state); queue_work_on(cpu, collector_wq, work); From f51979ae95dcb5f16ec30769a1fbbe9fed323961 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 15:57:46 +0000 Subject: [PATCH 64/88] install trace-cmd before running a test that requires it --- .github/workflows/test-kernel-module.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 3f2f8c9..43bf292 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -179,6 +179,10 @@ jobs: # Check kernel logs for cleanup message dmesg | grep "Memory Collector" || true + - name: Install trace dependencies + run: | + sudo apt-get install -y trace-cmd + - name: Run module test script working-directory: module run: | From 1866536f5a10483f4eff2d65c22a8af95544a12c Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:08:23 +0000 Subject: [PATCH 65/88] start timers --- module/memory_collector.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 5ef6f2e..eeaf805 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -170,17 +170,16 @@ static void init_cpu_state(struct work_struct *work) // goto error; // } - // // Initialize and start the timer (moved from start_cpu_timer) - // hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - // state->timer.function = timer_fn; + // Initialize and start the timer (moved from start_cpu_timer) + hrtimer_setup(&state->timer, timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); - // now = ktime_get(); - // state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); - // state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, - // (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / - // NSEC_PER_MSEC * NSEC_PER_MSEC); + now = ktime_get(); + state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); + state->next_expected = ktime_set(ktime_to_ns(state->next_expected) / NSEC_PER_SEC, + (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / + NSEC_PER_MSEC * NSEC_PER_MSEC); - // hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS); + hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS_PINNED); return; error: @@ -195,6 +194,7 @@ static void cleanup_cpu(int cpu) pr_debug("cleanup_cpu for CPU %d\n", cpu); hrtimer_cancel(&state->timer); + if (state->ctx_switch) { perf_event_release_kernel(state->ctx_switch); state->ctx_switch = NULL; @@ -254,8 +254,7 @@ static int __init memory_collector_init(void) state->cycles = NULL; state->instructions = NULL; state->ctx_switch = NULL; - hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - state->timer.function = timer_fn; + hrtimer_setup(&state->timer, timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); } // Create workqueue From 20463adc7172a6790a61ae52a8213260f606aae0 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:12:11 +0000 Subject: [PATCH 66/88] use hrtimer_init rather than hrtimer_setup --- module/memory_collector.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index eeaf805..695f2cb 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -88,6 +88,7 @@ static void init_cpu_state(struct work_struct *work) int cpu = smp_processor_id(); struct work_struct *expected_work = per_cpu_ptr(cpu_works, cpu); struct cpu_state *state; + ktime_t now; // Verify this work matches the expected work for this CPU if (work != expected_work) { @@ -171,7 +172,8 @@ static void init_cpu_state(struct work_struct *work) // } // Initialize and start the timer (moved from start_cpu_timer) - hrtimer_setup(&state->timer, timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + state->timer.function = timer_fn; now = ktime_get(); state->next_expected = ktime_add_ns(now, NSEC_PER_MSEC); @@ -254,7 +256,8 @@ static int __init memory_collector_init(void) state->cycles = NULL; state->instructions = NULL; state->ctx_switch = NULL; - hrtimer_setup(&state->timer, timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + state->timer.function = timer_fn; } // Create workqueue From 36bc46b30337c8d709ab1c21a5082ebdad1662a5 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:20:48 +0000 Subject: [PATCH 67/88] enable perf counters --- module/memory_collector.c | 100 +++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 50 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 695f2cb..259b703 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -115,61 +115,61 @@ static void init_cpu_state(struct work_struct *work) attr.exclude_hv = 0; attr.exclude_idle = 0; - // state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - // if (IS_ERR(state->llc_miss)) { - // ret = PTR_ERR(state->llc_miss); - // pr_err("Failed to create LLC miss event for CPU %d\n", cpu); - // state->llc_miss = NULL; - // goto error; - // } + state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->llc_miss)) { + ret = PTR_ERR(state->llc_miss); + pr_err("Failed to create LLC miss event for CPU %d\n", cpu); + state->llc_miss = NULL; + goto error; + } - // // Setup cycles event - // memset(&attr, 0, sizeof(attr)); - // attr.type = PERF_TYPE_HARDWARE; - // attr.config = PERF_COUNT_HW_CPU_CYCLES; - // attr.size = sizeof(attr); - // attr.disabled = 0; + // Setup cycles event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_CPU_CYCLES; + attr.size = sizeof(attr); + attr.disabled = 0; - // state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - // if (IS_ERR(state->cycles)) { - // ret = PTR_ERR(state->cycles); - // pr_err("Failed to create cycles event for CPU %d\n", cpu); - // state->cycles = NULL; - // goto error; - // } + state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->cycles)) { + ret = PTR_ERR(state->cycles); + pr_err("Failed to create cycles event for CPU %d\n", cpu); + state->cycles = NULL; + goto error; + } - // // Setup instructions event - // memset(&attr, 0, sizeof(attr)); - // attr.type = PERF_TYPE_HARDWARE; - // attr.config = PERF_COUNT_HW_INSTRUCTIONS; - // attr.size = sizeof(attr); - // attr.disabled = 0; + // Setup instructions event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_HARDWARE; + attr.config = PERF_COUNT_HW_INSTRUCTIONS; + attr.size = sizeof(attr); + attr.disabled = 0; - // state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - // if (IS_ERR(state->instructions)) { - // ret = PTR_ERR(state->instructions); - // pr_err("Failed to create instructions event for CPU %d\n", cpu); - // state->instructions = NULL; - // goto error; - // } + state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); + if (IS_ERR(state->instructions)) { + ret = PTR_ERR(state->instructions); + pr_err("Failed to create instructions event for CPU %d\n", cpu); + state->instructions = NULL; + goto error; + } - // // Setup context switch event - // memset(&attr, 0, sizeof(attr)); - // attr.type = PERF_TYPE_SOFTWARE; - // attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; - // attr.size = sizeof(attr); - // attr.disabled = 0; - // attr.sample_period = 1; + // Setup context switch event + memset(&attr, 0, sizeof(attr)); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + attr.size = sizeof(attr); + attr.disabled = 0; + attr.sample_period = 1; - // state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, - // context_switch_handler, - // NULL); - // if (IS_ERR(state->ctx_switch)) { - // ret = PTR_ERR(state->ctx_switch); - // pr_err("Failed to create context switch event for CPU %d\n", cpu); - // state->ctx_switch = NULL; - // goto error; - // } + state->ctx_switch = perf_event_create_kernel_counter(&attr, cpu, NULL, + context_switch_handler, + NULL); + if (IS_ERR(state->ctx_switch)) { + ret = PTR_ERR(state->ctx_switch); + pr_err("Failed to create context switch event for CPU %d\n", cpu); + state->ctx_switch = NULL; + goto error; + } // Initialize and start the timer (moved from start_cpu_timer) hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); @@ -196,7 +196,7 @@ static void cleanup_cpu(int cpu) pr_debug("cleanup_cpu for CPU %d\n", cpu); hrtimer_cancel(&state->timer); - + if (state->ctx_switch) { perf_event_release_kernel(state->ctx_switch); state->ctx_switch = NULL; From d2e3272929be02506f4186fb85e9758dcb913344 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:27:00 +0000 Subject: [PATCH 68/88] best effort to set up perf timers --- module/memory_collector.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 259b703..b6245f8 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -118,9 +118,8 @@ static void init_cpu_state(struct work_struct *work) state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); if (IS_ERR(state->llc_miss)) { ret = PTR_ERR(state->llc_miss); - pr_err("Failed to create LLC miss event for CPU %d\n", cpu); + pr_err("Failed to create LLC miss event for CPU %d: error %d\n", cpu, ret); state->llc_miss = NULL; - goto error; } // Setup cycles event @@ -133,9 +132,8 @@ static void init_cpu_state(struct work_struct *work) state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); if (IS_ERR(state->cycles)) { ret = PTR_ERR(state->cycles); - pr_err("Failed to create cycles event for CPU %d\n", cpu); + pr_err("Failed to create cycles event for CPU %d: error %d\n", cpu, ret); state->cycles = NULL; - goto error; } // Setup instructions event @@ -148,9 +146,8 @@ static void init_cpu_state(struct work_struct *work) state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); if (IS_ERR(state->instructions)) { ret = PTR_ERR(state->instructions); - pr_err("Failed to create instructions event for CPU %d\n", cpu); + pr_err("Failed to create instructions event for CPU %d: error %d\n", cpu, ret); state->instructions = NULL; - goto error; } // Setup context switch event @@ -166,9 +163,8 @@ static void init_cpu_state(struct work_struct *work) NULL); if (IS_ERR(state->ctx_switch)) { ret = PTR_ERR(state->ctx_switch); - pr_err("Failed to create context switch event for CPU %d\n", cpu); + pr_err("Failed to create context switch event for CPU %d: error %d\n", cpu, ret); state->ctx_switch = NULL; - goto error; } // Initialize and start the timer (moved from start_cpu_timer) @@ -182,10 +178,6 @@ static void init_cpu_state(struct work_struct *work) NSEC_PER_MSEC * NSEC_PER_MSEC); hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS_PINNED); - return; - -error: - cleanup_cpu(cpu); } // Update cleanup_cpu to clean up context switch event From 249f1d1da38994c2afc5c1916993970620170288 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:49:13 +0000 Subject: [PATCH 69/88] give kernel the time we want to reset the timer to --- module/memory_collector.c | 2 +- module/test_module.sh | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index b6245f8..1a878ff 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -221,7 +221,7 @@ static enum hrtimer_restart timer_fn(struct hrtimer *timer) (ktime_to_ns(state->next_expected) % NSEC_PER_SEC) / NSEC_PER_MSEC * NSEC_PER_MSEC); - hrtimer_forward_now(timer, state->next_expected); + hrtimer_set_expires(timer, state->next_expected); return HRTIMER_RESTART; } diff --git a/module/test_module.sh b/module/test_module.sh index 3bdbb5d..aabf126 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -40,6 +40,12 @@ head "$TRACE_OUTPUT" echo "Tail of trace report:" tail "$TRACE_OUTPUT" +echo "Head of is_context_switch=0:" +grep "is_context_switch=0" "$TRACE_OUTPUT" | head || true + +echo "Head of is_context_switch=1:" +grep "is_context_switch=1" "$TRACE_OUTPUT" | head || true + echo "Validating output..." # Check if we have any trace entries SAMPLE_COUNT=$(grep "memory_collector_sample:" "$TRACE_OUTPUT" | wc -l) From bb53370575dd377626fbfea07430ac1697ac4f97 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 16:59:35 +0000 Subject: [PATCH 70/88] remove pr_info in workqueue when there is no error, to avoid "hogged CPU" warning --- module/memory_collector.c | 1 - 1 file changed, 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 1a878ff..5b2e7cb 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -98,7 +98,6 @@ static void init_cpu_state(struct work_struct *work) } state = this_cpu_ptr(cpu_states); - pr_info("init_cpu_state for CPU %d, got work %px\n", cpu, work); state->llc_miss = NULL; state->cycles = NULL; From ff36734424c444281842a9768ef592fb87d0f9fe Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 17:01:40 +0000 Subject: [PATCH 71/88] add auto test on push to main --- .github/workflows/test-kernel-module.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 43bf292..d017f09 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -7,6 +7,9 @@ on: required: false default: 'c7i.metal-24xl' type: string + push: + branches: + - main permissions: id-token: write # Required for requesting the JWT @@ -33,7 +36,7 @@ jobs: mode: start github-token: ${{ secrets.REPO_ADMIN_TOKEN }} ec2-image-id: ami-0884d2865dbe9de4b # Ubuntu 22.04 LTS in us-east-2 - ec2-instance-type: ${{ inputs.instance-type }} + ec2-instance-type: ${{ inputs.instance-type || 'c5.9xlarge' }} market-type: spot subnet-id: ${{ secrets.AWS_SUBNET_ID }} security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }} From c4d9c74829ebb0f8484d4e19ae3f699e497c0a7a Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 17:38:27 +0000 Subject: [PATCH 72/88] use perf_event_read_local (rather than the non-local variant) --- module/memory_collector.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 5b2e7cb..a6edbc4 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -47,6 +47,7 @@ static void collect_sample_on_current_cpu(bool is_context_switch) u32 cpu; u64 llc_misses = 0, cycles = 0, instructions = 0; u64 enabled, running; + int ret; timestamp = ktime_get_ns(); cpu = smp_processor_id(); @@ -54,7 +55,10 @@ static void collect_sample_on_current_cpu(bool is_context_switch) // Read LLC misses if (state->llc_miss) { - llc_misses = perf_event_read_value(state->llc_miss, &enabled, &running); + ret = perf_event_read_local(state->llc_miss, &llc_misses, &enabled, &running); + if (ret) { + llc_misses = 0; + } } // Read cycles From b38ce0e71ed1617ddd8691c3064211d6658b8f03 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 19:05:25 +0000 Subject: [PATCH 73/88] we cannot read perf counters safely at context switch or timer, planning to move those to ebpf --- module/memory_collector.c | 115 ++++---------------------------- module/memory_collector_trace.h | 15 ++--- 2 files changed, 16 insertions(+), 114 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index a6edbc4..feeeeed 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -24,9 +24,6 @@ MODULE_VERSION("1.0"); // Replace the cpu_state struct and global variable definitions struct cpu_state { - struct perf_event *llc_miss; - struct perf_event *cycles; - struct perf_event *instructions; struct perf_event *ctx_switch; struct hrtimer timer; ktime_t next_expected; @@ -43,36 +40,11 @@ static struct workqueue_struct *collector_wq; static void collect_sample_on_current_cpu(bool is_context_switch) { - u64 timestamp; - u32 cpu; - u64 llc_misses = 0, cycles = 0, instructions = 0; - u64 enabled, running; - int ret; - - timestamp = ktime_get_ns(); - cpu = smp_processor_id(); + u64 timestamp = ktime_get_ns(); + u32 cpu = smp_processor_id(); struct cpu_state *state = this_cpu_ptr(cpu_states); - // Read LLC misses - if (state->llc_miss) { - ret = perf_event_read_local(state->llc_miss, &llc_misses, &enabled, &running); - if (ret) { - llc_misses = 0; - } - } - - // Read cycles - if (state->cycles) { - cycles = perf_event_read_value(state->cycles, &enabled, &running); - } - - // Read instructions - if (state->instructions) { - instructions = perf_event_read_value(state->instructions, &enabled, &running); - } - - trace_memory_collector_sample(cpu, timestamp, current->comm, - llc_misses, cycles, instructions, is_context_switch); + trace_memory_collector_sample(cpu, timestamp, current->comm, is_context_switch); } // Add context switch handler @@ -103,56 +75,8 @@ static void init_cpu_state(struct work_struct *work) state = this_cpu_ptr(cpu_states); - state->llc_miss = NULL; - state->cycles = NULL; - state->instructions = NULL; state->ctx_switch = NULL; - // Setup LLC miss event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HW_CACHE; - attr.config = PERF_COUNT_HW_CACHE_MISSES; - attr.size = sizeof(attr); - attr.disabled = 0; - attr.exclude_kernel = 0; - attr.exclude_hv = 0; - attr.exclude_idle = 0; - - state->llc_miss = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->llc_miss)) { - ret = PTR_ERR(state->llc_miss); - pr_err("Failed to create LLC miss event for CPU %d: error %d\n", cpu, ret); - state->llc_miss = NULL; - } - - // Setup cycles event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HARDWARE; - attr.config = PERF_COUNT_HW_CPU_CYCLES; - attr.size = sizeof(attr); - attr.disabled = 0; - - state->cycles = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->cycles)) { - ret = PTR_ERR(state->cycles); - pr_err("Failed to create cycles event for CPU %d: error %d\n", cpu, ret); - state->cycles = NULL; - } - - // Setup instructions event - memset(&attr, 0, sizeof(attr)); - attr.type = PERF_TYPE_HARDWARE; - attr.config = PERF_COUNT_HW_INSTRUCTIONS; - attr.size = sizeof(attr); - attr.disabled = 0; - - state->instructions = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL); - if (IS_ERR(state->instructions)) { - ret = PTR_ERR(state->instructions); - pr_err("Failed to create instructions event for CPU %d: error %d\n", cpu, ret); - state->instructions = NULL; - } - // Setup context switch event memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_SOFTWARE; @@ -196,18 +120,6 @@ static void cleanup_cpu(int cpu) perf_event_release_kernel(state->ctx_switch); state->ctx_switch = NULL; } - if (state->llc_miss) { - perf_event_release_kernel(state->llc_miss); - state->llc_miss = NULL; - } - if (state->cycles) { - perf_event_release_kernel(state->cycles); - state->cycles = NULL; - } - if (state->instructions) { - perf_event_release_kernel(state->instructions); - state->instructions = NULL; - } } static enum hrtimer_restart timer_fn(struct hrtimer *timer) @@ -247,9 +159,6 @@ static int __init memory_collector_init(void) // Initialize the cpu_states for_each_possible_cpu(cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); - state->llc_miss = NULL; - state->cycles = NULL; - state->instructions = NULL; state->ctx_switch = NULL; hrtimer_init(&state->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); state->timer.function = timer_fn; @@ -283,15 +192,15 @@ static int __init memory_collector_init(void) cpu_works = NULL; pr_info("Memory Collector: workqueue flushed\n"); - // // Check initialization results - // pr_info("Memory Collector: checking per-cpu perf events\n"); - // for_each_possible_cpu(cpu) { - // struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); - // if (state->ctx_switch == NULL) { - // res = -ENODEV; - // goto error_cpu_init; - // } - // } + // Check initialization results + pr_info("Memory Collector: checking per-cpu perf events\n"); + for_each_possible_cpu(cpu) { + struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); + if (state->ctx_switch == NULL) { + ret = -ENODEV; + goto error_cpu_init; + } + } pr_info("Memory Collector: initializing resctrl\n"); ret = resctrl_init(); diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index 044d38e..f9d23db 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -7,17 +7,14 @@ #include TRACE_EVENT(memory_collector_sample, - TP_PROTO(u32 cpu, u64 timestamp, const char *comm, u64 llc_misses, u64 cycles, u64 instructions, bool is_context_switch), + TP_PROTO(u32 cpu, u64 timestamp, const char *comm, bool is_context_switch), - TP_ARGS(cpu, timestamp, comm, llc_misses, cycles, instructions, is_context_switch), + TP_ARGS(cpu, timestamp, comm, is_context_switch), TP_STRUCT__entry( __field(u32, cpu) __field(u64, timestamp) __array(char, comm, TASK_COMM_LEN) - __field(u64, llc_misses) - __field(u64, cycles) - __field(u64, instructions) __field(bool, is_context_switch) ), @@ -25,15 +22,11 @@ TRACE_EVENT(memory_collector_sample, __entry->cpu = cpu; __entry->timestamp = timestamp; memcpy(__entry->comm, comm, TASK_COMM_LEN); - __entry->llc_misses = llc_misses; - __entry->cycles = cycles; - __entry->instructions = instructions; __entry->is_context_switch = is_context_switch; ), - TP_printk("cpu=%u timestamp=%llu comm=%s llc_misses=%llu cycles=%llu instructions=%llu is_context_switch=%d", - __entry->cpu, __entry->timestamp, __entry->comm, - __entry->llc_misses, __entry->cycles, __entry->instructions, __entry->is_context_switch) + TP_printk("cpu=%u timestamp=%llu comm=%s is_context_switch=%d", + __entry->cpu, __entry->timestamp, __entry->comm, __entry->is_context_switch) ); #endif /* _MEMORY_COLLECTOR_TRACE_H */ From e1e8a4c1416cca7a7df7ecc8b01448072ddf361f Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 19:25:00 +0000 Subject: [PATCH 74/88] add per-cpu initialization of RDT state --- module/memory_collector.c | 14 +--- module/resctrl.c | 157 ++++++-------------------------------- module/resctrl.h | 22 ++++-- 3 files changed, 40 insertions(+), 153 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index feeeeed..62c58a5 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -27,6 +27,7 @@ struct cpu_state { struct perf_event *ctx_switch; struct hrtimer timer; ktime_t next_expected; + struct rdt_state rdt_state; }; static struct cpu_state __percpu *cpu_states; @@ -105,6 +106,8 @@ static void init_cpu_state(struct work_struct *work) NSEC_PER_MSEC * NSEC_PER_MSEC); hrtimer_start(&state->timer, state->next_expected, HRTIMER_MODE_ABS_PINNED); + + resctrl_init_cpu(&state->rdt_state); } // Update cleanup_cpu to clean up context switch event @@ -202,18 +205,9 @@ static int __init memory_collector_init(void) } } - pr_info("Memory Collector: initializing resctrl\n"); - ret = resctrl_init(); - if (ret < 0) { - pr_err("Failed to initialize resctrl: %d\n", ret); - goto error_resctrl; - } - pr_info("Memory Collector: initialization completed\n"); return 0; -error_resctrl: - resctrl_exit(); error_cpu_init: for_each_possible_cpu(cpu) { cleanup_cpu(cpu); @@ -238,8 +232,6 @@ static void __exit memory_collector_exit(void) printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); - resctrl_exit(); - for_each_possible_cpu(cpu) { cleanup_cpu(cpu); } diff --git a/module/resctrl.c b/module/resctrl.c index ed64c67..5b311b6 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -24,12 +24,6 @@ struct ipi_rmid_args { u32 rmid; int status; }; - -/* Add these declarations at the top after the includes */ -static struct timer_list cpuid_check_timer; -static atomic_t check_counter = ATOMIC_INIT(0); -#define MAX_CHECKS 6 - /* * IPI function to write RMID to MSR * Called on each CPU by on_each_cpu() @@ -52,157 +46,52 @@ static void ipi_write_rmid(void *info) } } -static int enumerate_cpuid(void) +int resctrl_init_cpu(struct rdt_state *rdt_state) { - pr_info("Memory Collector: Starting enumerate_cpuid\n"); - + int cpu = smp_processor_id(); unsigned int eax, ebx, ecx, edx; int ret = 0; + pr_debug("Memory Collector: Starting enumerate_cpuid on CPU %d\n", cpu); + + memset(rdt_state, 0, sizeof(struct rdt_state)); + if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { - pr_err("Memory Collector: CPU does not support QoS monitoring\n"); + pr_debug("Memory Collector: CPU does not support QoS monitoring\n"); return -ENODEV; } - pr_debug("memory_collector: Checking CPUID.0x7.0 for RDT support\n"); + pr_debug("Memory Collector: Checking CPUID.0x7.0 for RDT support\n"); cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); if (!(ebx & (1 << 12))) { - pr_err("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + pr_debug("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); return -ENODEV; } - pr_debug("memory_collector: Checking CPUID.0xF.0 for L3 monitoring\n"); + pr_debug("Memory Collector: Checking CPUID.0xF.0 for L3 monitoring\n"); cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 1))) { - pr_err("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + pr_debug("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); return -ENODEV; } pr_debug("Memory Collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); - if (!(edx & (1 << 0))) { - pr_err("Memory Collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); - return -ENODEV; - } - - pr_info("Memory Collector: enumerate_cpuid completed successfully\n"); + rdt_state->supports_llc_occupancy = (edx & (1 << 0)); + rdt_state->supports_mbm_total = (edx & (1 << 1)); + rdt_state->supports_mbm_local = (edx & (1 << 2)); + rdt_state->max_rmid = ecx; + rdt_state->counter_width = (eax & 0xFF) + 24; + rdt_state->has_overflow_bit = (eax & (1 << 8)); + rdt_state->supports_non_cpu_agent_cache = (eax & (1 << 8)); + rdt_state->supports_non_cpu_agent_mbm = (eax & (1 << 10)); + + pr_debug("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", + cpu, rdt_state->supports_llc_occupancy, rdt_state->supports_mbm_total, rdt_state->supports_mbm_local, rdt_state->max_rmid, rdt_state->counter_width, rdt_state->has_overflow_bit, rdt_state->supports_non_cpu_agent_cache, rdt_state->supports_non_cpu_agent_mbm); + pr_debug("Memory Collector: enumerate_cpuid completed successfully on CPU %d\n", cpu); return ret; } -static void cpuid_check_timer_callback(struct timer_list *t) -{ - int check_num = atomic_inc_return(&check_counter); - unsigned int eax, ebx, ecx, edx; - - pr_info("Memory Collector: Running CPUID check %d of %d\n", check_num, MAX_CHECKS); - - switch (check_num) { - case 1: - if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { - pr_err("Memory Collector: CPU does not support QoS monitoring\n"); - } - break; - case 2: - pr_debug("Memory Collector: Checking CPUID.0x7.0 for RDT support\n"); - cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); - if (!(ebx & (1 << 12))) { - pr_err("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); - } - break; - case 3: - pr_debug("Memory Collector: Checking CPUID.0xF.0 for L3 monitoring\n"); - cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); - if (!(edx & (1 << 1))) { - pr_err("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); - } - break; - case 4: - pr_debug("Memory Collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); - cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); - if (!(edx & (1 << 0))) { - pr_err("Memory Collector: L3 occupancy monitoring not supported (CPUID.0xF.1:EDX.0)\n"); - } - break; - default: - pr_info("Memory Collector: Completed all %d CPUID checks\n", MAX_CHECKS); - break; - } - - mod_timer(&cpuid_check_timer, jiffies + (5 * HZ)); -} - -int resctrl_init(void) -{ - int cpu; - int ret = 0; - - pr_info("Memory Collector: Initializing resctrl\n"); - - // Initialize the timer - timer_setup(&cpuid_check_timer, cpuid_check_timer_callback, 0); - - // ret = enumerate_cpuid(); - // if (ret) { - // pr_err("Memory Collector: Failed to enumerate CPUID\n"); - // return ret; - // } - - // for_each_online_cpu(cpu) { - // struct ipi_rmid_args args = { - // .rmid = cpu + 1, - // .status = 0 - // }; - - // on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); - - // if (args.status) { - // pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); - // ret = args.status; - // break; - // } - // pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); - // } - - // Start the timer for first check in 5 seconds - mod_timer(&cpuid_check_timer, jiffies + (5 * HZ)); - pr_info("Memory Collector: Started periodic CPUID checks\n"); - - return ret; -} - -int resctrl_exit(void) -{ - // Delete the timer first - del_timer_sync(&cpuid_check_timer); - - int failure_count = 0; - int cpu; - - struct ipi_rmid_args args = { - .rmid = RESCTRL_RESERVED_RMID, - .status = 0 - }; - - // for_each_online_cpu(cpu) { - // on_each_cpu_mask(cpumask_of(cpu), ipi_write_rmid, &args, 1); - - // if (args.status) { - // pr_err("Memory Collector: Failed to set RMID %u on CPU %d\n", args.rmid, cpu); - // failure_count++; - // continue; - // } - // pr_info("Memory Collector: Successfully set RMID %u on CPU %d\n", args.rmid, cpu); - // } - - if (failure_count > 0) { - pr_err("Memory Collector: Failed to reset RMIDs to default on %d CPUs\n", failure_count); - return -EIO; - } - - pr_info("Memory Collector: Successfully reset all RMIDs to default\n"); - return 0; -} - int read_rmid_mbm(u32 rmid, u64 *val) { int err; diff --git a/module/resctrl.h b/module/resctrl.h index 484f01b..73e61c8 100644 --- a/module/resctrl.h +++ b/module/resctrl.h @@ -2,19 +2,25 @@ #define _COLLECTOR_RESCTRL_H_ -/* Function declarations */ +// per-CPU state for RDT +struct rdt_state { + bool supports_llc_occupancy; + bool supports_mbm_total; + bool supports_mbm_local; + bool has_overflow_bit; + bool supports_non_cpu_agent_mbm; + bool supports_non_cpu_agent_cache; + u32 max_rmid; + u32 counter_width; +}; -/* - * Initialize RMIDs per CPU via IPI - * Returns 0 on success, negative error code on failure - */ -int resctrl_init(void); +/* Function declarations */ /* - * Reset all CPU RMIDs to default via IPI + * Initialize RDT state for given CPU * Returns 0 on success, negative error code on failure */ -int resctrl_exit(void); +int resctrl_init_cpu(struct rdt_state *rdt_state); /* * Read memory bandwidth counter for given RMID From 8301f36fe2b70c15a8199c17abcb29faf63214e0 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 19:25:32 +0000 Subject: [PATCH 75/88] switch to RDT-capable VMs for tests --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index d017f09..e06d394 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -36,7 +36,7 @@ jobs: mode: start github-token: ${{ secrets.REPO_ADMIN_TOKEN }} ec2-image-id: ami-0884d2865dbe9de4b # Ubuntu 22.04 LTS in us-east-2 - ec2-instance-type: ${{ inputs.instance-type || 'c5.9xlarge' }} + ec2-instance-type: ${{ inputs.instance-type || 'c7i.metal-24xl' }} market-type: spot subnet-id: ${{ secrets.AWS_SUBNET_ID }} security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }} From 79d3ac9b1a6649337d05334967468296c1e23857 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 19:29:51 +0000 Subject: [PATCH 76/88] log resctrl capabilities as pr_info --- module/resctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/resctrl.c b/module/resctrl.c index 5b311b6..0708402 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -86,7 +86,7 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) rdt_state->supports_non_cpu_agent_cache = (eax & (1 << 8)); rdt_state->supports_non_cpu_agent_mbm = (eax & (1 << 10)); - pr_debug("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", + pr_info("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", cpu, rdt_state->supports_llc_occupancy, rdt_state->supports_mbm_total, rdt_state->supports_mbm_local, rdt_state->max_rmid, rdt_state->counter_width, rdt_state->has_overflow_bit, rdt_state->supports_non_cpu_agent_cache, rdt_state->supports_non_cpu_agent_mbm); pr_debug("Memory Collector: enumerate_cpuid completed successfully on CPU %d\n", cpu); return ret; From a5d431ea0244ce44c91b24124d047e19d0891a96 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 19:58:57 +0000 Subject: [PATCH 77/88] change default VM type due to lack of inventory --- .github/workflows/test-kernel-module.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index e06d394..2c517a9 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -36,7 +36,7 @@ jobs: mode: start github-token: ${{ secrets.REPO_ADMIN_TOKEN }} ec2-image-id: ami-0884d2865dbe9de4b # Ubuntu 22.04 LTS in us-east-2 - ec2-instance-type: ${{ inputs.instance-type || 'c7i.metal-24xl' }} + ec2-instance-type: ${{ inputs.instance-type || 'm7i.metal-24xl' }} market-type: spot subnet-id: ${{ secrets.AWS_SUBNET_ID }} security-group-id: ${{ secrets.AWS_SECURITY_GROUP_ID }} From 31d1cfe7ecba36d68855a29ca5598bd098dac484 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 20:01:18 +0000 Subject: [PATCH 78/88] read MBM values on every timer tick --- module/memory_collector.c | 2 ++ module/memory_collector_trace.h | 21 +++++++++++++++++++++ module/resctrl.c | 23 +++++++++++++++++++++++ module/resctrl.h | 5 +++++ module/run_module.sh | 2 +- module/test_module.sh | 9 ++++++--- 6 files changed, 58 insertions(+), 4 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 62c58a5..d73498d 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -46,6 +46,8 @@ static void collect_sample_on_current_cpu(bool is_context_switch) struct cpu_state *state = this_cpu_ptr(cpu_states); trace_memory_collector_sample(cpu, timestamp, current->comm, is_context_switch); + + resctrl_timer_tick(&state->rdt_state); } // Add context switch handler diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index f9d23db..7d04ce3 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -29,6 +29,27 @@ TRACE_EVENT(memory_collector_sample, __entry->cpu, __entry->timestamp, __entry->comm, __entry->is_context_switch) ); +TRACE_EVENT(memory_collector_mbm_total, + TP_PROTO(u32 rmid, u64 timestamp, u64 val, int err), + TP_ARGS(rmid, timestamp, val, err), + TP_STRUCT__entry( + __field(u32, rmid) + __field(u64, timestamp) + __field(u64, val) + __field(int, err) + ), + + TP_fast_assign( + __entry->rmid = rmid; + __entry->timestamp = timestamp; + __entry->val = val; + __entry->err = err; + ), + + TP_printk("rmid=%u timestamp=%llu val=%llu err=%d", + __entry->rmid, __entry->timestamp, __entry->val, __entry->err) +); + #endif /* _MEMORY_COLLECTOR_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/module/resctrl.c b/module/resctrl.c index 0708402..1d5adb2 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -9,6 +9,7 @@ #include #include "resctrl.h" +#include "memory_collector_trace.h" MODULE_LICENSE("GPL"); @@ -46,6 +47,28 @@ static void ipi_write_rmid(void *info) } } +void resctrl_timer_tick(struct rdt_state *rdt_state) +{ + int cpu = smp_processor_id(); + u64 now = ktime_get_ns(); + int mbm_total_err = 0; + u64 mbm_total_val = 0; + + // for now, just output the first 4 RMID, on CPUs 0..3 + if (cpu > 4) { + return; + } + + // if we support mbm, read it on this CPU + if (rdt_state->supports_mbm_total) { + mbm_total_err = read_rmid_mbm(cpu, &mbm_total_val); + } else { + mbm_total_err = -ENODEV; + } + + trace_memory_collector_mbm_total(cpu, now, mbm_total_val, mbm_total_err); +} + int resctrl_init_cpu(struct rdt_state *rdt_state) { int cpu = smp_processor_id(); diff --git a/module/resctrl.h b/module/resctrl.h index 73e61c8..7be338b 100644 --- a/module/resctrl.h +++ b/module/resctrl.h @@ -22,6 +22,11 @@ struct rdt_state { */ int resctrl_init_cpu(struct rdt_state *rdt_state); +/* + * Read memory bandwidth counter for given RMID and output to trace + */ +void resctrl_timer_tick(struct rdt_state *rdt_state); + /* * Read memory bandwidth counter for given RMID * val is set to the bandwidth value on success diff --git a/module/run_module.sh b/module/run_module.sh index 8b4547c..2a65468 100755 --- a/module/run_module.sh +++ b/module/run_module.sh @@ -12,7 +12,7 @@ start_time=$(date +%s) echo "Module loaded, monitoring dmesg output..." echo "Press Ctrl+C to stop monitoring" -for i in {1..30}; do +for i in {1..5}; do # Get new dmesg entries and timestamp them sudo dmesg -c | while read line; do current_time=$(($(date +%s) - start_time)) diff --git a/module/test_module.sh b/module/test_module.sh index aabf126..f09a55e 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -20,7 +20,7 @@ lsmod | grep collector || { } echo "Setting up tracing..." -sudo trace-cmd start -e memory_collector_sample +sudo trace-cmd start -e memory_collector_sample -e memory_collector_mbm_total echo "Collecting samples for 1 second..." sleep 1 @@ -41,10 +41,13 @@ echo "Tail of trace report:" tail "$TRACE_OUTPUT" echo "Head of is_context_switch=0:" -grep "is_context_switch=0" "$TRACE_OUTPUT" | head || true +grep "is_context_switch=0" "$TRACE_OUTPUT" | head echo "Head of is_context_switch=1:" -grep "is_context_switch=1" "$TRACE_OUTPUT" | head || true +grep "is_context_switch=1" "$TRACE_OUTPUT" | head + +echo "Head of mbm_total:" +grep "memory_collector_mbm_total:" "$TRACE_OUTPUT" | head echo "Validating output..." # Check if we have any trace entries From 7f7c0b61631196b33b729656ca194c60b4091c3d Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 20:12:11 +0000 Subject: [PATCH 79/88] set RMID on CPU 2 --- module/resctrl.c | 13 ++++++++++++- module/test_module.sh | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 1d5adb2..0a22e44 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -109,8 +109,19 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) rdt_state->supports_non_cpu_agent_cache = (eax & (1 << 8)); rdt_state->supports_non_cpu_agent_mbm = (eax & (1 << 10)); - pr_info("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", + pr_debug("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", cpu, rdt_state->supports_llc_occupancy, rdt_state->supports_mbm_total, rdt_state->supports_mbm_local, rdt_state->max_rmid, rdt_state->counter_width, rdt_state->has_overflow_bit, rdt_state->supports_non_cpu_agent_cache, rdt_state->supports_non_cpu_agent_mbm); + + if (cpu == 2) { + u32 rmid = 1; + u32 closid = 0; + if (wrmsr_safe(MSR_IA32_PQR_ASSOC, rmid, closid) != 0) { + pr_err("Memory Collector: failed to write RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); + } else { + pr_info("Memory Collector: wrote RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); + } + } + pr_debug("Memory Collector: enumerate_cpuid completed successfully on CPU %d\n", cpu); return ret; } diff --git a/module/test_module.sh b/module/test_module.sh index f09a55e..b17a795 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -47,7 +47,7 @@ echo "Head of is_context_switch=1:" grep "is_context_switch=1" "$TRACE_OUTPUT" | head echo "Head of mbm_total:" -grep "memory_collector_mbm_total:" "$TRACE_OUTPUT" | head +grep "memory_collector_mbm_total:" "$TRACE_OUTPUT" | head -n 20 echo "Validating output..." # Check if we have any trace entries From cc52614a8ef6fe72dec5975ad1155f0237d29707 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 20:18:14 +0000 Subject: [PATCH 80/88] fix event selection in mbm read --- module/resctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 0a22e44..019b082 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -131,8 +131,8 @@ int read_rmid_mbm(u32 rmid, u64 *val) int err; err = wrmsr_safe(MSR_IA32_QM_EVTSEL, - rmid, - QOS_L3_MBM_TOTAL_EVENT_ID); + QOS_L3_MBM_TOTAL_EVENT_ID, + rmid); if (err) return err; From 60da7ef2ecd0792949d6363a1041d6b8a5da0d1e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 20:24:41 +0000 Subject: [PATCH 81/88] generalize resctrl read function --- module/resctrl.c | 6 +++--- module/resctrl.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index 019b082..e2f2fa2 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -61,7 +61,7 @@ void resctrl_timer_tick(struct rdt_state *rdt_state) // if we support mbm, read it on this CPU if (rdt_state->supports_mbm_total) { - mbm_total_err = read_rmid_mbm(cpu, &mbm_total_val); + mbm_total_err = read_resctrl_value(cpu, QOS_L3_MBM_TOTAL_EVENT_ID, &mbm_total_val); } else { mbm_total_err = -ENODEV; } @@ -126,12 +126,12 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) return ret; } -int read_rmid_mbm(u32 rmid, u64 *val) +int read_resctrl_value(u32 rmid, u32 event_id, u64 *val) { int err; err = wrmsr_safe(MSR_IA32_QM_EVTSEL, - QOS_L3_MBM_TOTAL_EVENT_ID, + event_id, rmid); if (err) return err; diff --git a/module/resctrl.h b/module/resctrl.h index 7be338b..7f081bf 100644 --- a/module/resctrl.h +++ b/module/resctrl.h @@ -28,11 +28,11 @@ int resctrl_init_cpu(struct rdt_state *rdt_state); void resctrl_timer_tick(struct rdt_state *rdt_state); /* - * Read memory bandwidth counter for given RMID - * val is set to the bandwidth value on success + * Read RDT counter for given RMID and event ID + * val is set to the counter value on success * Returns -EIO if error occurred * Returns -EINVAL if data unavailable */ -int read_rmid_mbm(u32 rmid, u64 *val); +int read_resctrl_value(u32 rmid, u32 event_id, u64 *val); #endif /* _COLLECTOR_RESCTRL_H_ */ \ No newline at end of file From ead889543bd5024da043a877554bd146868aeee9 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 20:29:56 +0000 Subject: [PATCH 82/88] add llc and local mbm to resctrl measurements --- module/memory_collector_trace.h | 26 +++++++++++++++++--------- module/resctrl.c | 20 +++++++++++++++++++- module/test_module.sh | 6 +++--- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/module/memory_collector_trace.h b/module/memory_collector_trace.h index 7d04ce3..929a24d 100644 --- a/module/memory_collector_trace.h +++ b/module/memory_collector_trace.h @@ -29,25 +29,33 @@ TRACE_EVENT(memory_collector_sample, __entry->cpu, __entry->timestamp, __entry->comm, __entry->is_context_switch) ); -TRACE_EVENT(memory_collector_mbm_total, - TP_PROTO(u32 rmid, u64 timestamp, u64 val, int err), - TP_ARGS(rmid, timestamp, val, err), +TRACE_EVENT(memory_collector_resctrl, + TP_PROTO(u32 rmid, u64 timestamp, u64 llc_occupancy_val, int llc_occupancy_err, u64 mbm_total_val, int mbm_total_err, u64 mbm_local_val, int mbm_local_err), + TP_ARGS(rmid, timestamp, llc_occupancy_val, llc_occupancy_err, mbm_total_val, mbm_total_err, mbm_local_val, mbm_local_err), TP_STRUCT__entry( __field(u32, rmid) __field(u64, timestamp) - __field(u64, val) - __field(int, err) + __field(u64, llc_occupancy_val) + __field(int, llc_occupancy_err) + __field(u64, mbm_total_val) + __field(int, mbm_total_err) + __field(u64, mbm_local_val) + __field(int, mbm_local_err) ), TP_fast_assign( __entry->rmid = rmid; __entry->timestamp = timestamp; - __entry->val = val; - __entry->err = err; + __entry->llc_occupancy_val = llc_occupancy_val; + __entry->llc_occupancy_err = llc_occupancy_err; + __entry->mbm_total_val = mbm_total_val; + __entry->mbm_total_err = mbm_total_err; + __entry->mbm_local_val = mbm_local_val; + __entry->mbm_local_err = mbm_local_err; ), - TP_printk("rmid=%u timestamp=%llu val=%llu err=%d", - __entry->rmid, __entry->timestamp, __entry->val, __entry->err) + TP_printk("rmid=%u timestamp=%llu llc_occupancy_val=%llu llc_occupancy_err=%d mbm_total_val=%llu mbm_total_err=%d mbm_local_val=%llu mbm_local_err=%d", + __entry->rmid, __entry->timestamp, __entry->llc_occupancy_val, __entry->llc_occupancy_err, __entry->mbm_total_val, __entry->mbm_total_err, __entry->mbm_local_val, __entry->mbm_local_err) ); #endif /* _MEMORY_COLLECTOR_TRACE_H */ diff --git a/module/resctrl.c b/module/resctrl.c index e2f2fa2..e527a15 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -51,14 +51,25 @@ void resctrl_timer_tick(struct rdt_state *rdt_state) { int cpu = smp_processor_id(); u64 now = ktime_get_ns(); + int llc_occupancy_err = 0; + u64 llc_occupancy_val = 0; int mbm_total_err = 0; u64 mbm_total_val = 0; + int mbm_local_err = 0; + u64 mbm_local_val = 0; // for now, just output the first 4 RMID, on CPUs 0..3 if (cpu > 4) { return; } + // if we support cache, read it on this CPU + if (rdt_state->supports_llc_occupancy) { + llc_occupancy_err = read_resctrl_value(cpu, QOS_L3_OCCUP_EVENT_ID, &llc_occupancy_val); + } else { + llc_occupancy_err = -ENODEV; + } + // if we support mbm, read it on this CPU if (rdt_state->supports_mbm_total) { mbm_total_err = read_resctrl_value(cpu, QOS_L3_MBM_TOTAL_EVENT_ID, &mbm_total_val); @@ -66,7 +77,14 @@ void resctrl_timer_tick(struct rdt_state *rdt_state) mbm_total_err = -ENODEV; } - trace_memory_collector_mbm_total(cpu, now, mbm_total_val, mbm_total_err); + // if we support mbm local, read it on this CPU + if (rdt_state->supports_mbm_local) { + mbm_local_err = read_resctrl_value(cpu, QOS_L3_MBM_LOCAL_EVENT_ID, &mbm_local_val); + } else { + mbm_local_err = -ENODEV; + } + + trace_memory_collector_resctrl(cpu, now, llc_occupancy_val, llc_occupancy_err, mbm_total_val, mbm_total_err, mbm_local_val, mbm_local_err); } int resctrl_init_cpu(struct rdt_state *rdt_state) diff --git a/module/test_module.sh b/module/test_module.sh index b17a795..6dd8fa7 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -20,7 +20,7 @@ lsmod | grep collector || { } echo "Setting up tracing..." -sudo trace-cmd start -e memory_collector_sample -e memory_collector_mbm_total +sudo trace-cmd start -e memory_collector_sample -e memory_collector_resctrl echo "Collecting samples for 1 second..." sleep 1 @@ -46,8 +46,8 @@ grep "is_context_switch=0" "$TRACE_OUTPUT" | head echo "Head of is_context_switch=1:" grep "is_context_switch=1" "$TRACE_OUTPUT" | head -echo "Head of mbm_total:" -grep "memory_collector_mbm_total:" "$TRACE_OUTPUT" | head -n 20 +echo "Head of resctrl:" +grep "memory_collector_resctrl:" "$TRACE_OUTPUT" | head -n 20 echo "Validating output..." # Check if we have any trace entries From 115f2fdd4fcbccc89ecdaaf3104b8e357841468c Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 21:41:20 +0000 Subject: [PATCH 83/88] polish module test github action --- .github/workflows/test-kernel-module.yaml | 40 ++++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 2c517a9..3519a51 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -5,11 +5,13 @@ on: instance-type: description: 'EC2 instance type to use' required: false - default: 'c7i.metal-24xl' + default: 'm7i.metal-24xl' type: string push: branches: - main + paths: + - module/** permissions: id-token: write # Required for requesting the JWT @@ -146,11 +148,6 @@ jobs: # we do not unmount, maybe mounting affects the intel_cqm checks below #sudo umount /sys/fs/resctrl || true - - name: Run the interval dmesg reader - working-directory: module - run: | - ./run_module.sh - - name: Load and test module id: load-and-test-module continue-on-error: true @@ -162,25 +159,39 @@ jobs: sudo objdump -d build/collector.ko | grep undefined || true # Load module + echo "insmod build/collector.ko:" sudo insmod build/collector.ko # Verify module is loaded + echo "lsmod | grep collector:" lsmod | grep collector # Check kernel logs for module initialization - dmesg | grep "Memory Collector" || true + echo "dmesg | grep 'Memory Collector':" + dmesg -c | grep "Memory Collector" || true # Unload module + echo "rmmod collector:" sudo rmmod collector # Verify module unloaded successfully + echo "lsmod | grep collector:" + lsmod | grep collector if lsmod | grep -q collector; then echo "Error: Module still loaded" exit 1 fi # Check kernel logs for cleanup message - dmesg | grep "Memory Collector" || true + echo "dmesg | grep 'Memory Collector':" + dmesg -c | grep "Memory Collector" || true + + - name: Check dmesg on failure + if: steps.load-and-test-module.outcome == 'failure' + run: | + echo "load and test module failed, showing last kernel messages:" + sudo dmesg | tail -n 100 + exit 1 - name: Install trace dependencies run: | @@ -189,15 +200,12 @@ jobs: - name: Run module test script working-directory: module run: | + # run 10 times in quick succession to stress-test insmod/rmmod and collector + for i in {1..10}; do + echo "*** Run $i:" + ./test_module.sh + done - ./test_module.sh - - - name: Check dmesg on failure - if: steps.load-and-test-module.outcome == 'failure' - run: | - echo "load and test module failed, showing last kernel messages:" - sudo dmesg | tail -n 100 - exit 1 stop-runner: name: Stop EC2 runner From d28b92784b48c5f7bb58e16fce5aec8fccf97422 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 21:42:31 +0000 Subject: [PATCH 84/88] delete the simple runner, now we do not need to aggressively read kernel logs --- module/run_module.sh | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100755 module/run_module.sh diff --git a/module/run_module.sh b/module/run_module.sh deleted file mode 100755 index 2a65468..0000000 --- a/module/run_module.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -# Clear dmesg buffer -sudo dmesg -c > /dev/null - -# Load the module -sudo insmod build/collector.ko - -# Store the timestamp when we started -start_time=$(date +%s) - -echo "Module loaded, monitoring dmesg output..." -echo "Press Ctrl+C to stop monitoring" - -for i in {1..5}; do - # Get new dmesg entries and timestamp them - sudo dmesg -c | while read line; do - current_time=$(($(date +%s) - start_time)) - echo "[$current_time sec] $line" - done - - sleep 1 -done - -sudo rmmod collector From f6dec89bd8d2a44b3581576b2d6c6258d91cc151 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 21:51:56 +0000 Subject: [PATCH 85/88] fix error code when verifying unload --- .github/workflows/test-kernel-module.yaml | 2 +- module/test_module.sh | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-kernel-module.yaml b/.github/workflows/test-kernel-module.yaml index 3519a51..2fda945 100644 --- a/.github/workflows/test-kernel-module.yaml +++ b/.github/workflows/test-kernel-module.yaml @@ -176,7 +176,7 @@ jobs: # Verify module unloaded successfully echo "lsmod | grep collector:" - lsmod | grep collector + ! lsmod | grep collector if lsmod | grep -q collector; then echo "Error: Module still loaded" exit 1 diff --git a/module/test_module.sh b/module/test_module.sh index 6dd8fa7..b5e19a5 100755 --- a/module/test_module.sh +++ b/module/test_module.sh @@ -63,6 +63,10 @@ fi echo "Unloading module..." sudo rmmod collector +# Verify module unloaded successfully +echo "Verifying unload..." +! lsmod | grep collector + echo "Cleaning up trace..." sudo trace-cmd reset From 23371a035b223cab4d2a86fae4b6e2c010bd4a76 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 21:52:37 +0000 Subject: [PATCH 86/88] use function to write to the PQR_ASSOC MSR --- module/resctrl.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/module/resctrl.c b/module/resctrl.c index e527a15..b8256b0 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -26,25 +26,11 @@ struct ipi_rmid_args { int status; }; /* - * IPI function to write RMID to MSR - * Called on each CPU by on_each_cpu() + * write RMID and CLOSID to MSR */ -static void ipi_write_rmid(void *info) +static int write_rmid_closid(u32 rmid, u32 closid) { - struct ipi_rmid_args *args = info; - u32 closid = 0; - - // if we're not on CPU 2, don't do anything - if (smp_processor_id() != 2) { - args->status = 0; - return; - } - - if (wrmsr_safe(MSR_IA32_PQR_ASSOC, args->rmid, closid) != 0) { - args->status = -EIO; - } else { - args->status = 0; - } + return wrmsr_safe(MSR_IA32_PQR_ASSOC, rmid, closid); } void resctrl_timer_tick(struct rdt_state *rdt_state) @@ -133,7 +119,7 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) if (cpu == 2) { u32 rmid = 1; u32 closid = 0; - if (wrmsr_safe(MSR_IA32_PQR_ASSOC, rmid, closid) != 0) { + if (write_rmid_closid(rmid, closid) != 0) { pr_err("Memory Collector: failed to write RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); } else { pr_info("Memory Collector: wrote RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); From d495d86693e0c3d807a895170f3d20695c665901 Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 22:05:26 +0000 Subject: [PATCH 87/88] cleanup logging and remove unneeded clauses --- module/memory_collector.c | 24 +++++++++++------------- module/resctrl.c | 24 ++++++++++++------------ 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index d73498d..35ac70f 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -14,9 +14,7 @@ MODULE_AUTHOR("Memory Collector Project"); MODULE_DESCRIPTION("Memory subsystem monitoring for Kubernetes"); MODULE_VERSION("1.0"); -#ifndef CONFIG_PERF_EVENTS -#error "This module requires CONFIG_PERF_EVENTS" -#endif +#define LOG_PREFIX "Memory Collector: " // Define the tracepoint #define CREATE_TRACE_POINTS @@ -71,7 +69,7 @@ static void init_cpu_state(struct work_struct *work) // Verify this work matches the expected work for this CPU if (work != expected_work) { - pr_err("CPU mismatch in init_cpu_state. On CPU %d, expected work %px, got %px\n", + pr_err(LOG_PREFIX "CPU mismatch in init_cpu_state. On CPU %d, expected work %px, got %px\n", cpu, expected_work, work); return; } @@ -93,7 +91,7 @@ static void init_cpu_state(struct work_struct *work) NULL); if (IS_ERR(state->ctx_switch)) { ret = PTR_ERR(state->ctx_switch); - pr_err("Failed to create context switch event for CPU %d: error %d\n", cpu, ret); + pr_err(LOG_PREFIX "Failed to create context switch event for CPU %d: error %d\n", cpu, ret); state->ctx_switch = NULL; } @@ -117,7 +115,7 @@ static void cleanup_cpu(int cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); - pr_debug("cleanup_cpu for CPU %d\n", cpu); + pr_debug(LOG_PREFIX "cleanup_cpu for CPU %d\n", cpu); hrtimer_cancel(&state->timer); @@ -150,7 +148,7 @@ static int __init memory_collector_init(void) { int cpu, ret; - printk(KERN_INFO "Memory Collector: initializing\n"); + pr_info(LOG_PREFIX "initializing\n"); cpu_works = NULL; @@ -184,7 +182,7 @@ static int __init memory_collector_init(void) } // Initialize and queue work for each CPU - pr_info("Memory Collector: initializing per-cpu perf events\n"); + pr_info(LOG_PREFIX "initializing per-cpu perf events\n"); for_each_online_cpu(cpu) { struct work_struct *work = per_cpu_ptr(cpu_works, cpu); INIT_WORK(work, init_cpu_state); @@ -195,10 +193,10 @@ static int __init memory_collector_init(void) flush_workqueue(collector_wq); free_percpu(cpu_works); cpu_works = NULL; - pr_info("Memory Collector: workqueue flushed\n"); + pr_info(LOG_PREFIX "workqueue flushed\n"); // Check initialization results - pr_info("Memory Collector: checking per-cpu perf events\n"); + pr_info(LOG_PREFIX "checking per-cpu perf events\n"); for_each_possible_cpu(cpu) { struct cpu_state *state = per_cpu_ptr(cpu_states, cpu); if (state->ctx_switch == NULL) { @@ -207,7 +205,7 @@ static int __init memory_collector_init(void) } } - pr_info("Memory Collector: initialization completed\n"); + pr_info(LOG_PREFIX "initialization completed\n"); return 0; error_cpu_init: @@ -223,7 +221,7 @@ static int __init memory_collector_init(void) error_wq: free_percpu(cpu_states); error_alloc: - pr_err("Memory Collector: initialization failed, ret = %d\n", ret); + pr_err(LOG_PREFIX "initialization failed, ret = %d\n", ret); return ret; } @@ -232,7 +230,7 @@ static void __exit memory_collector_exit(void) { int cpu; - printk(KERN_INFO "Memory Collector: unregistering PMU module\n"); + pr_info(LOG_PREFIX "unregistering PMU module\n"); for_each_possible_cpu(cpu) { cleanup_cpu(cpu); diff --git a/module/resctrl.c b/module/resctrl.c index b8256b0..ecb9e14 100644 --- a/module/resctrl.c +++ b/module/resctrl.c @@ -11,7 +11,7 @@ #include "resctrl.h" #include "memory_collector_trace.h" -MODULE_LICENSE("GPL"); +#define LOG_PREFIX "Memory Collector: " #ifndef RESCTRL_RESERVED_RMID #define RESCTRL_RESERVED_RMID 0 @@ -79,30 +79,30 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) unsigned int eax, ebx, ecx, edx; int ret = 0; - pr_debug("Memory Collector: Starting enumerate_cpuid on CPU %d\n", cpu); + pr_debug(LOG_PREFIX "Starting enumerate_cpuid on CPU %d\n", cpu); memset(rdt_state, 0, sizeof(struct rdt_state)); if (!boot_cpu_has( X86_FEATURE_CQM_LLC)) { - pr_debug("Memory Collector: CPU does not support QoS monitoring\n"); + pr_debug(LOG_PREFIX "CPU does not support QoS monitoring\n"); return -ENODEV; } - pr_debug("Memory Collector: Checking CPUID.0x7.0 for RDT support\n"); + pr_debug(LOG_PREFIX "Checking CPUID.0x7.0 for RDT support\n"); cpuid_count(0x7, 0, &eax, &ebx, &ecx, &edx); if (!(ebx & (1 << 12))) { - pr_debug("Memory Collector: RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); + pr_debug(LOG_PREFIX "RDT monitoring not supported (CPUID.0x7.0:EBX.12)\n"); return -ENODEV; } - pr_debug("Memory Collector: Checking CPUID.0xF.0 for L3 monitoring\n"); + pr_debug(LOG_PREFIX "Checking CPUID.0xF.0 for L3 monitoring\n"); cpuid_count(0xF, 0, &eax, &ebx, &ecx, &edx); if (!(edx & (1 << 1))) { - pr_debug("Memory Collector: L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); + pr_debug(LOG_PREFIX "L3 monitoring not supported (CPUID.0xF.0:EDX.1)\n"); return -ENODEV; } - pr_debug("Memory Collector: Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); + pr_debug(LOG_PREFIX "Checking CPUID.0xF.1 for L3 occupancy monitoring\n"); cpuid_count(0xF, 1, &eax, &ebx, &ecx, &edx); rdt_state->supports_llc_occupancy = (edx & (1 << 0)); rdt_state->supports_mbm_total = (edx & (1 << 1)); @@ -113,20 +113,20 @@ int resctrl_init_cpu(struct rdt_state *rdt_state) rdt_state->supports_non_cpu_agent_cache = (eax & (1 << 8)); rdt_state->supports_non_cpu_agent_mbm = (eax & (1 << 10)); - pr_debug("Memory Collector: capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", + pr_debug(LOG_PREFIX "capabilities of core %d: llc_occupancy: %d, mbm_total: %d, mbm_local: %d, max_rmid: %d, counter_width: %d, has_overflow_bit: %d, supports_non_cpu_agent_cache: %d, supports_non_cpu_agent_mbm: %d\n", cpu, rdt_state->supports_llc_occupancy, rdt_state->supports_mbm_total, rdt_state->supports_mbm_local, rdt_state->max_rmid, rdt_state->counter_width, rdt_state->has_overflow_bit, rdt_state->supports_non_cpu_agent_cache, rdt_state->supports_non_cpu_agent_mbm); if (cpu == 2) { u32 rmid = 1; u32 closid = 0; if (write_rmid_closid(rmid, closid) != 0) { - pr_err("Memory Collector: failed to write RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); + pr_err(LOG_PREFIX "failed to write RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); } else { - pr_info("Memory Collector: wrote RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); + pr_info(LOG_PREFIX "wrote RMID %d to MSR_IA32_PQR_ASSOC\n", rmid); } } - pr_debug("Memory Collector: enumerate_cpuid completed successfully on CPU %d\n", cpu); + pr_debug(LOG_PREFIX "enumerate_cpuid completed successfully on CPU %d\n", cpu); return ret; } From a6c56e59cb718b0430ce567a5bd2a9ecda435b3e Mon Sep 17 00:00:00 2001 From: Jonathan Perry Date: Tue, 11 Feb 2025 22:08:50 +0000 Subject: [PATCH 88/88] clarify unloading printks --- module/memory_collector.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/module/memory_collector.c b/module/memory_collector.c index 35ac70f..a5f69ef 100644 --- a/module/memory_collector.c +++ b/module/memory_collector.c @@ -230,7 +230,7 @@ static void __exit memory_collector_exit(void) { int cpu; - pr_info(LOG_PREFIX "unregistering PMU module\n"); + pr_info(LOG_PREFIX "unloading\n"); for_each_possible_cpu(cpu) { cleanup_cpu(cpu); @@ -243,6 +243,8 @@ static void __exit memory_collector_exit(void) destroy_workqueue(collector_wq); free_percpu(cpu_states); + + pr_info(LOG_PREFIX "done unloading\n"); } module_init(memory_collector_init);