From bacac2175c13a9482666f49d8af8a65174c82809 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 26 Feb 2025 18:31:59 +0000 Subject: [PATCH 01/80] bpf/helpers: Refactor bpf_dynptr_read and bpf_dynptr_write Refactor bpf_dynptr_read and bpf_dynptr_write helpers: extract code into the static functions namely __bpf_dynptr_read and __bpf_dynptr_write, this allows calling these without compiler warnings. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250226183201.332713-2-mykyta.yatsenko5@gmail.com --- kernel/bpf/helpers.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 183298fc11ba4..6600aa4492ec4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1759,8 +1759,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1793,6 +1793,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1804,8 +1810,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1843,6 +1849,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, From 9d15404d055b345b161593958de459d998315261 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 26 Feb 2025 18:32:00 +0000 Subject: [PATCH 02/80] bpf/helpers: Introduce bpf_dynptr_copy kfunc Introducing bpf_dynptr_copy kfunc allowing copying data from one dynptr to another. This functionality is useful in scenarios such as capturing XDP data to a ring buffer. The implementation consists of 4 branches: * A fast branch for contiguous buffer capacity in both source and destination dynptrs * 3 branches utilizing __bpf_dynptr_read and __bpf_dynptr_write to copy data to/from non-contiguous buffer Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250226183201.332713-3-mykyta.yatsenko5@gmail.com --- kernel/bpf/helpers.c | 56 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6600aa4492ec4..5449756ba102e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -2770,6 +2770,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. + * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3218,6 +3273,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif From 8fc1834cbde0fdc127defaa62f3498d8e35f5fb8 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Wed, 26 Feb 2025 18:32:01 +0000 Subject: [PATCH 03/80] selftests/bpf: Add tests for bpf_dynptr_copy Add XDP setup type for dynptr tests, enabling testing for non-contiguous buffer. Add 2 tests: - test_dynptr_copy - verify correctness for the fast (contiguous buffer) code path. - test_dynptr_copy_xdp - verifies code paths that handle non-contiguous buffer. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250226183201.332713-4-mykyta.yatsenko5@gmail.com --- .../testing/selftests/bpf/prog_tests/dynptr.c | 21 +++ .../selftests/bpf/progs/dynptr_success.c | 123 +++++++++++++++++- 2 files changed, 139 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index b614a5272dfd6..e29cc16124c2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -10,6 +10,7 @@ enum test_setup_type { SETUP_SYSCALL_SLEEP, SETUP_SKB_PROG, SETUP_SKB_PROG_TP, + SETUP_XDP_PROG, }; static struct { @@ -18,6 +19,8 @@ static struct { } success_tests[] = { {"test_read_write", SETUP_SYSCALL_SLEEP}, {"test_dynptr_data", SETUP_SYSCALL_SLEEP}, + {"test_dynptr_copy", SETUP_SYSCALL_SLEEP}, + {"test_dynptr_copy_xdp", SETUP_XDP_PROG}, {"test_ringbuf", SETUP_SYSCALL_SLEEP}, {"test_skb_readonly", SETUP_SKB_PROG}, {"test_dynptr_skb_data", SETUP_SKB_PROG}, @@ -120,6 +123,24 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ break; } + case SETUP_XDP_PROG: + { + char data[5000]; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data, + .data_size_in = sizeof(data), + .repeat = 1, + ); + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &opts); + + if (!ASSERT_OK(err, "test_run")) + goto cleanup; + + break; + } } ASSERT_EQ(skel->bss->err, 0, "err"); diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index bfcc85686cf04..e1fba28e4a868 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -1,20 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Facebook */ +#include #include #include -#include #include #include #include "bpf_misc.h" -#include "bpf_kfuncs.h" #include "errno.h" char _license[] SEC("license") = "GPL"; int pid, err, val; -struct sample { +struct ringbuf_sample { int pid; int seq; long value; @@ -121,7 +120,7 @@ int test_dynptr_data(void *ctx) static int ringbuf_callback(__u32 index, void *data) { - struct sample *sample; + struct ringbuf_sample *sample; struct bpf_dynptr *ptr = (struct bpf_dynptr *)data; @@ -138,7 +137,7 @@ SEC("?tp/syscalls/sys_enter_nanosleep") int test_ringbuf(void *ctx) { struct bpf_dynptr ptr; - struct sample *sample; + struct ringbuf_sample *sample; if (bpf_get_current_pid_tgid() >> 32 != pid) return 0; @@ -567,3 +566,117 @@ int BPF_PROG(test_dynptr_skb_tp_btf, void *skb, void *location) return 1; } + +static inline int bpf_memcmp(const char *a, const char *b, u32 size) +{ + int i; + + bpf_for(i, 0, size) { + if (a[i] != b[i]) + return a[i] < b[i] ? -1 : 1; + } + return 0; +} + +SEC("?tp/syscalls/sys_enter_nanosleep") +int test_dynptr_copy(void *ctx) +{ + char data[] = "hello there, world!!"; + char buf[32] = {'\0'}; + __u32 sz = sizeof(data); + struct bpf_dynptr src, dst; + + bpf_ringbuf_reserve_dynptr(&ringbuf, sz, 0, &src); + bpf_ringbuf_reserve_dynptr(&ringbuf, sz, 0, &dst); + + /* Test basic case of copying contiguous memory backed dynptrs */ + err = bpf_dynptr_write(&src, 0, data, sz, 0); + err = err ?: bpf_dynptr_copy(&dst, 0, &src, 0, sz); + err = err ?: bpf_dynptr_read(buf, sz, &dst, 0, 0); + err = err ?: bpf_memcmp(data, buf, sz); + + /* Test that offsets are handled correctly */ + err = err ?: bpf_dynptr_copy(&dst, 3, &src, 5, sz - 5); + err = err ?: bpf_dynptr_read(buf, sz - 5, &dst, 3, 0); + err = err ?: bpf_memcmp(data + 5, buf, sz - 5); + + bpf_ringbuf_discard_dynptr(&src, 0); + bpf_ringbuf_discard_dynptr(&dst, 0); + return 0; +} + +SEC("xdp") +int test_dynptr_copy_xdp(struct xdp_md *xdp) +{ + struct bpf_dynptr ptr_buf, ptr_xdp; + char data[] = "qwertyuiopasdfghjkl"; + char buf[32] = {'\0'}; + __u32 len = sizeof(data); + int i, chunks = 200; + + /* ptr_xdp is backed by non-contiguous memory */ + bpf_dynptr_from_xdp(xdp, 0, &ptr_xdp); + bpf_ringbuf_reserve_dynptr(&ringbuf, len * chunks, 0, &ptr_buf); + + /* Destination dynptr is backed by non-contiguous memory */ + bpf_for(i, 0, chunks) { + err = bpf_dynptr_write(&ptr_buf, i * len, data, len, 0); + if (err) + goto out; + } + + err = bpf_dynptr_copy(&ptr_xdp, 0, &ptr_buf, 0, len * chunks); + if (err) + goto out; + + bpf_for(i, 0, chunks) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_xdp, i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + /* Source dynptr is backed by non-contiguous memory */ + __builtin_memset(buf, 0, sizeof(buf)); + bpf_for(i, 0, chunks) { + err = bpf_dynptr_write(&ptr_buf, i * len, buf, len, 0); + if (err) + goto out; + } + + err = bpf_dynptr_copy(&ptr_buf, 0, &ptr_xdp, 0, len * chunks); + if (err) + goto out; + + bpf_for(i, 0, chunks) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_buf, i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + /* Both source and destination dynptrs are backed by non-contiguous memory */ + err = bpf_dynptr_copy(&ptr_xdp, 2, &ptr_xdp, len, len * (chunks - 1)); + if (err) + goto out; + + bpf_for(i, 0, chunks - 1) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_xdp, 2 + i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + if (bpf_dynptr_copy(&ptr_xdp, 2000, &ptr_xdp, 0, len * chunks) != -E2BIG) + err = 1; + +out: + bpf_ringbuf_discard_dynptr(&ptr_buf, 0); + return XDP_DROP; +} From 27e3162a036406ee9b81eae86c43b2fec1067a39 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 27 Feb 2025 22:26:44 +0800 Subject: [PATCH 04/80] selftests/bpf: Allow auto port binding for cgroup connect Allow auto port binding for cgroup connect test to avoid binding conflict. Result: ./test_progs -a cgroup_v1v2 59 cgroup_v1v2:OK Summary: 1/0 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20250227142646.59711-2-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- .../testing/selftests/bpf/prog_tests/cgroup_v1v2.c | 13 +++++++++---- .../testing/selftests/bpf/progs/connect4_dropper.c | 4 +++- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index 64abba72ac107..37c1cc52ed98e 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -10,12 +10,18 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) { struct connect4_dropper *skel; - int fd, err = 0; + int fd, err = 0, port; skel = connect4_dropper__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) return -1; + port = get_socket_local_port(server_fd); + if (!ASSERT_GE(port, 0, "get_socket_local_port")) + return -1; + + skel->bss->port = ntohs(port); + skel->links.connect_v4_dropper = bpf_program__attach_cgroup(skel->progs.connect_v4_dropper, cgroup_fd); @@ -48,10 +54,9 @@ void test_cgroup_v1v2(void) { struct network_helper_opts opts = {}; int server_fd, client_fd, cgroup_fd; - static const int port = 60120; /* Step 1: Check base connectivity works without any BPF. */ - server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) return; client_fd = connect_to_fd_opts(server_fd, &opts); @@ -66,7 +71,7 @@ void test_cgroup_v1v2(void) cgroup_fd = test__join_cgroup("/connect_dropper"); if (!ASSERT_GE(cgroup_fd, 0, "cgroup_fd")) return; - server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) { close(cgroup_fd); return; diff --git a/tools/testing/selftests/bpf/progs/connect4_dropper.c b/tools/testing/selftests/bpf/progs/connect4_dropper.c index d3f4c5e4fb696..a3819a5d09c8f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_dropper.c +++ b/tools/testing/selftests/bpf/progs/connect4_dropper.c @@ -13,12 +13,14 @@ #define VERDICT_REJECT 0 #define VERDICT_PROCEED 1 +int port; + SEC("cgroup/connect4") int connect_v4_dropper(struct bpf_sock_addr *ctx) { if (ctx->type != SOCK_STREAM) return VERDICT_PROCEED; - if (ctx->user_port == bpf_htons(60120)) + if (ctx->user_port == bpf_htons(port)) return VERDICT_REJECT; return VERDICT_PROCEED; } From dbe7d46ed10937d15df682520d7a6adeedc3a15b Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 27 Feb 2025 22:26:45 +0800 Subject: [PATCH 05/80] selftests/bpf: Allow auto port binding for bpf nf Allow auto port binding for bpf nf test to avoid binding conflict. ./test_progs -a bpf_nf 24/1 bpf_nf/xdp-ct:OK 24/2 bpf_nf/tc-bpf-ct:OK 24/3 bpf_nf/alloc_release:OK 24/4 bpf_nf/insert_insert:OK 24/5 bpf_nf/lookup_insert:OK 24/6 bpf_nf/set_timeout_after_insert:OK 24/7 bpf_nf/set_status_after_insert:OK 24/8 bpf_nf/change_timeout_after_alloc:OK 24/9 bpf_nf/change_status_after_alloc:OK 24/10 bpf_nf/write_not_allowlisted_field:OK 24 bpf_nf:OK Summary: 1/10 PASSED, 0 SKIPPED, 0 FAILED Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20250227142646.59711-3-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/bpf_nf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index a4a1f93878d40..dbd13f8e42a7a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -72,11 +72,14 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(system(cmd), cmd)) goto end; - srv_port = (mode == TEST_XDP) ? 5005 : 5006; - srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", srv_port, TIMEOUT_MS); + srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", 0, TIMEOUT_MS); if (!ASSERT_GE(srv_fd, 0, "start_server")) goto end; + srv_port = get_socket_local_port(srv_fd); + if (!ASSERT_GE(srv_port, 0, "get_sock_local_port")) + goto end; + client_fd = connect_to_server(srv_fd); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto end; @@ -91,7 +94,7 @@ static void test_bpf_nf_ct(int mode) skel->bss->saddr = peer_addr.sin_addr.s_addr; skel->bss->sport = peer_addr.sin_port; skel->bss->daddr = peer_addr.sin_addr.s_addr; - skel->bss->dport = htons(srv_port); + skel->bss->dport = srv_port; if (mode == TEST_XDP) prog_fd = bpf_program__fd(skel->progs.nf_xdp_ct_test); From 09de329523c8b353dfeceb44a09740b40197ea89 Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Thu, 27 Feb 2025 22:26:46 +0800 Subject: [PATCH 06/80] selftests/bpf: Fixes for test_maps test BPF CI has failed 3 times in the last 24 hours. Add retry for ENOMEM. It's similar to the optimization plan: commit 2f553b032cad ("selftsets/bpf: Retry map update for non-preallocated per-cpu map") Failed CI: https://github.com/kernel-patches/bpf/actions/runs/13549227497/job/37868926343 https://github.com/kernel-patches/bpf/actions/runs/13548089029/job/37865812030 https://github.com/kernel-patches/bpf/actions/runs/13553536268/job/37883329296 selftests/bpf: Fixes for test_maps test Fork 100 tasks to 'test_update_delete' Fork 100 tasks to 'test_update_delete' Fork 100 tasks to 'test_update_delete' Fork 100 tasks to 'test_update_delete' ...... test_task_storage_map_stress_lookup:PASS test_maps: OK, 0 SKIPPED Signed-off-by: Jiayuan Chen Link: https://lore.kernel.org/r/20250227142646.59711-4-jiayuan.chen@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_maps.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 8b40e9496af16..986ce32b113a3 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1396,9 +1396,10 @@ static void test_map_stress(void) #define MAX_DELAY_US 50000 #define MIN_DELAY_RANGE_US 5000 -static bool retry_for_again_or_busy(int err) +static bool can_retry(int err) { - return (err == EAGAIN || err == EBUSY); + return (err == EAGAIN || err == EBUSY || + (err == ENOMEM && map_opts.map_flags == BPF_F_NO_PREALLOC)); } int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, @@ -1451,12 +1452,12 @@ static void test_update_delete(unsigned int fn, void *data) if (do_update) { err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES, - retry_for_again_or_busy); + can_retry); if (err) printf("error %d %d\n", err, errno); assert(err == 0); err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES, - retry_for_again_or_busy); + can_retry); if (err) printf("error %d %d\n", err, errno); assert(err == 0); From 78a8a8556040e9abde95fbd06dec83dd0ef2b2df Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Feb 2025 15:01:16 -0800 Subject: [PATCH 07/80] bpf: Allow pre-ordering for bpf cgroup progs Currently for bpf progs in a cgroup hierarchy, the effective prog array is computed from bottom cgroup to upper cgroups (post-ordering). For example, the following cgroup hierarchy root cgroup: p1, p2 subcgroup: p3, p4 have BPF_F_ALLOW_MULTI for both cgroup levels. The effective cgroup array ordering looks like p3 p4 p1 p2 and at run time, progs will execute based on that order. But in some cases, it is desirable to have root prog executes earlier than children progs (pre-ordering). For example, - prog p1 intends to collect original pkt dest addresses. - prog p3 will modify original pkt dest addresses to a proxy address for security reason. The end result is that prog p1 gets proxy address which is not what it wants. Putting p1 to every child cgroup is not desirable either as it will duplicate itself in many child cgroups. And this is exactly a use case we are encountering in Meta. To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag is specified at attachment time, the prog has higher priority and the ordering with that flag will be from top to bottom (pre-ordering). For example, in the above example, root cgroup: p1, p2 subcgroup: p3, p4 Let us say p2 and p4 are marked with BPF_F_PREORDER. The final effective array ordering will be p2 p4 p3 p1 Suggested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 33 +++++++++++++++++++++++++-------- kernel/bpf/syscall.c | 3 ++- tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7fc69083e7450..9de7adb682948 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -111,6 +111,7 @@ struct bpf_prog_list { struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u32 flags; }; int cgroup_bpf_inherit(struct cgroup *cgrp); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fff6cdb8d11a2..beac5cdf2d2c6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PREORDER (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8d..84f58f3d028a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dbd89c13dd328..d799fe8f568e1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4170,7 +4170,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fff6cdb8d11a2..beac5cdf2d2c6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1207,6 +1207,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PREORDER (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the From 42c5e6d2accf31bba4cd31f8a742d5b9e19a7d28 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 24 Feb 2025 15:01:21 -0800 Subject: [PATCH 08/80] selftests/bpf: Add selftests allowing cgroup prog pre-ordering Add a few selftests with cgroup prog pre-ordering. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20250224230121.283601-1-yonghong.song@linux.dev Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/cgroup_preorder.c | 128 ++++++++++++++++++ .../selftests/bpf/progs/cgroup_preorder.c | 41 ++++++ 2 files changed, 169 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c create mode 100644 tools/testing/selftests/bpf/progs/cgroup_preorder.c diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c new file mode 100644 index 0000000000000..d4d583872fa26 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include "cgroup_helpers.h" +#include "cgroup_preorder.skel.h" + +static int run_getsockopt_test(int cg_parent, int cg_child, int sock_fd, bool all_preorder) +{ + LIBBPF_OPTS(bpf_prog_attach_opts, opts); + enum bpf_attach_type prog_c_atype, prog_c2_atype, prog_p_atype, prog_p2_atype; + int prog_c_fd, prog_c2_fd, prog_p_fd, prog_p2_fd; + struct cgroup_preorder *skel = NULL; + struct bpf_program *prog; + __u8 *result, buf; + socklen_t optlen; + int err = 0; + + skel = cgroup_preorder__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_preorder__open_and_load")) + return 0; + + buf = 0x00; + err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1); + if (!ASSERT_OK(err, "setsockopt")) + goto close_skel; + + opts.flags = BPF_F_ALLOW_MULTI; + if (all_preorder) + opts.flags |= BPF_F_PREORDER; + prog = skel->progs.child; + prog_c_fd = bpf_program__fd(prog); + prog_c_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_c_fd, cg_child, prog_c_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-child")) + goto close_skel; + + opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER; + prog = skel->progs.child_2; + prog_c2_fd = bpf_program__fd(prog); + prog_c2_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_c2_fd, cg_child, prog_c2_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-child_2")) + goto detach_child; + + optlen = 1; + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt")) + goto detach_child_2; + + result = skel->bss->result; + if (all_preorder) + ASSERT_TRUE(result[0] == 1 && result[1] == 2, "child only"); + else + ASSERT_TRUE(result[0] == 2 && result[1] == 1, "child only"); + + skel->bss->idx = 0; + memset(result, 0, 4); + + opts.flags = BPF_F_ALLOW_MULTI; + if (all_preorder) + opts.flags |= BPF_F_PREORDER; + prog = skel->progs.parent; + prog_p_fd = bpf_program__fd(prog); + prog_p_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_p_fd, cg_parent, prog_p_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent")) + goto detach_child_2; + + opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER; + prog = skel->progs.parent_2; + prog_p2_fd = bpf_program__fd(prog); + prog_p2_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_p2_fd, cg_parent, prog_p2_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent_2")) + goto detach_parent; + + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt")) + goto detach_parent_2; + + if (all_preorder) + ASSERT_TRUE(result[0] == 3 && result[1] == 4 && result[2] == 1 && result[3] == 2, + "parent and child"); + else + ASSERT_TRUE(result[0] == 4 && result[1] == 2 && result[2] == 1 && result[3] == 3, + "parent and child"); + +detach_parent_2: + ASSERT_OK(bpf_prog_detach2(prog_p2_fd, cg_parent, prog_p2_atype), + "bpf_prog_detach2-parent_2"); +detach_parent: + ASSERT_OK(bpf_prog_detach2(prog_p_fd, cg_parent, prog_p_atype), + "bpf_prog_detach2-parent"); +detach_child_2: + ASSERT_OK(bpf_prog_detach2(prog_c2_fd, cg_child, prog_c2_atype), + "bpf_prog_detach2-child_2"); +detach_child: + ASSERT_OK(bpf_prog_detach2(prog_c_fd, cg_child, prog_c_atype), + "bpf_prog_detach2-child"); +close_skel: + cgroup_preorder__destroy(skel); + return err; +} + +void test_cgroup_preorder(void) +{ + int cg_parent = -1, cg_child = -1, sock_fd = -1; + + cg_parent = test__join_cgroup("/parent"); + if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent")) + goto out; + + cg_child = test__join_cgroup("/parent/child"); + if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child")) + goto out; + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_GE(sock_fd, 0, "socket")) + goto out; + + ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, false), "getsockopt_test_1"); + ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, true), "getsockopt_test_2"); + +out: + close(sock_fd); + close(cg_child); + close(cg_parent); +} diff --git a/tools/testing/selftests/bpf/progs/cgroup_preorder.c b/tools/testing/selftests/bpf/progs/cgroup_preorder.c new file mode 100644 index 0000000000000..4ef6202baa0a4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_preorder.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include + +char _license[] SEC("license") = "GPL"; + +unsigned int idx; +__u8 result[4]; + +SEC("cgroup/getsockopt") +int child(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 1; + return 1; +} + +SEC("cgroup/getsockopt") +int child_2(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 2; + return 1; +} + +SEC("cgroup/getsockopt") +int parent(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 3; + return 1; +} + +SEC("cgroup/getsockopt") +int parent_2(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 4; + return 1; +} From 0aaddfb06882504dded9cde57f91035ab9403b82 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 21 Feb 2025 18:44:22 -0800 Subject: [PATCH 09/80] locking/local_lock: Introduce localtry_lock_t In !PREEMPT_RT local_lock_irqsave() disables interrupts to protect critical section, but it doesn't prevent NMI, so the fully reentrant code cannot use local_lock_irqsave() for exclusive access. Introduce localtry_lock_t and localtry_lock_irqsave() that disables interrupts and sets acquired=1, so localtry_lock_irqsave() from NMI attempting to acquire the same lock will return false. In PREEMPT_RT local_lock_irqsave() maps to preemptible spin_lock(). Map localtry_lock_irqsave() to preemptible spin_trylock(). When in hard IRQ or NMI return false right away, since spin_trylock() is not safe due to explicit locking in the underneath rt_spin_trylock() implementation. Removing this explicit locking and attempting only "trylock" is undesired due to PI implications. Note there is no need to use local_inc for acquired variable, since it's a percpu variable with strict nesting scopes. Acked-by: Davidlohr Bueso Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Vlastimil Babka Link: https://lore.kernel.org/r/20250222024427.30294-2-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/local_lock.h | 70 +++++++++++++ include/linux/local_lock_internal.h | 146 ++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+) diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 091dc0b6bdfb9..1a0bc35839e36 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -51,6 +51,76 @@ #define local_unlock_irqrestore(lock, flags) \ __local_unlock_irqrestore(lock, flags) +/** + * localtry_lock_init - Runtime initialize a lock instance + */ +#define localtry_lock_init(lock) __localtry_lock_init(lock) + +/** + * localtry_lock - Acquire a per CPU local lock + * @lock: The lock variable + */ +#define localtry_lock(lock) __localtry_lock(lock) + +/** + * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts + * @lock: The lock variable + */ +#define localtry_lock_irq(lock) __localtry_lock_irq(lock) + +/** + * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable + * interrupts + * @lock: The lock variable + * @flags: Storage for interrupt flags + */ +#define localtry_lock_irqsave(lock, flags) \ + __localtry_lock_irqsave(lock, flags) + +/** + * localtry_trylock - Try to acquire a per CPU local lock. + * @lock: The lock variable + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constrains it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT. + */ +#define localtry_trylock(lock) __localtry_trylock(lock) + +/** + * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable + * interrupts if acquired + * @lock: The lock variable + * @flags: Storage for interrupt flags + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constrains it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT. + */ +#define localtry_trylock_irqsave(lock, flags) \ + __localtry_trylock_irqsave(lock, flags) + +/** + * local_unlock - Release a per CPU local lock + * @lock: The lock variable + */ +#define localtry_unlock(lock) __localtry_unlock(lock) + +/** + * local_unlock_irq - Release a per CPU local lock and enable interrupts + * @lock: The lock variable + */ +#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) + +/** + * localtry_unlock_irqrestore - Release a per CPU local lock and restore + * interrupt flags + * @lock: The lock variable + * @flags: Interrupt flags to restore + */ +#define localtry_unlock_irqrestore(lock, flags) \ + __localtry_unlock_irqrestore(lock, flags) + DEFINE_GUARD(local_lock, local_lock_t __percpu*, local_lock(_T), local_unlock(_T)) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 8dd71fbbb6d2b..67bd13d142fac 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -15,6 +15,11 @@ typedef struct { #endif } local_lock_t; +typedef struct { + local_lock_t llock; + unsigned int acquired; +} localtry_lock_t; + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ @@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l) l->owner = current; } +static inline void local_trylock_acquire(local_lock_t *l) +{ + lock_map_acquire_try(&l->dep_map); + DEBUG_LOCKS_WARN_ON(l->owner); + l->owner = current; +} + static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); @@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l) #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } +static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } +#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} #define __local_lock_init(lock) \ do { \ @@ -118,6 +132,104 @@ do { \ #define __local_unlock_nested_bh(lock) \ local_lock_release(this_cpu_ptr(lock)) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) \ +do { \ + __local_lock_init(&(lock)->llock); \ + WRITE_ONCE((lock)->acquired, 0); \ +} while (0) + +#define __localtry_lock(lock) \ + do { \ + localtry_lock_t *lt; \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + local_irq_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irqsave(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_trylock(lock) \ + ({ \ + localtry_lock_t *lt; \ + bool _ret; \ + \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + preempt_enable(); \ + } \ + _ret; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + localtry_lock_t *lt; \ + bool _ret; \ + \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + local_irq_restore(flags); \ + } \ + _ret; \ + }) + +#define __localtry_unlock(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + preempt_enable(); \ + } while (0) + +#define __localtry_unlock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_enable(); \ + } while (0) + +#define __localtry_unlock_irqrestore(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_restore(flags); \ + } while (0) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -125,8 +237,10 @@ do { \ * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; +typedef spinlock_t localtry_lock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) +#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) #define __local_lock_init(l) \ do { \ @@ -169,4 +283,36 @@ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) __local_lock_init(lock) +#define __localtry_lock(lock) __local_lock(lock) +#define __localtry_lock_irq(lock) __local_lock(lock) +#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) +#define __localtry_unlock(lock) __local_unlock(lock) +#define __localtry_unlock_irq(lock) __local_unlock(lock) +#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) + +#define __localtry_trylock(lock) \ + ({ \ + int __locked; \ + \ + if (in_nmi() | in_hardirq()) { \ + __locked = 0; \ + } else { \ + migrate_disable(); \ + __locked = spin_trylock(this_cpu_ptr((lock))); \ + if (!__locked) \ + migrate_enable(); \ + } \ + __locked; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __localtry_trylock(lock); \ + }) + #endif /* CONFIG_PREEMPT_RT */ From 97769a53f117e2f33864c587d85992ee35194ecf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:23 -0800 Subject: [PATCH 10/80] mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation Tracing BPF programs execute from tracepoints and kprobes where running context is unknown, but they need to request additional memory. The prior workarounds were using pre-allocated memory and BPF specific freelists to satisfy such allocation requests. Instead, introduce gfpflags_allow_spinning() condition that signals to the allocator that running context is unknown. Then rely on percpu free list of pages to allocate a page. try_alloc_pages() -> get_page_from_freelist() -> rmqueue() -> rmqueue_pcplist() will spin_trylock to grab the page from percpu free list. If it fails (due to re-entrancy or list being empty) then rmqueue_bulk()/rmqueue_buddy() will attempt to spin_trylock zone->lock and grab the page from there. spin_trylock() is not safe in PREEMPT_RT when in NMI or in hard IRQ. Bailout early in such case. The support for gfpflags_allow_spinning() mode for free_page and memcg comes in the next patches. This is a first step towards supporting BPF requirements in SLUB and getting rid of bpf_mem_alloc. That goal was discussed at LSFMM: https://lwn.net/Articles/974138/ Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-3-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 22 ++++++++++ lib/stackdepot.c | 5 ++- mm/internal.h | 1 + mm/page_alloc.c | 104 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 127 insertions(+), 5 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6bb1a5a7a4ae3..5d9ee78c74e4a 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) +{ + /* + * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. + * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. + * All GFP_* flags including GFP_NOWAIT use one or both flags. + * try_alloc_pages() is the only API that doesn't specify either flag. + * + * This is stronger than GFP_NOWAIT or GFP_ATOMIC because + * those are guaranteed to never block on a sleeping lock. + * Here we are enforcing that the allocation doesn't ever spin + * on any locks (i.e. only trylocks). There is no high level + * GFP_$FOO flag for this use in try_alloc_pages() as the + * regular page allocator doesn't fully support this + * allocation mode. + */ + return !(gfp_flags & __GFP_RECLAIM); +} + #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else @@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) +struct page *try_alloc_pages_noprof(int nid, unsigned int order); +#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) + extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 245d5b4166999..377194969e616 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -591,7 +591,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; - bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; + bool allow_spin = gfpflags_allow_spinning(alloc_flags); + bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin; unsigned long flags; u32 hash; @@ -630,7 +631,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, prealloc = page_address(page); } - if (in_nmi()) { + if (in_nmi() || !allow_spin) { /* We can never allocate in NMI context. */ WARN_ON_ONCE(can_alloc); /* Best effort; bail if we fail to take the lock. */ diff --git a/mm/internal.h b/mm/internal.h index 109ef30fee11f..10a8b4b3b86ec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1187,6 +1187,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ +#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 579789600a3c7..99753bb9d707e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2307,7 +2307,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long flags; int i; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return 0; + spin_lock_irqsave(&zone->lock, flags); + } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -2907,7 +2911,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return NULL; + spin_lock_irqsave(&zone->lock, flags); + } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { @@ -4511,7 +4519,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); - if (should_fail_alloc_page(gfp_mask, order)) + /* + * Don't invoke should_fail logic, since it may call + * get_random_u32() and printk() which need to spin_lock. + */ + if (!(*alloc_flags & ALLOC_TRYLOCK) && + should_fail_alloc_page(gfp_mask, order)) return false; *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); @@ -7071,3 +7084,88 @@ static bool __free_unaccepted(struct page *page) } #endif /* CONFIG_UNACCEPTED_MEMORY */ + +/** + * try_alloc_pages - opportunistic reentrant allocation from any context + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> try_alloc_pages_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. + */ +struct page *try_alloc_pages_noprof(int nid, unsigned int order) +{ + /* + * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. + * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd + * is not safe in arbitrary context. + * + * These two are the conditions for gfpflags_allow_spinning() being true. + * + * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason + * to warn. Also warn would trigger printk() which is unsafe from + * various contexts. We cannot use printk_deferred_enter() to mitigate, + * since the running context is unknown. + * + * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below + * is safe in any context. Also zeroing the page is mandatory for + * BPF use cases. + * + * Though __GFP_NOMEMALLOC is not checked in the code path below, + * specify it here to highlight that try_alloc_pages() + * doesn't want to deplete reserves. + */ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC; + unsigned int alloc_flags = ALLOC_TRYLOCK; + struct alloc_context ac = { }; + struct page *page; + + /* + * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is + * unsafe in NMI. If spin_trylock() is called from hard IRQ the current + * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will + * mark the task as the owner of another rt_spin_lock which will + * confuse PI logic, so return immediately if called form hard IRQ or + * NMI. + * + * Note, irqs_disabled() case is ok. This function can be called + * from raw_spin_lock_irqsave region. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + return NULL; + if (!pcp_allowed_order(order)) + return NULL; + +#ifdef CONFIG_UNACCEPTED_MEMORY + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (has_unaccepted_memory()) + return NULL; +#endif + /* Bailout, since _deferred_grow_zone() needs to take a lock */ + if (deferred_pages_enabled()) + return NULL; + + if (nid == NUMA_NO_NODE) + nid = numa_node_id(); + + prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, + &alloc_gfp, &alloc_flags); + + /* + * Best effort allocation from percpu free list. + * If it's empty attempt to spin_trylock zone->lock. + */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + + /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ + + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); + return page; +} From 8c57b687e8331eb80e302a2c528b18b966a9ac7a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:24 -0800 Subject: [PATCH 11/80] mm, bpf: Introduce free_pages_nolock() Introduce free_pages_nolock() that can free pages without taking locks. It relies on trylock and can be called from any context. Since spin_trylock() cannot be used in PREEMPT_RT from hard IRQ or NMI it uses lockless link list to stash the pages which will be freed by subsequent free_pages() from good context. Do not use llist unconditionally. BPF maps continuously allocate/free, so we cannot unconditionally delay the freeing to llist. When the memory becomes free make it available to the kernel and BPF users right away if possible, and fallback to llist as the last resort. Acked-by: Vlastimil Babka Acked-by: Sebastian Andrzej Siewior Reviewed-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-4-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 1 + include/linux/mm_types.h | 4 ++ include/linux/mmzone.h | 3 ++ lib/stackdepot.c | 5 ++- mm/page_alloc.c | 93 ++++++++++++++++++++++++++++++++++------ mm/page_owner.c | 8 +++- 6 files changed, 100 insertions(+), 14 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5d9ee78c74e4a..ceb226c2e25c5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -379,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); +extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f94963..483aa90242cdf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -99,6 +99,10 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; + struct { + struct llist_node pcp_llist; + unsigned int order; + }; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da6..e169395539304 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -972,6 +972,9 @@ struct zone { /* Primarily protects free_area */ spinlock_t lock; + /* Pages to be freed when next trylock succeeds */ + struct llist_head trylock_free_pages; + /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 377194969e616..73d7b50924ef6 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -672,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, exit: if (prealloc) { /* Stack depot didn't use this memory, free it. */ - free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); + if (!allow_spin) + free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER); + else + free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) handle = found->handle.handle; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 99753bb9d707e..52836eb650325 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* Free the page without taking locks. Rely on trylock only. */ +#define FPI_TRYLOCK ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -1249,13 +1252,44 @@ static void split_large_buddy(struct zone *zone, struct page *page, } while (1); } +static void add_page_to_zone_llist(struct zone *zone, struct page *page, + unsigned int order) +{ + /* Remember the order */ + page->order = order; + /* Add the page to the free list */ + llist_add(&page->pcp_llist, &zone->trylock_free_pages); +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { + struct llist_head *llhead; unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + add_page_to_zone_llist(zone, page, order); + return; + } + spin_lock_irqsave(&zone->lock, flags); + } + + /* The lock succeeded. Process deferred pages. */ + llhead = &zone->trylock_free_pages; + if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { + struct llist_node *llnode; + struct page *p, *tmp; + + llnode = llist_del_all(llhead); + llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { + unsigned int p_order = p->order; + + split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); + __count_vm_events(PGFREE, 1 << p_order); + } + } split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); @@ -2599,7 +2633,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, static void free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order) + unsigned int order, fpi_t fpi_flags) { int high, batch; int pindex; @@ -2634,6 +2668,14 @@ static void free_frozen_page_commit(struct zone *zone, } if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) pcp->free_count += (1 << order); + + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + /* + * Do not attempt to take a zone lock. Let pcp->count get + * over high mark temporarily. + */ + return; + } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2648,7 +2690,8 @@ static void free_frozen_page_commit(struct zone *zone, /* * Free a pcp page */ -void free_frozen_pages(struct page *page, unsigned int order) +static void __free_frozen_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2657,7 +2700,7 @@ void free_frozen_pages(struct page *page, unsigned int order) int migratetype; if (!pcp_allowed_order(order)) { - __free_pages_ok(page, order, FPI_NONE); + __free_pages_ok(page, order, fpi_flags); return; } @@ -2675,23 +2718,33 @@ void free_frozen_pages(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); return; } migratetype = MIGRATE_MOVABLE; } + if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) + && (in_nmi() || in_hardirq()))) { + add_page_to_zone_llist(zone, page, order); + return; + } pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_frozen_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); } pcp_trylock_finish(UP_flags); } +void free_frozen_pages(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_NONE); +} + /* * Free a batch of folios */ @@ -2780,7 +2833,7 @@ void free_unref_folios(struct folio_batch *folios) trace_mm_page_free_batched(&folio->page); free_frozen_page_commit(zone, pcp, &folio->page, migratetype, - order); + order, FPI_NONE); } if (pcp) { @@ -4822,9 +4875,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) EXPORT_SYMBOL(get_zeroed_page_noprof); /** - * __free_pages - Free pages allocated with alloc_pages(). + * ___free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. + * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of @@ -4841,22 +4895,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ -void __free_pages(struct page *page, unsigned int order) +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { /* get PageHead before we drop reference */ int head = PageHead(page); struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_frozen_pages(page, order); + __free_frozen_pages(page, order, fpi_flags); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_frozen_pages(page + (1 << order), order); + __free_frozen_pages(page + (1 << order), order, + fpi_flags); } } +void __free_pages(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_NONE); +} EXPORT_SYMBOL(__free_pages); +/* + * Can be called while holding raw_spin_lock or from IRQ and NMI for any + * page type (not only those that came from try_alloc_pages) + */ +void free_pages_nolock(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_TRYLOCK); +} + void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d6360eaccbb6..90e31d0e3ed78 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -294,7 +294,13 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner = get_page_owner(page_ext); alloc_handle = page_owner->handle; - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + /* + * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false + * to prevent issues in stack_depot_save(). + * This is similar to try_alloc_pages() gfp flags, but only used + * to signal stack_depot to avoid spin_locks. + */ + handle = save_stack(__GFP_NOWARN); __update_page_owner_free_handle(page_ext, handle, order, current->pid, current->tgid, free_ts_nsec); page_ext_put(page_ext); From 01d37228d331047a0bbbd1026cec2ccabef6d88d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:25 -0800 Subject: [PATCH 12/80] memcg: Use trylock to access memcg stock_lock. Teach memcg to operate under trylock conditions when spinning locks cannot be used. localtry_trylock might fail and this would lead to charge cache bypass if the calling context doesn't allow spinning (gfpflags_allow_spinning). In those cases charge the memcg counter directly and fail early if that is not possible. This might cause a pre-mature charge failing but it will allow an opportunistic charging that is safe from try_alloc_pages path. Acked-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-5-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- mm/memcontrol.c | 53 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 46f8b372d212b..092cab99dec70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1739,7 +1739,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } struct memcg_stock_pcp { - local_lock_t stock_lock; + localtry_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1754,7 +1754,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCAL_LOCK(stock_lock), + .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), }; static DEFINE_MUTEX(percpu_charge_mutex); @@ -1766,6 +1766,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. + * @gfp_mask: allocation mask. * * The charges will only happen if @memcg matches the current cpu's memcg * stock, and at least @nr_pages are available in that stock. Failure to @@ -1773,7 +1774,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) { struct memcg_stock_pcp *stock; unsigned int stock_pages; @@ -1783,7 +1785,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + if (!gfpflags_allow_spinning(gfp_mask)) + return ret; + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + } stock = this_cpu_ptr(&memcg_stock); stock_pages = READ_ONCE(stock->nr_pages); @@ -1792,7 +1798,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -1831,14 +1837,14 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -1868,9 +1874,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long flags; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. + */ + if (mem_cgroup_is_root(memcg)) + return; + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); + return; + } __refill_stock(memcg, nr_pages); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -2213,9 +2230,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long pflags; retry: - if (consume_stock(memcg, nr_pages)) + if (consume_stock(memcg, nr_pages, gfp_mask)) return 0; + if (!gfpflags_allow_spinning(gfp_mask)) + /* Avoid the refill and flush of the older stock */ + batch = nr_pages; + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) @@ -2699,7 +2720,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); /* @@ -2752,7 +2773,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -2762,7 +2783,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool ret = false; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { @@ -2770,7 +2791,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2862,7 +2883,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, unsigned long flags; unsigned int nr_pages = 0; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ @@ -2880,7 +2901,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); if (nr_pages) From e8d78dbd0199a42f4e8599d768e77348f3e59741 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:26 -0800 Subject: [PATCH 13/80] mm, bpf: Use memcg in try_alloc_pages(). Unconditionally use __GFP_ACCOUNT in try_alloc_pages(). The caller is responsible to setup memcg correctly. All BPF memory accounting is memcg based. Acked-by: Vlastimil Babka Acked-by: Shakeel Butt Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-6-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- mm/page_alloc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 52836eb650325..a0904315d4da2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7190,7 +7190,8 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) * specify it here to highlight that try_alloc_pages() * doesn't want to deplete reserves. */ - gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC; + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC + | __GFP_ACCOUNT; unsigned int alloc_flags = ALLOC_TRYLOCK; struct alloc_context ac = { }; struct page *page; @@ -7234,6 +7235,11 @@ struct page *try_alloc_pages_noprof(int nid, unsigned int order) /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ + if (memcg_kmem_online() && page && + unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { + free_pages_nolock(page, order); + page = NULL; + } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); kmsan_alloc_page(page, order, alloc_gfp); return page; From c9eb8102e21e8c4c6fb74d1921d99e4d43713520 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 21 Feb 2025 18:44:27 -0800 Subject: [PATCH 14/80] bpf: Use try_alloc_pages() to allocate pages for bpf needs. Use try_alloc_pages() and free_pages_nolock() for BPF needs when context doesn't allow using normal alloc_pages. This is a prerequisite for further work. Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20250222024427.30294-7-alexei.starovoitov@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- kernel/bpf/arena.c | 5 ++--- kernel/bpf/syscall.c | 23 ++++++++++++++++++++--- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f3f50e29d6392..e1838a3418173 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2348,7 +2348,7 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 870aeb51d70ad..aa43d6c34c46d 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c420edbfb7c87..a7af8d0185d0a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return try_alloc_pages(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } From 0b9363131daf4227d5ae11ee677acdcfff06e938 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9=20=28eBPF=20Foundation=29?= Date: Thu, 27 Feb 2025 15:08:23 +0100 Subject: [PATCH 15/80] bpf/selftests: test_select_reuseport_kern: Remove unused header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_select_reuseport_kern.c is currently including , but it does not use any definition from there. Remove stdlib.h inclusion from test_select_reuseport_kern.c Signed-off-by: Alexis Lothoré (eBPF Foundation) Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250227-remove_wrong_header-v1-1-bc94eb4e2f73@bootlin.com --- tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c index 5eb25c6ad75b1..a5be3267dbb01 100644 --- a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c +++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ -#include #include #include #include From 90d8c8980b5c7e519ef71fec1e6dc51d0c7696be Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 1 Mar 2025 07:18:44 -0800 Subject: [PATCH 16/80] bpf: Summarize sleepable global subprogs The verifier currently does not permit global subprog calls when a lock is held, preemption is disabled, or when IRQs are disabled. This is because we don't know whether the global subprog calls sleepable functions or not. In case of locks, there's an additional reason: functions called by the global subprog may hold additional locks etc. The verifier won't know while verifying the global subprog whether it was called in context where a spin lock is already held by the program. Perform summarization of the sleepable nature of a global subprog just like changes_pkt_data and then allow calls to global subprogs for non-sleepable ones from atomic context. While making this change, I noticed that RCU read sections had no protection against sleepable global subprog calls, include it in the checks and fix this while we're at it. Care needs to be taken to not allow global subprog calls when regular bpf_spin_lock is held. When resilient spin locks is held, we want to potentially have this check relaxed, but not for now. Also make sure extensions freplacing global functions cannot do so in case the target is non-sleepable, but the extension is. The other combination is ok. Tests are included in the next patch to handle all special conditions. Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250301151846.1552362-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 62 ++++++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aec102868b939..4c4028d865ee4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1531,6 +1531,7 @@ struct bpf_prog_aux { bool jits_use_priv_stack; bool priv_stack_requested; bool changes_pkt_data; + bool might_sleep; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bbd013c38ff9f..d338f2a96bbae 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -667,6 +667,7 @@ struct bpf_subprog_info { /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; + bool might_sleep: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dcd0da4e62fca..eb1624f6e7434 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10317,23 +10317,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -16703,6 +16698,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. @@ -16716,6 +16719,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -17183,9 +17187,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17204,6 +17219,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. + */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17320,6 +17342,7 @@ static int check_cfg(struct bpf_verifier_env *env) } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -20845,6 +20868,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -22723,6 +22747,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22765,6 +22790,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); From 6e2cc6067e7313202dea730c3b33fd7f44f1f637 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 1 Mar 2025 07:18:45 -0800 Subject: [PATCH 17/80] selftests/bpf: Test sleepable global subprogs in atomic contexts Add tests for rejecting sleepable and accepting non-sleepable global function calls in atomic contexts. For spin locks, we still reject all global function calls. Once resilient spin locks land, we will carefully lift in cases where we deem it safe. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250301151846.1552362-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/rcu_read_lock.c | 3 + .../selftests/bpf/prog_tests/spin_lock.c | 3 + tools/testing/selftests/bpf/progs/irq.c | 71 ++++++++++++++++++- .../selftests/bpf/progs/preempt_lock.c | 68 +++++++++++++++++- .../selftests/bpf/progs/rcu_read_lock.c | 58 +++++++++++++++ .../selftests/bpf/progs/test_spin_lock_fail.c | 69 ++++++++++++++++++ 6 files changed, 270 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index ebe0c12b55363..c9f855e5da240 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -81,6 +81,9 @@ static const char * const inproper_region_tests[] = { "nested_rcu_region", "rcu_read_lock_global_subprog_lock", "rcu_read_lock_global_subprog_unlock", + "rcu_read_lock_sleepable_helper_global_subprog", + "rcu_read_lock_sleepable_kfunc_global_subprog", + "rcu_read_lock_sleepable_global_subprog_indirect", }; static void test_inproper_region(void) diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index 2b0068742ef9f..e3ea5dc2f697c 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -50,6 +50,9 @@ static struct { { "lock_id_mismatch_innermapval_mapval", "bpf_spin_unlock of different lock" }, { "lock_global_subprog_call1", "global function calls are not allowed while holding a lock" }, { "lock_global_subprog_call2", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_helper_subprog", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_kfunc_subprog", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_subprog_indirect", "global function calls are not allowed while holding a lock" }, }; static int match_regex(const char *pattern, const char *string) diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c index b0b53d9809642..298d48d7886d9 100644 --- a/tools/testing/selftests/bpf/progs/irq.c +++ b/tools/testing/selftests/bpf/progs/irq.c @@ -222,7 +222,7 @@ int __noinline global_local_irq_balance(void) } SEC("?tc") -__failure __msg("global function calls are not allowed with IRQs disabled") +__success int irq_global_subprog(struct __sk_buff *ctx) { unsigned long flags; @@ -441,4 +441,73 @@ int irq_ooo_refs_array(struct __sk_buff *ctx) return 0; } +int __noinline +global_subprog(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +__success +int irq_non_sleepable_global_subprog(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_subprog(0); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int irq_sleepable_helper_global_subprog(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_sleepable_helper_subprog(0); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int irq_sleepable_global_subprog_indirect(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_subprog_calling_sleepable_global(0); + bpf_local_irq_restore(&flags); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 6c5797bf0ead5..7d04254e61f1c 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -134,7 +134,7 @@ int __noinline preempt_global_subprog(void) } SEC("?tc") -__failure __msg("global function calls are not allowed with preemption disabled") +__success int preempt_global_subprog_test(struct __sk_buff *ctx) { preempt_disable(); @@ -143,4 +143,70 @@ int preempt_global_subprog_test(struct __sk_buff *ctx) return 0; } +int __noinline +global_subprog(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_helper_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_sleepable_helper_subprog(ctx->mark); + preempt_enable(); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_kfunc_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_sleepable_kfunc_subprog(ctx->mark); + preempt_enable(); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_subprog_indirect(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_subprog_calling_sleepable_global(ctx->mark); + preempt_enable(); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index ab3a532b7dd6d..5cf1ae637ec7f 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -439,3 +439,61 @@ int rcu_read_lock_global_subprog_unlock(void *ctx) ret += global_subprog_unlock(ret); return 0; } + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_helper_global_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_sleepable_helper_subprog(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_kfunc_global_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_sleepable_kfunc_subprog(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_global_subprog_indirect(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_subprog_calling_sleepable_global(ret); + bpf_rcu_read_unlock(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c index 1c8b678e2e9a3..f678ee6bd7eaf 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c @@ -245,4 +245,73 @@ int lock_global_subprog_call2(struct __sk_buff *ctx) return ret; } +int __noinline +global_subprog_int(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog_int(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +int lock_global_sleepable_helper_subprog(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_sleepable_helper_subprog(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("?syscall") +int lock_global_sleepable_kfunc_subprog(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_sleepable_kfunc_subprog(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("?syscall") +int lock_global_sleepable_subprog_indirect(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_subprog_calling_sleepable_global(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + char _license[] SEC("license") = "GPL"; From ce9add7b9028f95c8b02170332ab2783fa27772d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 1 Mar 2025 07:18:46 -0800 Subject: [PATCH 18/80] selftests/bpf: Add tests for extending sleepable global subprogs Add tests for freplace behavior with the combination of sleepable and non-sleepable global subprogs. The changes_pkt_data selftest did all the hardwork, so simply rename it and include new support for more summarization tests for might_sleep bit. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250301151846.1552362-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/changes_pkt_data.c | 107 ------------- .../selftests/bpf/prog_tests/summarization.c | 144 ++++++++++++++++++ .../selftests/bpf/progs/changes_pkt_data.c | 39 ----- .../selftests/bpf/progs/summarization.c | 78 ++++++++++ ...ta_freplace.c => summarization_freplace.c} | 17 ++- 5 files changed, 238 insertions(+), 147 deletions(-) delete mode 100644 tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/prog_tests/summarization.c delete mode 100644 tools/testing/selftests/bpf/progs/changes_pkt_data.c create mode 100644 tools/testing/selftests/bpf/progs/summarization.c rename tools/testing/selftests/bpf/progs/{changes_pkt_data_freplace.c => summarization_freplace.c} (57%) diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c deleted file mode 100644 index 7526de3790814..0000000000000 --- a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bpf/libbpf.h" -#include "changes_pkt_data_freplace.skel.h" -#include "changes_pkt_data.skel.h" -#include - -static void print_verifier_log(const char *log) -{ - if (env.verbosity >= VERBOSE_VERY) - fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); -} - -static void test_aux(const char *main_prog_name, - const char *to_be_replaced, - const char *replacement, - bool expect_load) -{ - struct changes_pkt_data_freplace *freplace = NULL; - struct bpf_program *freplace_prog = NULL; - struct bpf_program *main_prog = NULL; - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct changes_pkt_data *main = NULL; - char log[16*1024]; - int err; - - opts.kernel_log_buf = log; - opts.kernel_log_size = sizeof(log); - if (env.verbosity >= VERBOSE_SUPER) - opts.kernel_log_level = 1 | 2 | 4; - main = changes_pkt_data__open_opts(&opts); - if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) - goto out; - main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); - if (!ASSERT_OK_PTR(main_prog, "main_prog")) - goto out; - bpf_program__set_autoload(main_prog, true); - err = changes_pkt_data__load(main); - print_verifier_log(log); - if (!ASSERT_OK(err, "changes_pkt_data__load")) - goto out; - freplace = changes_pkt_data_freplace__open_opts(&opts); - if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) - goto out; - freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); - if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) - goto out; - bpf_program__set_autoload(freplace_prog, true); - bpf_program__set_autoattach(freplace_prog, true); - bpf_program__set_attach_target(freplace_prog, - bpf_program__fd(main_prog), - to_be_replaced); - err = changes_pkt_data_freplace__load(freplace); - print_verifier_log(log); - if (expect_load) { - ASSERT_OK(err, "changes_pkt_data_freplace__load"); - } else { - ASSERT_ERR(err, "changes_pkt_data_freplace__load"); - ASSERT_HAS_SUBSTR(log, "Extension program changes packet data", "error log"); - } - -out: - changes_pkt_data_freplace__destroy(freplace); - changes_pkt_data__destroy(main); -} - -/* There are two global subprograms in both changes_pkt_data.skel.h: - * - one changes packet data; - * - another does not. - * It is ok to freplace subprograms that change packet data with those - * that either do or do not. It is only ok to freplace subprograms - * that do not change packet data with those that do not as well. - * The below tests check outcomes for each combination of such freplace. - * Also test a case when main subprogram itself is replaced and is a single - * subprogram in a program. - */ -void test_changes_pkt_data_freplace(void) -{ - struct { - const char *main; - const char *to_be_replaced; - bool changes; - } mains[] = { - { "main_with_subprogs", "changes_pkt_data", true }, - { "main_with_subprogs", "does_not_change_pkt_data", false }, - { "main_changes", "main_changes", true }, - { "main_does_not_change", "main_does_not_change", false }, - }; - struct { - const char *func; - bool changes; - } replacements[] = { - { "changes_pkt_data", true }, - { "does_not_change_pkt_data", false } - }; - char buf[64]; - - for (int i = 0; i < ARRAY_SIZE(mains); ++i) { - for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { - snprintf(buf, sizeof(buf), "%s_with_%s", - mains[i].to_be_replaced, replacements[j].func); - if (!test__start_subtest(buf)) - continue; - test_aux(mains[i].main, mains[i].to_be_replaced, replacements[j].func, - mains[i].changes || !replacements[j].changes); - } - } -} diff --git a/tools/testing/selftests/bpf/prog_tests/summarization.c b/tools/testing/selftests/bpf/prog_tests/summarization.c new file mode 100644 index 0000000000000..5dd6c120a8386 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/summarization.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include "summarization_freplace.skel.h" +#include "summarization.skel.h" +#include + +static void print_verifier_log(const char *log) +{ + if (env.verbosity >= VERBOSE_VERY) + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); +} + +static void test_aux(const char *main_prog_name, + const char *to_be_replaced, + const char *replacement, + bool expect_load, + const char *err_msg) +{ + struct summarization_freplace *freplace = NULL; + struct bpf_program *freplace_prog = NULL; + struct bpf_program *main_prog = NULL; + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct summarization *main = NULL; + char log[16*1024]; + int err; + + opts.kernel_log_buf = log; + opts.kernel_log_size = sizeof(log); + if (env.verbosity >= VERBOSE_SUPER) + opts.kernel_log_level = 1 | 2 | 4; + main = summarization__open_opts(&opts); + if (!ASSERT_OK_PTR(main, "summarization__open")) + goto out; + main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); + if (!ASSERT_OK_PTR(main_prog, "main_prog")) + goto out; + bpf_program__set_autoload(main_prog, true); + err = summarization__load(main); + print_verifier_log(log); + if (!ASSERT_OK(err, "summarization__load")) + goto out; + freplace = summarization_freplace__open_opts(&opts); + if (!ASSERT_OK_PTR(freplace, "summarization_freplace__open")) + goto out; + freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); + if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) + goto out; + bpf_program__set_autoload(freplace_prog, true); + bpf_program__set_autoattach(freplace_prog, true); + bpf_program__set_attach_target(freplace_prog, + bpf_program__fd(main_prog), + to_be_replaced); + err = summarization_freplace__load(freplace); + print_verifier_log(log); + + /* The might_sleep extension doesn't work yet as sleepable calls are not + * allowed, but preserve the check in case it's supported later and then + * this particular combination can be enabled. + */ + if (!strcmp("might_sleep", replacement) && err) { + ASSERT_HAS_SUBSTR(log, "helper call might sleep in a non-sleepable prog", "error log"); + ASSERT_EQ(err, -EINVAL, "err"); + test__skip(); + goto out; + } + + if (expect_load) { + ASSERT_OK(err, "summarization_freplace__load"); + } else { + ASSERT_ERR(err, "summarization_freplace__load"); + ASSERT_HAS_SUBSTR(log, err_msg, "error log"); + } + +out: + summarization_freplace__destroy(freplace); + summarization__destroy(main); +} + +/* There are two global subprograms in both summarization.skel.h: + * - one changes packet data; + * - another does not. + * It is ok to freplace subprograms that change packet data with those + * that either do or do not. It is only ok to freplace subprograms + * that do not change packet data with those that do not as well. + * The below tests check outcomes for each combination of such freplace. + * Also test a case when main subprogram itself is replaced and is a single + * subprogram in a program. + * + * This holds for might_sleep programs. It is ok to replace might_sleep with + * might_sleep and with does_not_sleep, but does_not_sleep cannot be replaced + * with might_sleep. + */ +void test_summarization_freplace(void) +{ + struct { + const char *main; + const char *to_be_replaced; + bool has_side_effect; + } mains[2][4] = { + { + { "main_changes_with_subprogs", "changes_pkt_data", true }, + { "main_changes_with_subprogs", "does_not_change_pkt_data", false }, + { "main_changes", "main_changes", true }, + { "main_does_not_change", "main_does_not_change", false }, + }, + { + { "main_might_sleep_with_subprogs", "might_sleep", true }, + { "main_might_sleep_with_subprogs", "does_not_sleep", false }, + { "main_might_sleep", "main_might_sleep", true }, + { "main_does_not_sleep", "main_does_not_sleep", false }, + }, + }; + const char *pkt_err = "Extension program changes packet data"; + const char *slp_err = "Extension program may sleep"; + struct { + const char *func; + bool has_side_effect; + const char *err_msg; + } replacements[2][2] = { + { + { "changes_pkt_data", true, pkt_err }, + { "does_not_change_pkt_data", false, pkt_err }, + }, + { + { "might_sleep", true, slp_err }, + { "does_not_sleep", false, slp_err }, + }, + }; + char buf[64]; + + for (int t = 0; t < 2; t++) { + for (int i = 0; i < ARRAY_SIZE(mains); ++i) { + for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { + snprintf(buf, sizeof(buf), "%s_with_%s", + mains[t][i].to_be_replaced, replacements[t][j].func); + if (!test__start_subtest(buf)) + continue; + test_aux(mains[t][i].main, mains[t][i].to_be_replaced, replacements[t][j].func, + mains[t][i].has_side_effect || !replacements[t][j].has_side_effect, + replacements[t][j].err_msg); + } + } + } +} diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c deleted file mode 100644 index 43cada48b28ad..0000000000000 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -__noinline -long changes_pkt_data(struct __sk_buff *sk) -{ - return bpf_skb_pull_data(sk, 0); -} - -__noinline __weak -long does_not_change_pkt_data(struct __sk_buff *sk) -{ - return 0; -} - -SEC("?tc") -int main_with_subprogs(struct __sk_buff *sk) -{ - changes_pkt_data(sk); - does_not_change_pkt_data(sk); - return 0; -} - -SEC("?tc") -int main_changes(struct __sk_buff *sk) -{ - bpf_skb_pull_data(sk, 0); - return 0; -} - -SEC("?tc") -int main_does_not_change(struct __sk_buff *sk) -{ - return 0; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/summarization.c b/tools/testing/selftests/bpf/progs/summarization.c new file mode 100644 index 0000000000000..f89effe82c9e2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/summarization.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +__noinline +long changes_pkt_data(struct __sk_buff *sk) +{ + return bpf_skb_pull_data(sk, 0); +} + +__noinline __weak +long does_not_change_pkt_data(struct __sk_buff *sk) +{ + return 0; +} + +SEC("?tc") +int main_changes_with_subprogs(struct __sk_buff *sk) +{ + changes_pkt_data(sk); + does_not_change_pkt_data(sk); + return 0; +} + +SEC("?tc") +int main_changes(struct __sk_buff *sk) +{ + bpf_skb_pull_data(sk, 0); + return 0; +} + +SEC("?tc") +int main_does_not_change(struct __sk_buff *sk) +{ + return 0; +} + +__noinline +long might_sleep(struct pt_regs *ctx __arg_ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +__noinline __weak +long does_not_sleep(struct pt_regs *ctx __arg_ctx) +{ + return 0; +} + +SEC("?uprobe.s") +int main_might_sleep_with_subprogs(struct pt_regs *ctx) +{ + might_sleep(ctx); + does_not_sleep(ctx); + return 0; +} + +SEC("?uprobe.s") +int main_might_sleep(struct pt_regs *ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +SEC("?uprobe.s") +int main_does_not_sleep(struct pt_regs *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/summarization_freplace.c similarity index 57% rename from tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c rename to tools/testing/selftests/bpf/progs/summarization_freplace.c index f9a622705f1b3..935f00e0e9ea0 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c +++ b/tools/testing/selftests/bpf/progs/summarization_freplace.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include SEC("?freplace") @@ -15,4 +15,19 @@ long does_not_change_pkt_data(struct __sk_buff *sk) return 0; } +SEC("?freplace") +long might_sleep(struct pt_regs *ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +SEC("?freplace") +long does_not_sleep(struct pt_regs *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; From 17a82ed98f62e0b3d2465e482ae5d0503e989a48 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Mar 2025 19:13:15 +0000 Subject: [PATCH 19/80] bpf: no longer acquire map_idr_lock in bpf_map_inc_not_zero() bpf_sk_storage_clone() is the only caller of bpf_map_inc_not_zero() and is holding rcu_read_lock(). map_idr_lock does not add any protection, just remove the cost for passive TCP flows. Signed-off-by: Eric Dumazet Cc: Kui-Feng Lee Cc: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20250301191315.1532629-1-edumazet@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1392b546c88b0..57a4387062159 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1610,11 +1610,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); From b2d9ef71d4c9ac4cfd42e1db9d42fd6b961611c9 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 3 Mar 2025 05:37:07 +0000 Subject: [PATCH 20/80] bpf: Factor out atomic_ptr_type_ok() Factor out atomic_ptr_type_ok() as a helper function to be used later. Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/e5ef8b3116f3fffce78117a14060ddce05eba52a.1740978603.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index eb1624f6e7434..66b19fa4be489 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6195,6 +6195,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. + */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7652,11 +7672,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); From d430c46c758016b0618256298043fad12336ed6d Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 3 Mar 2025 05:37:19 +0000 Subject: [PATCH 21/80] bpf: Factor out check_atomic_rmw() Currently, check_atomic() only handles atomic read-modify-write (RMW) instructions. Since we are planning to introduce other types of atomic instructions (i.e., atomic load/store), extract the existing RMW handling logic into its own function named check_atomic_rmw(). Remove the @insn_idx parameter as it is not really necessary. Use 'env->insn_idx' instead, as in other places in verifier.c. Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/6323ac8e73a10a1c8ee547c77ed68cf8eb6b90e1.1740978603.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 53 +++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 66b19fa4be489..e3991ac72029f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7616,28 +7616,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) { int load_reg; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } - if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); return -EINVAL; @@ -7699,12 +7683,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7714,13 +7698,34 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -19224,7 +19229,7 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type dst_reg_type; if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; From d38ad248fb7a526ff4ea7218c30c421d3b2ff34b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Mon, 3 Mar 2025 05:37:25 +0000 Subject: [PATCH 22/80] bpf: Factor out check_load_mem() and check_store_reg() Extract BPF_LDX and most non-ATOMIC BPF_STX instruction handling logic in do_check() into helper functions to be used later. While we are here, make that comment about "reserved fields" more specific. Suggested-by: Eduard Zingerman Acked-by: Eduard Zingerman Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/8b39c94eac2bb7389ff12392ca666f939124ec4f.1740978603.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 110 +++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 43 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e3991ac72029f..22c4edc8695c0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7616,6 +7616,67 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; + int err; + + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable. */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + static int check_atomic_rmw(struct bpf_verifier_env *env, struct bpf_insn *insn) { @@ -19199,35 +19260,16 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; - - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, insn); if (err) @@ -19241,25 +19283,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { From 128cd7672577dcc2026b95a674700ab0ccc52ccd Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 28 Feb 2025 16:01:45 -0800 Subject: [PATCH 23/80] veristat: @files-list.txt notation for object files list Allow reading object file list from file. E.g. the following command: ./veristat @list.txt Is equivalent to the following invocation: ./veristat line-1 line-2 ... line-N Where line-i corresponds to lines from list.txt. Lines starting with '#' are ignored. Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20250301000147.1583999-2-eddyz87@gmail.com --- tools/testing/selftests/bpf/veristat.c | 62 ++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 175a03e6c5ef4..8bc4622992902 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -268,10 +268,11 @@ static int append_filter(struct filter **filters, int *cnt, const char *str); static int append_filter_file(const char *path); static int append_var_preset(struct var_preset **presets, int *cnt, const char *expr); static int append_var_preset_file(const char *filename); +static int append_file(const char *path); +static int append_file_from_file(const char *path); static error_t parse_arg(int key, char *arg, struct argp_state *state) { - void *tmp; int err; switch (key) { @@ -381,14 +382,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; } case ARGP_KEY_ARG: - tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); - if (!tmp) - return -ENOMEM; - env.filenames = tmp; - env.filenames[env.filename_cnt] = strdup(arg); - if (!env.filenames[env.filename_cnt]) - return -ENOMEM; - env.filename_cnt++; + if (arg[0] == '@') + err = append_file_from_file(arg + 1); + else + err = append_file(arg); + if (err) { + fprintf(stderr, "Failed to collect BPF object files: %d\n", err); + return err; + } break; default: return ARGP_ERR_UNKNOWN; @@ -689,6 +690,49 @@ static const struct stat_specs default_output_spec = { }, }; +static int append_file(const char *path) +{ + void *tmp; + + tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); + if (!tmp) + return -ENOMEM; + env.filenames = tmp; + env.filenames[env.filename_cnt] = strdup(path); + if (!env.filenames[env.filename_cnt]) + return -ENOMEM; + env.filename_cnt++; + return 0; +} + +static int append_file_from_file(const char *path) +{ + char buf[1024]; + int err = 0; + FILE *f; + + f = fopen(path, "r"); + if (!f) { + err = -errno; + fprintf(stderr, "Failed to open object files list in '%s': %s\n", + path, strerror(errno)); + return err; + } + + while (fscanf(f, " %1023[^\n]\n", buf) == 1) { + /* lines starting with # are comments, skip them */ + if (buf[0] == '\0' || buf[0] == '#') + continue; + err = append_file(buf); + if (err) + goto cleanup; + } + +cleanup: + fclose(f); + return err; +} + static const struct stat_specs default_csv_output_spec = { .spec_cnt = 14, .ids = { From 164975333cec24bf8ea1b15fe93b2f9568005b39 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 28 Feb 2025 16:01:46 -0800 Subject: [PATCH 24/80] veristat: Strerror expects positive number (errno) Before: ./veristat -G @foobar iters.bpf.o Failed to open presets in 'foobar': Unknown error -2 ... After: ./veristat -G @foobar iters.bpf.o Failed to open presets in 'foobar': No such file or directory ... Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20250301000147.1583999-3-eddyz87@gmail.com --- tools/testing/selftests/bpf/veristat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 8bc4622992902..41dfcb6f56903 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -660,7 +660,7 @@ static int append_filter_file(const char *path) f = fopen(path, "r"); if (!f) { err = -errno; - fprintf(stderr, "Failed to open filters in '%s': %s\n", path, strerror(err)); + fprintf(stderr, "Failed to open filters in '%s': %s\n", path, strerror(-err)); return err; } @@ -1422,7 +1422,7 @@ static int append_var_preset_file(const char *filename) f = fopen(filename, "rt"); if (!f) { err = -errno; - fprintf(stderr, "Failed to open presets in '%s': %s\n", filename, strerror(err)); + fprintf(stderr, "Failed to open presets in '%s': %s\n", filename, strerror(-err)); return -EINVAL; } From b12752801e4472cc1b472606462eaab33008bfe0 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 28 Feb 2025 16:01:47 -0800 Subject: [PATCH 25/80] veristat: Report program type guess results to sdterr In order not to pollute CSV output, e.g.: $ ./veristat -o csv exceptions_ext.bpf.o > test.csv Using guessed program type 'sched_cls' for exceptions_ext.bpf.o/extension... Using guessed program type 'sched_cls' for exceptions_ext.bpf.o/throwing_extension... Signed-off-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Acked-by: Mykyta Yatsenko Link: https://lore.kernel.org/bpf/20250301000147.1583999-4-eddyz87@gmail.com --- tools/testing/selftests/bpf/veristat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 41dfcb6f56903..a18972ffdeb6f 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -1234,13 +1234,13 @@ static void fixup_obj(struct bpf_object *obj, struct bpf_program *prog, const ch bpf_program__set_expected_attach_type(prog, attach_type); if (!env.quiet) { - printf("Using guessed program type '%s' for %s/%s...\n", + fprintf(stderr, "Using guessed program type '%s' for %s/%s...\n", libbpf_bpf_prog_type_str(prog_type), filename, prog_name); } } else { if (!env.quiet) { - printf("Failed to guess program type for freplace program with context type name '%s' for %s/%s. Consider using canonical type names to help veristat...\n", + fprintf(stderr, "Failed to guess program type for freplace program with context type name '%s' for %s/%s. Consider using canonical type names to help veristat...\n", ctx_name, filename, prog_name); } } From 6829f3c51baf21f4b43ed6092b261ea4c2f6b1d2 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:49 +0100 Subject: [PATCH 26/80] selftests/bpf: test_tunnel: Add generic_attach* helpers A fair amount of code duplication is present among tests to attach BPF programs. Create generic_attach* helpers that attach BPF programs to a given interface. Use ASSERT_OK_FD() instead of ASSERT_GE() to check fd's validity. Use these helpers in all the available tests. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-1-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 140 +++++++++--------- 1 file changed, 66 insertions(+), 74 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index cec746e77cd3a..cc03a4440182c 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -364,32 +364,34 @@ static int test_ping(int family, const char *addr) return -1; } -static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd) +static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) { + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, .priority = 1, .prog_fd = igr_fd); DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, .priority = 1, .prog_fd = egr_fd); int ret; - ret = bpf_tc_hook_create(hook); + ret = bpf_tc_hook_create(&hook); if (!ASSERT_OK(ret, "create tc hook")) return ret; if (igr_fd >= 0) { - hook->attach_point = BPF_TC_INGRESS; - ret = bpf_tc_attach(hook, &opts1); + hook.attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(&hook, &opts1); if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(hook); + bpf_tc_hook_destroy(&hook); return ret; } } if (egr_fd >= 0) { - hook->attach_point = BPF_TC_EGRESS; - ret = bpf_tc_attach(hook, &opts2); + hook.attach_point = BPF_TC_EGRESS; + ret = bpf_tc_attach(&hook, &opts2); if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(hook); + bpf_tc_hook_destroy(&hook); return ret; } } @@ -397,6 +399,50 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd) return 0; } +static int generic_attach(const char *dev, int igr_fd, int egr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) + return -1; + if (!ASSERT_OK_FD(egr_fd, "check egress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, igr_fd, egr_fd); +} + +static int generic_attach_igr(const char *dev, int igr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, igr_fd, -1); +} + +static int generic_attach_egr(const char *dev, int egr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(egr_fd, "check egress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, -1, egr_fd); +} + static void test_vxlan_tunnel(void) { struct test_tunnel_kern *skel = NULL; @@ -404,11 +450,9 @@ static void test_vxlan_tunnel(void) int local_ip_map_fd = -1; int set_src_prog_fd, get_src_prog_fd; int set_dst_prog_fd; - int key = 0, ifindex = -1; + int key = 0; uint local_ip; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add vxlan tunnel */ err = add_vxlan_tunnel(); @@ -419,42 +463,22 @@ static void test_vxlan_tunnel(void) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(VXLAN_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "vxlan11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; get_src_prog_fd = bpf_program__fd(skel->progs.vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_src); - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach bpf prog to veth dev tc hook point */ - ifindex = if_nametoindex("veth1"); - if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.veth_set_outer_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, set_dst_prog_fd, -1)) + if (generic_attach_igr("veth1", set_dst_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ nstoken = open_netns("at_ns0"); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; - ifindex = if_nametoindex(VXLAN_TUNL_DEV0); - if (!ASSERT_NEQ(ifindex, 0, "vxlan00 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, -1, set_dst_prog_fd)) + if (generic_attach_egr(VXLAN_TUNL_DEV0, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -488,11 +512,9 @@ static void test_ip6vxlan_tunnel(void) int local_ip_map_fd = -1; int set_src_prog_fd, get_src_prog_fd; int set_dst_prog_fd; - int key = 0, ifindex = -1; + int key = 0; uint local_ip; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add vxlan tunnel */ err = add_ip6vxlan_tunnel(); @@ -503,31 +525,17 @@ static void test_ip6vxlan_tunnel(void) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(IP6VXLAN_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "ip6vxlan11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; get_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_src); - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ nstoken = open_netns("at_ns0"); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; - ifindex = if_nametoindex(IP6VXLAN_TUNL_DEV0); - if (!ASSERT_NEQ(ifindex, 0, "ip6vxlan00 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, -1, set_dst_prog_fd)) + if (generic_attach_egr(IP6VXLAN_TUNL_DEV0, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -559,10 +567,7 @@ static void test_ipip_tunnel(enum ipip_encap encap) struct test_tunnel_kern *skel = NULL; struct nstoken *nstoken; int set_src_prog_fd, get_src_prog_fd; - int ifindex = -1; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add ipip tunnel */ err = add_ipip_tunnel(encap); @@ -573,10 +578,6 @@ static void test_ipip_tunnel(enum ipip_encap encap) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(IPIP_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "ipip11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; switch (encap) { case FOU: @@ -598,11 +599,7 @@ static void test_ipip_tunnel(enum ipip_encap encap) skel->progs.ipip_set_tunnel); } - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* ping from root namespace test */ @@ -628,8 +625,6 @@ static void test_ipip_tunnel(enum ipip_encap encap) static void test_xfrm_tunnel(void) { - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_xdp_attach_opts, opts); struct test_tunnel_kern *skel = NULL; struct nstoken *nstoken; @@ -646,19 +641,16 @@ static void test_xfrm_tunnel(void) if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex("veth1"); - if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) - goto done; /* attach tc prog to tunnel dev */ - tc_hook.ifindex = ifindex; tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state); - if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, tc_prog_fd, -1)) + if (generic_attach_igr("veth1", tc_prog_fd)) goto done; /* attach xdp prog to tunnel dev */ + ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) + goto done; xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp); if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd")) goto done; From 7289e59a7667c4d7398491493b045db469b9008c Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:50 +0100 Subject: [PATCH 27/80] selftests/bpf: test_tunnel: Add ping helpers All tests use more or less the same ping commands as final validation. Also test_ping()'s return value is checked with ASSERT_OK() while this check is already done by the SYS() macro inside test_ping(). Create helpers around test_ping() and use them in the tests to avoid code duplication. Remove the unnecessary ASSERT_OK() from the tests. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-2-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 53 +++++++++---------- 1 file changed, 24 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index cc03a4440182c..09e674c147bb3 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -364,6 +364,25 @@ static int test_ping(int family, const char *addr) return -1; } +static void ping_dev0(void) +{ + /* ping from root namespace test */ + test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); +} + +static void ping_dev1(void) +{ + struct nstoken *nstoken; + + /* ping from at_ns0 namespace test */ + nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + return; + + test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); + close_netns(nstoken); +} + static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) { DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, @@ -492,9 +511,7 @@ static void test_vxlan_tunnel(void) goto done; /* ping test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev0(); done: /* delete vxlan tunnel */ @@ -549,9 +566,7 @@ static void test_ip6vxlan_tunnel(void) goto done; /* ping test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev0(); done: /* delete ipv6 vxlan tunnel */ @@ -565,7 +580,6 @@ static void test_ip6vxlan_tunnel(void) static void test_ipip_tunnel(enum ipip_encap encap) { struct test_tunnel_kern *skel = NULL; - struct nstoken *nstoken; int set_src_prog_fd, get_src_prog_fd; int err; @@ -602,19 +616,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; - /* ping from root namespace test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; - - /* ping from at_ns0 namespace test */ - nstoken = open_netns("at_ns0"); - if (!ASSERT_OK_PTR(nstoken, "setns")) - goto done; - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); - if (!ASSERT_OK(err, "test_ping")) - goto done; - close_netns(nstoken); + ping_dev0(); + ping_dev1(); done: /* delete ipip tunnel */ @@ -627,7 +630,6 @@ static void test_xfrm_tunnel(void) { LIBBPF_OPTS(bpf_xdp_attach_opts, opts); struct test_tunnel_kern *skel = NULL; - struct nstoken *nstoken; int xdp_prog_fd; int tc_prog_fd; int ifindex; @@ -658,14 +660,7 @@ static void test_xfrm_tunnel(void) if (!ASSERT_OK(err, "bpf_xdp_attach")) goto done; - /* ping from at_ns0 namespace test */ - nstoken = open_netns("at_ns0"); - if (!ASSERT_OK_PTR(nstoken, "setns")) - goto done; - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); - close_netns(nstoken); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev1(); if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id")) goto done; From 08d20eaa072771aecf8cdf38dbdb71d9a29c57b2 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:51 +0100 Subject: [PATCH 28/80] selftests/bpf: test_tunnel: Move gre tunnel test to test_progs gre tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test gre tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_gre() and test_gre_no_tunnel_key() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-3-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 97 +++++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 79 --------------- 2 files changed, 97 insertions(+), 79 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 09e674c147bb3..79b0b1cd85657 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -98,6 +98,9 @@ #define XFRM_SPI_IN_TO_OUT 0x1 #define XFRM_SPI_OUT_TO_IN 0x2 +#define GRE_TUNL_DEV0 "gre00" +#define GRE_TUNL_DEV1 "gre11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -216,6 +219,18 @@ static int set_ipip_encap(const char *ipproto, const char *type) return -1; } +static int set_ipv4_addr(const char *dev0, const char *dev1) +{ + SYS(fail, "ip -n at_ns0 link set dev %s up", dev0); + SYS(fail, "ip -n at_ns0 addr add dev %s %s/24", dev0, IP4_ADDR_TUNL_DEV0); + SYS(fail, "ip link set dev %s up", dev1); + SYS(fail, "ip addr add dev %s %s/24", dev1, IP4_ADDR_TUNL_DEV1); + + return 0; +fail: + return 1; +} + static int add_ipip_tunnel(enum ipip_encap encap) { int err; @@ -356,6 +371,31 @@ static void delete_xfrm_tunnel(void) IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN); } +static int add_ipv4_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s %s local %s remote %s", + dev0, type, opt, IP4_ADDR_VETH0, IP4_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s external", dev1, type); + + return set_ipv4_addr(dev0, dev1); +fail: + return -1; +} + +static void delete_tunnel(const char *dev0, const char *dev1) +{ + if (!dev0 || !dev1) + return; + + SYS_NOFAIL("ip netns exec at_ns0 ip link delete dev %s", dev0); + SYS_NOFAIL("ip link delete dev %s", dev1); +} + static int test_ping(int family, const char *addr) { SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr); @@ -677,6 +717,59 @@ static void test_xfrm_tunnel(void) test_tunnel_kern__destroy(skel); } +enum gre_test { + GRE, + GRE_NOKEY, + GRETAP, + GRETAP_NOKEY, +}; + +static void test_gre_tunnel(enum gre_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case GRE: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gre", "seq"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel_no_key); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRE_NOKEY: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gre", "seq key 2"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRETAP: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gretap", "seq"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel_no_key); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRETAP_NOKEY: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gretap", "seq key 2"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + if (generic_attach(GRE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); + +done: + delete_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -694,6 +787,10 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(ipip_tunnel, FOU); RUN_TEST(ipip_tunnel, GUE); RUN_TEST(xfrm_tunnel); + RUN_TEST(gre_tunnel, GRE); + RUN_TEST(gre_tunnel, GRE_NOKEY); + RUN_TEST(gre_tunnel, GRETAP); + RUN_TEST(gre_tunnel, GRETAP_NOKEY); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index d9661b9988ba9..48ac9cb2092f6 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,26 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_gre_tunnel() -{ - tun_key= - if [ -n "$1" ]; then - tun_key="key $1" - fi - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq $tun_key \ - local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE $tun_key external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - add_ip6gretap_tunnel() { @@ -234,54 +214,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_gre() -{ - TYPE=gretap - DEV_NS=gretap00 - DEV=gretap11 - ret=0 - - check $TYPE - config_device - add_gre_tunnel 2 - attach_bpf $DEV gre_set_tunnel gre_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_gre_no_tunnel_key() -{ - TYPE=gre - DEV_NS=gre00 - DEV=gre11 - ret=0 - - check $TYPE - config_device - add_gre_tunnel - attach_bpf $DEV gre_set_tunnel_no_key gre_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - test_ip6gre() { TYPE=ip6gre @@ -538,8 +470,6 @@ cleanup() ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null ip link del ip6ip6tnl11 2> /dev/null - ip link del gretap11 2> /dev/null - ip link del gre11 2> /dev/null ip link del ip6gre11 2> /dev/null ip link del ip6gretap11 2> /dev/null ip link del geneve11 2> /dev/null @@ -567,7 +497,6 @@ check() enable_debug() { - echo 'file ip_gre.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file ip6_gre.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control @@ -584,14 +513,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing GRE tunnel..." - test_gre - errors=$(( $errors + $? )) - - echo "Testing GRE tunnel (without tunnel keys)..." - test_gre_no_tunnel_key - errors=$(( $errors + $? )) - echo "Testing IP6GRE tunnel..." test_ip6gre errors=$(( $errors + $? )) From 1ea01a806e4cb69d9cceff3cd1225833c78c2453 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:52 +0100 Subject: [PATCH 29/80] selftests/bpf: test_tunnel: Move ip6gre tunnel test to test_progs ip6gre tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test ip6gre tunnels. It uses the same network topology and the same BPF programs than the script. Disable the IPv6 DAD feature because it can take lot of time and cause some tests to fail depending on the environment they're run on. Remove test_ip6gre() and test_ip6gretap() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-4-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 104 ++++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 95 ---------------- 2 files changed, 104 insertions(+), 95 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 79b0b1cd85657..1aa0fa56a679a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -71,6 +71,8 @@ #define IP4_ADDR2_VETH1 "172.16.1.20" #define IP4_ADDR_TUNL_DEV0 "10.1.1.100" #define IP4_ADDR_TUNL_DEV1 "10.1.1.200" +#define IP6_ADDR_TUNL_DEV0 "fc80::100" +#define IP6_ADDR_TUNL_DEV1 "fc80::200" #define IP6_ADDR_VETH0 "::11" #define IP6_ADDR1_VETH1 "::22" @@ -101,6 +103,9 @@ #define GRE_TUNL_DEV0 "gre00" #define GRE_TUNL_DEV1 "gre11" +#define IP6GRE_TUNL_DEV0 "ip6gre00" +#define IP6GRE_TUNL_DEV1 "ip6gre11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -396,6 +401,43 @@ static void delete_tunnel(const char *dev0, const char *dev1) SYS_NOFAIL("ip link delete dev %s", dev1); } +static int set_ipv6_addr(const char *dev0, const char *dev1) +{ + /* disable IPv6 DAD because it might take too long and fail tests */ + SYS(fail, "ip -n at_ns0 addr add %s/96 dev veth0 nodad", IP6_ADDR_VETH0); + SYS(fail, "ip -n at_ns0 link set dev veth0 up"); + SYS(fail, "ip addr add %s/96 dev veth1 nodad", IP6_ADDR1_VETH1); + SYS(fail, "ip link set dev veth1 up"); + + SYS(fail, "ip -n at_ns0 addr add dev %s %s/24", dev0, IP4_ADDR_TUNL_DEV0); + SYS(fail, "ip -n at_ns0 addr add dev %s %s/96 nodad", dev0, IP6_ADDR_TUNL_DEV0); + SYS(fail, "ip -n at_ns0 link set dev %s up", dev0); + + SYS(fail, "ip addr add dev %s %s/24", dev1, IP4_ADDR_TUNL_DEV1); + SYS(fail, "ip addr add dev %s %s/96 nodad", dev1, IP6_ADDR_TUNL_DEV1); + SYS(fail, "ip link set dev %s up", dev1); + return 0; +fail: + return 1; +} + +static int add_ipv6_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s %s local %s remote %s", + dev0, type, opt, IP6_ADDR_VETH0, IP6_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s external", dev1, type); + + return set_ipv6_addr(dev0, dev1); +fail: + return -1; +} + + static int test_ping(int family, const char *addr) { SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr); @@ -423,6 +465,24 @@ static void ping_dev1(void) close_netns(nstoken); } +static void ping6_veth0(void) +{ + test_ping(AF_INET6, IP6_ADDR_VETH0); +} + +static void ping6_dev1(void) +{ + struct nstoken *nstoken; + + /* ping from at_ns0 namespace test */ + nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + return; + + test_ping(AF_INET, IP6_ADDR_TUNL_DEV1); + close_netns(nstoken); +} + static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) { DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, @@ -770,6 +830,48 @@ static void test_gre_tunnel(enum gre_test test) test_tunnel_kern__destroy(skel); } +enum ip6gre_test { + IP6GRE, + IP6GRETAP +}; + +static void test_ip6gre_tunnel(enum ip6gre_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case IP6GRE: + err = add_ipv6_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1, + "ip6gre", "flowlabel 0xbcdef key 2"); + break; + case IP6GRETAP: + err = add_ipv6_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1, + "ip6gretap", "flowlabel 0xbcdef key 2"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip6gretap_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6gretap_get_tunnel); + if (generic_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + ping6_dev1(); + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -791,6 +893,8 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(gre_tunnel, GRE_NOKEY); RUN_TEST(gre_tunnel, GRETAP); RUN_TEST(gre_tunnel, GRETAP_NOKEY); + RUN_TEST(ip6gre_tunnel, IP6GRE); + RUN_TEST(ip6gre_tunnel, IP6GRETAP); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 48ac9cb2092f6..367af24d2ca52 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,31 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_ip6gretap_tunnel() -{ - - # assign ipv6 address - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \ - local ::11 remote ::22 - - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip addr add dev $DEV fc80::200/24 - ip link set dev $DEV up -} - add_erspan_tunnel() { # at_ns0 namespace @@ -214,65 +189,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_ip6gre() -{ - TYPE=ip6gre - DEV_NS=ip6gre00 - DEV=ip6gre11 - ret=0 - - check $TYPE - config_device - # reuse the ip6gretap function - add_ip6gretap_tunnel - attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # overlay: ipv4 over ipv6 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - ping $PING_ARG 10.1.1.100 - check_err $? - # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6gretap() -{ - TYPE=ip6gretap - DEV_NS=ip6gretap00 - DEV=ip6gretap11 - ret=0 - - check $TYPE - config_device - add_ip6gretap_tunnel - attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # overlay: ipv4 over ipv6 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - ping $PING_ARG 10.1.1.100 - check_err $? - # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - test_erspan() { TYPE=erspan @@ -470,8 +386,6 @@ cleanup() ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null ip link del ip6ip6tnl11 2> /dev/null - ip link del ip6gre11 2> /dev/null - ip link del ip6gretap11 2> /dev/null ip link del geneve11 2> /dev/null ip link del ip6geneve11 2> /dev/null ip link del erspan11 2> /dev/null @@ -497,7 +411,6 @@ check() enable_debug() { - echo 'file ip6_gre.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control } @@ -513,14 +426,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing IP6GRE tunnel..." - test_ip6gre - errors=$(( $errors + $? )) - - echo "Testing IP6GRETAP tunnel..." - test_ip6gretap - errors=$(( $errors + $? )) - echo "Testing ERSPAN tunnel..." test_erspan v2 errors=$(( $errors + $? )) From 0ecd1e9d32372e337e3a31bfa4bef3247f5b28b3 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:53 +0100 Subject: [PATCH 30/80] selftests/bpf: test_tunnel: Move erspan tunnel tests to test_progs erspan tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test erspan tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_erspan() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-5-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 46 ++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 52 ------------------- 2 files changed, 46 insertions(+), 52 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 1aa0fa56a679a..1f39ebdf79c8e 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -106,6 +106,9 @@ #define IP6GRE_TUNL_DEV0 "ip6gre00" #define IP6GRE_TUNL_DEV1 "ip6gre11" +#define ERSPAN_TUNL_DEV0 "erspan00" +#define ERSPAN_TUNL_DEV1 "erspan11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -872,6 +875,47 @@ static void test_ip6gre_tunnel(enum ip6gre_test test) test_tunnel_kern__destroy(skel); } +enum erspan_test { + V1, + V2 +}; + +static void test_erspan_tunnel(enum erspan_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case V1: + err = add_ipv4_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1, + "erspan", "seq key 2 erspan_ver 1 erspan 123"); + break; + case V2: + err = add_ipv4_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1, + "erspan", + "seq key 2 erspan_ver 2 erspan_dir egress erspan_hwid 3"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.erspan_set_tunnel); + get_fd = bpf_program__fd(skel->progs.erspan_get_tunnel); + if (generic_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -895,6 +939,8 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(gre_tunnel, GRETAP_NOKEY); RUN_TEST(ip6gre_tunnel, IP6GRE); RUN_TEST(ip6gre_tunnel, IP6GRETAP); + RUN_TEST(erspan_tunnel, V1); + RUN_TEST(erspan_tunnel, V2); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 367af24d2ca52..e8e7839fb5b5f 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,29 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_erspan_tunnel() -{ - # at_ns0 namespace - if [ "$1" == "v1" ]; then - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local 172.16.1.100 remote 172.16.1.200 \ - erspan_ver 1 erspan 123 - else - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local 172.16.1.100 remote 172.16.1.200 \ - erspan_ver 2 erspan_dir egress erspan_hwid 3 - fi - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - add_ip6erspan_tunnel() { @@ -189,30 +166,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_erspan() -{ - TYPE=erspan - DEV_NS=erspan00 - DEV=erspan11 - ret=0 - - check $TYPE - config_device - add_erspan_tunnel $1 - attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - test_ip6erspan() { TYPE=ip6erspan @@ -388,7 +341,6 @@ cleanup() ip link del ip6ip6tnl11 2> /dev/null ip link del geneve11 2> /dev/null ip link del ip6geneve11 2> /dev/null - ip link del erspan11 2> /dev/null ip link del ip6erspan11 2> /dev/null } @@ -426,10 +378,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing ERSPAN tunnel..." - test_erspan v2 - errors=$(( $errors + $? )) - echo "Testing IP6ERSPAN tunnel..." test_ip6erspan v2 errors=$(( $errors + $? )) From cae41f74b778e204aca10cafc2c0ddb3a94e5f8e Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:54 +0100 Subject: [PATCH 31/80] selftests/bpf: test_tunnel: Move ip6erspan tunnel test to test_progs ip6erspan tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test ip6erspan tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_ip6erspan() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-6-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 41 +++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 58 ------------------- 2 files changed, 41 insertions(+), 58 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 1f39ebdf79c8e..1b1b190452824 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -109,6 +109,9 @@ #define ERSPAN_TUNL_DEV0 "erspan00" #define ERSPAN_TUNL_DEV1 "erspan11" +#define IP6ERSPAN_TUNL_DEV0 "ip6erspan00" +#define IP6ERSPAN_TUNL_DEV1 "ip6erspan11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -916,6 +919,42 @@ static void test_erspan_tunnel(enum erspan_test test) test_tunnel_kern__destroy(skel); } +static void test_ip6erspan_tunnel(enum erspan_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case V1: + err = add_ipv6_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1, + "ip6erspan", "seq key 2 erspan_ver 1 erspan 123"); + break; + case V2: + err = add_ipv6_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1, + "ip6erspan", + "seq key 2 erspan_ver 2 erspan_dir egress erspan_hwid 7"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip4ip6erspan_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip4ip6erspan_get_tunnel); + if (generic_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + ping_dev1(); +done: + delete_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -941,6 +980,8 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(ip6gre_tunnel, IP6GRETAP); RUN_TEST(erspan_tunnel, V1); RUN_TEST(erspan_tunnel, V2); + RUN_TEST(ip6erspan_tunnel, V1); + RUN_TEST(ip6erspan_tunnel, V2); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index e8e7839fb5b5f..2b486df9724dd 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,36 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_ip6erspan_tunnel() -{ - - # assign ipv6 address - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - if [ "$1" == "v1" ]; then - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local ::11 remote ::22 \ - erspan_ver 1 erspan 123 - else - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local ::11 remote ::22 \ - erspan_ver 2 erspan_dir egress erspan_hwid 7 - fi - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip link set dev $DEV up -} - add_geneve_tunnel() { # at_ns0 namespace @@ -166,29 +136,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_ip6erspan() -{ - TYPE=ip6erspan - DEV_NS=ip6erspan00 - DEV=ip6erspan11 - ret=0 - - check $TYPE - config_device - add_ip6erspan_tunnel $1 - attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel - ping6 $PING_ARG ::11 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - test_geneve() { TYPE=geneve @@ -341,7 +288,6 @@ cleanup() ip link del ip6ip6tnl11 2> /dev/null ip link del geneve11 2> /dev/null ip link del ip6geneve11 2> /dev/null - ip link del ip6erspan11 2> /dev/null } cleanup_exit() @@ -378,10 +324,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing IP6ERSPAN tunnel..." - test_ip6erspan v2 - errors=$(( $errors + $? )) - echo "Testing GENEVE tunnel..." test_geneve errors=$(( $errors + $? )) From d89542d2534f6b5f40113c2a63620a23184e2fff Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:55 +0100 Subject: [PATCH 32/80] selftests/bpf: test_tunnel: Move geneve tunnel test to test_progs geneve tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test geneve tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_geneve() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-7-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 45 +++++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 45 ------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 1b1b190452824..2210a1d768362 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -112,6 +112,9 @@ #define IP6ERSPAN_TUNL_DEV0 "ip6erspan00" #define IP6ERSPAN_TUNL_DEV1 "ip6erspan11" +#define GENEVE_TUNL_DEV0 "geneve00" +#define GENEVE_TUNL_DEV1 "geneve11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -443,6 +446,21 @@ static int add_ipv6_tunnel(const char *dev0, const char *dev1, return -1; } +static int add_geneve_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s id 2 %s remote %s", + dev0, type, opt, IP4_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s %s external", dev1, type, opt); + + return set_ipv4_addr(dev0, dev1); +fail: + return -1; +} static int test_ping(int family, const char *addr) { @@ -955,6 +973,32 @@ static void test_ip6erspan_tunnel(enum erspan_test test) test_tunnel_kern__destroy(skel); } +static void test_geneve_tunnel(void) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_geneve_tunnel(GENEVE_TUNL_DEV0, GENEVE_TUNL_DEV1, + "geneve", "dstport 6081"); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.geneve_set_tunnel); + get_fd = bpf_program__fd(skel->progs.geneve_get_tunnel); + if (generic_attach(GENEVE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(GENEVE_TUNL_DEV0, GENEVE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -982,6 +1026,7 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(erspan_tunnel, V2); RUN_TEST(ip6erspan_tunnel, V1); RUN_TEST(ip6erspan_tunnel, V2); + RUN_TEST(geneve_tunnel); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 2b486df9724dd..7f2b1c846a72f 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,21 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_geneve_tunnel() -{ - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - id 2 dstport 6081 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE dstport 6081 external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - add_ip6geneve_tunnel() { ip netns exec at_ns0 ip addr add ::11/96 dev veth0 @@ -136,30 +121,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_geneve() -{ - TYPE=geneve - DEV_NS=geneve00 - DEV=geneve11 - ret=0 - - check $TYPE - config_device - add_geneve_tunnel - attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - test_ip6geneve() { TYPE=geneve @@ -286,7 +247,6 @@ cleanup() ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null ip link del ip6ip6tnl11 2> /dev/null - ip link del geneve11 2> /dev/null ip link del ip6geneve11 2> /dev/null } @@ -309,7 +269,6 @@ check() enable_debug() { - echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control } @@ -324,10 +283,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing GENEVE tunnel..." - test_geneve - errors=$(( $errors + $? )) - echo "Testing IP6GENEVE tunnel..." test_ip6geneve errors=$(( $errors + $? )) From 8d86094305761b5af77529bc5bc8c3c51ce866db Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:56 +0100 Subject: [PATCH 33/80] selftests/bpf: test_tunnel: Move ip6geneve tunnel test to test_progs ip6geneve tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test ip6geneve tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_ip6geneve() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-8-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 48 ++++++++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 49 ------------------- 2 files changed, 48 insertions(+), 49 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index 2210a1d768362..b5d48d4fd423a 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -115,6 +115,9 @@ #define GENEVE_TUNL_DEV0 "geneve00" #define GENEVE_TUNL_DEV1 "geneve11" +#define IP6GENEVE_TUNL_DEV0 "ip6geneve00" +#define IP6GENEVE_TUNL_DEV1 "ip6geneve11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -462,6 +465,22 @@ static int add_geneve_tunnel(const char *dev0, const char *dev1, return -1; } +static int add_ip6geneve_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s id 22 %s remote %s", + dev0, type, opt, IP6_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s %s external", dev1, type, opt); + + return set_ipv6_addr(dev0, dev1); +fail: + return -1; +} + static int test_ping(int family, const char *addr) { SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr); @@ -999,6 +1018,34 @@ static void test_geneve_tunnel(void) delete_tunnel(GENEVE_TUNL_DEV0, GENEVE_TUNL_DEV1); test_tunnel_kern__destroy(skel); } + +static void test_ip6geneve_tunnel(void) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_ip6geneve_tunnel(IP6GENEVE_TUNL_DEV0, IP6GENEVE_TUNL_DEV1, + "geneve", ""); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip6geneve_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6geneve_get_tunnel); + if (generic_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(IP6GENEVE_TUNL_DEV0, IP6GENEVE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -1027,6 +1074,7 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(ip6erspan_tunnel, V1); RUN_TEST(ip6erspan_tunnel, V2); RUN_TEST(geneve_tunnel); + RUN_TEST(ip6geneve_tunnel); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index 7f2b1c846a72f..f46628f70399e 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -64,26 +64,6 @@ config_device() ip addr add dev veth1 172.16.1.200/24 } -add_ip6geneve_tunnel() -{ - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE id 22 \ - remote ::22 # geneve has no local option - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip link set dev $DEV up -} - add_ipip_tunnel() { # at_ns0 namespace @@ -121,30 +101,6 @@ add_ip6tnl_tunnel() ip link set dev $DEV up } -test_ip6geneve() -{ - TYPE=geneve - DEV_NS=ip6geneve00 - DEV=ip6geneve11 - ret=0 - - check $TYPE - config_device - add_ip6geneve_tunnel - attach_bpf $DEV ip6geneve_set_tunnel ip6geneve_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: ip6$TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: ip6$TYPE"${NC} -} - test_ipip() { TYPE=ipip @@ -247,7 +203,6 @@ cleanup() ip link del ipip11 2> /dev/null ip link del ipip6tnl11 2> /dev/null ip link del ip6ip6tnl11 2> /dev/null - ip link del ip6geneve11 2> /dev/null } cleanup_exit() @@ -283,10 +238,6 @@ bpf_tunnel_test() { local errors=0 - echo "Testing IP6GENEVE tunnel..." - test_ip6geneve - errors=$(( $errors + $? )) - echo "Testing IPIP tunnel..." test_ipip errors=$(( $errors + $? )) From 680a75248df79f8c68fe52f77e3b73f31c5f9834 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:57 +0100 Subject: [PATCH 34/80] selftests/bpf: test_tunnel: Move ip6tnl tunnel tests to test_progs ip6tnl tunnels are tested in the test_tunnel.sh but not in the test_progs framework. Add a new test in test_progs to test ip6tnl tunnels. It uses the same network topology and the same BPF programs than the script. Remove test_ipip6() and test_ip6ip6() from the script. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-9-8329f38f0678@bootlin.com --- .../selftests/bpf/prog_tests/test_tunnel.c | 59 +++++++++++++ tools/testing/selftests/bpf/test_tunnel.sh | 88 ------------------- 2 files changed, 59 insertions(+), 88 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index b5d48d4fd423a..bae0e9de277d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -118,6 +118,9 @@ #define IP6GENEVE_TUNL_DEV0 "ip6geneve00" #define IP6GENEVE_TUNL_DEV1 "ip6geneve11" +#define IP6TNL_TUNL_DEV0 "ip6tnl00" +#define IP6TNL_TUNL_DEV1 "ip6tnl11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -513,6 +516,11 @@ static void ping6_veth0(void) test_ping(AF_INET6, IP6_ADDR_VETH0); } +static void ping6_dev0(void) +{ + test_ping(AF_INET6, IP6_ADDR_TUNL_DEV0); +} + static void ping6_dev1(void) { struct nstoken *nstoken; @@ -1046,6 +1054,55 @@ static void test_ip6geneve_tunnel(void) test_tunnel_kern__destroy(skel); } +enum ip6tnl_test { + IPIP6, + IP6IP6 +}; + +static void test_ip6tnl_tunnel(enum ip6tnl_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_ipv6_tunnel(IP6TNL_TUNL_DEV0, IP6TNL_TUNL_DEV1, "ip6tnl", ""); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + switch (test) { + case IPIP6: + set_fd = bpf_program__fd(skel->progs.ipip6_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ipip6_get_tunnel); + break; + case IP6IP6: + set_fd = bpf_program__fd(skel->progs.ip6ip6_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6ip6_get_tunnel); + break; + } + if (generic_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + switch (test) { + case IPIP6: + ping_dev0(); + ping_dev1(); + break; + case IP6IP6: + ping6_dev0(); + ping6_dev1(); + break; + } + +done: + delete_tunnel(IP6TNL_TUNL_DEV0, IP6TNL_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -1075,6 +1132,8 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(ip6erspan_tunnel, V2); RUN_TEST(geneve_tunnel); RUN_TEST(ip6geneve_tunnel); + RUN_TEST(ip6tnl_tunnel, IPIP6); + RUN_TEST(ip6tnl_tunnel, IP6IP6); return NULL; } diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh index f46628f70399e..165023d1b5f70 100755 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ b/tools/testing/selftests/bpf/test_tunnel.sh @@ -79,28 +79,6 @@ add_ipip_tunnel() ip addr add dev $DEV 10.1.1.200/24 } -add_ip6tnl_tunnel() -{ - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - local ::11 remote ::22 - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip addr add dev $DEV_NS 1::11/96 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip addr add dev $DEV 1::22/96 - ip link set dev $DEV up -} - test_ipip() { TYPE=ipip @@ -126,62 +104,6 @@ test_ipip() echo -e ${GREEN}"PASS: $TYPE"${NC} } -test_ipip6() -{ - TYPE=ip6tnl - DEV_NS=ipip6tnl00 - DEV=ipip6tnl11 - ret=0 - - check $TYPE - config_device - add_ip6tnl_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # ip4 over ip6 - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6ip6() -{ - TYPE=ip6tnl - DEV_NS=ip6ip6tnl00 - DEV=ip6ip6tnl11 - ret=0 - - check $TYPE - config_device - add_ip6tnl_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # ip6 over ip6 - ping6 $PING_ARG 1::11 - check_err $? - ip netns exec at_ns0 ping6 $PING_ARG 1::22 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: ip6$TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: ip6$TYPE"${NC} -} - attach_bpf() { DEV=$1 @@ -201,8 +123,6 @@ cleanup() ip netns delete at_ns0 2> /dev/null ip link del veth1 2> /dev/null ip link del ipip11 2> /dev/null - ip link del ipip6tnl11 2> /dev/null - ip link del ip6ip6tnl11 2> /dev/null } cleanup_exit() @@ -242,14 +162,6 @@ bpf_tunnel_test() test_ipip errors=$(( $errors + $? )) - echo "Testing IPIP6 tunnel..." - test_ipip6 - errors=$(( $errors + $? )) - - echo "Testing IP6IP6 tunnel..." - test_ip6ip6 - errors=$(( $errors + $? )) - return $errors } From c8d6d78cea6e28841b68676d42024c204251edbd Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Mon, 3 Mar 2025 09:22:58 +0100 Subject: [PATCH 35/80] selftests/bpf: test_tunnel: Remove test_tunnel.sh All tests from test_tunnel.sh have been migrated into test test_progs. The last test remaining in the script is the test_ipip() that is already covered in the test_prog framework by the NONE case of test_ipip_tunnel(). Remove the test_tunnel.sh script and its Makefile entry Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250303-tunnels-v2-10-8329f38f0678@bootlin.com --- tools/testing/selftests/bpf/Makefile | 1 - tools/testing/selftests/bpf/test_tunnel.sh | 179 --------------------- 2 files changed, 180 deletions(-) delete mode 100755 tools/testing/selftests/bpf/test_tunnel.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index abfb450c26bb3..e6a02d5b87d12 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -100,7 +100,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ - test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ test_xdp_vlan_mode_generic.sh \ diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh deleted file mode 100755 index 165023d1b5f70..0000000000000 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ /dev/null @@ -1,179 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# End-to-end eBPF tunnel test suite -# The script tests BPF network tunnel implementation. -# -# Topology: -# --------- -# root namespace | at_ns0 namespace -# | -# ----------- | ----------- -# | tnl dev | | | tnl dev | (overlay network) -# ----------- | ----------- -# metadata-mode | native-mode -# with bpf | -# | -# ---------- | ---------- -# | veth1 | --------- | veth0 | (underlay network) -# ---------- peer ---------- -# -# -# Device Configuration -# -------------------- -# Root namespace with metadata-mode tunnel + BPF -# Device names and addresses: -# veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay) -# tunnel dev 11, ex: gre11, IPv4: 10.1.1.200, IPv6: 1::22 (overlay) -# -# Namespace at_ns0 with native tunnel -# Device names and addresses: -# veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay) -# tunnel dev 00, ex: gre00, IPv4: 10.1.1.100, IPv6: 1::11 (overlay) -# -# -# End-to-end ping packet flow -# --------------------------- -# Most of the tests start by namespace creation, device configuration, -# then ping the underlay and overlay network. When doing 'ping 10.1.1.100' -# from root namespace, the following operations happen: -# 1) Route lookup shows 10.1.1.100/24 belongs to tnl dev, fwd to tnl dev. -# 2) Tnl device's egress BPF program is triggered and set the tunnel metadata, -# with remote_ip=172.16.1.100 and others. -# 3) Outer tunnel header is prepended and route the packet to veth1's egress -# 4) veth0's ingress queue receive the tunneled packet at namespace at_ns0 -# 5) Tunnel protocol handler, ex: vxlan_rcv, decap the packet -# 6) Forward the packet to the overlay tnl dev - -BPF_FILE="test_tunnel_kern.bpf.o" -BPF_PIN_TUNNEL_DIR="/sys/fs/bpf/tc/tunnel" -PING_ARG="-c 3 -w 10 -q" -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -config_device() -{ - ip netns add at_ns0 - ip link add veth0 type veth peer name veth1 - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip link set dev veth1 up mtu 1500 - ip addr add dev veth1 172.16.1.200/24 -} - -add_ipip_tunnel() -{ - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -test_ipip() -{ - TYPE=ipip - DEV_NS=ipip00 - DEV=ipip11 - ret=0 - - check $TYPE - config_device - add_ipip_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -attach_bpf() -{ - DEV=$1 - SET=$2 - GET=$3 - mkdir -p ${BPF_PIN_TUNNEL_DIR} - bpftool prog loadall ${BPF_FILE} ${BPF_PIN_TUNNEL_DIR}/ - tc qdisc add dev $DEV clsact - tc filter add dev $DEV egress bpf da object-pinned ${BPF_PIN_TUNNEL_DIR}/$SET - tc filter add dev $DEV ingress bpf da object-pinned ${BPF_PIN_TUNNEL_DIR}/$GET -} - -cleanup() -{ - rm -rf ${BPF_PIN_TUNNEL_DIR} - - ip netns delete at_ns0 2> /dev/null - ip link del veth1 2> /dev/null - ip link del ipip11 2> /dev/null -} - -cleanup_exit() -{ - echo "CATCH SIGKILL or SIGINT, cleanup and exit" - cleanup - exit 0 -} - -check() -{ - ip link help 2>&1 | grep -q "\s$1\s" - if [ $? -ne 0 ];then - echo "SKIP $1: iproute2 not support" - cleanup - return 1 - fi -} - -enable_debug() -{ - echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control -} - -check_err() -{ - if [ $ret -eq 0 ]; then - ret=$1 - fi -} - -bpf_tunnel_test() -{ - local errors=0 - - echo "Testing IPIP tunnel..." - test_ipip - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup 0 3 6 -trap cleanup_exit 2 9 - -cleanup -bpf_tunnel_test - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 From 122f1fd14f44d8d6f0113b9db8acbdd9636d45b4 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 28 Feb 2025 10:43:34 -0800 Subject: [PATCH 36/80] net: filter: Avoid shadowing variable in bpf_convert_ctx_access() Rename the local variable 'off' to 'offset' to avoid shadowing the existing 'off' variable that is declared as an `int` in the outer scope of bpf_convert_ctx_access(). This fixes a compiler warning: net/core/filter.c:9679:8: warning: declaration shadows a local variable [-Wshadow] Signed-off-by: Breno Leitao Signed-off-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://patch.msgid.link/20250228-fix_filter-v1-1-ce13eae66fe9@debian.org --- net/core/filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 827108c6dad9f..dcc53ac5c5458 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9637,7 +9637,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ @@ -9646,7 +9646,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, From 7218ff1f322d6bec73aea3e20828f58018a1c690 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 3 Mar 2025 13:57:49 +0000 Subject: [PATCH 37/80] libbpf: Use map_is_created helper in map setters Refactoring: use map_is_created helper in map setters that need to check the state of the map. This helps to reduce the number of the places that depend explicitly on the loaded flag, simplifying refactoring in the next patch of this set. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250303135752.158343-2-mykyta.yatsenko5@gmail.com --- tools/lib/bpf/libbpf.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 899e98225f3b6..4895c7ae64226 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -4845,6 +4845,11 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) return 0; } +static bool map_is_created(const struct bpf_map *map) +{ + return map->obj->loaded || map->reused; +} + bool bpf_map__autocreate(const struct bpf_map *map) { return map->autocreate; @@ -4852,7 +4857,7 @@ bool bpf_map__autocreate(const struct bpf_map *map) int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) { - if (map->obj->loaded) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->autocreate = autocreate; @@ -4946,7 +4951,7 @@ struct bpf_map *bpf_map__inner_map(struct bpf_map *map) int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { - if (map->obj->loaded) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.max_entries = max_entries; @@ -5191,11 +5196,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); -static bool map_is_created(const struct bpf_map *map) -{ - return map->obj->loaded || map->reused; -} - static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { LIBBPF_OPTS(bpf_map_create_opts, create_attr); @@ -10299,7 +10299,7 @@ static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) int bpf_map__set_value_size(struct bpf_map *map, __u32 size) { - if (map->obj->loaded || map->reused) + if (map_is_created(map)) return libbpf_err(-EBUSY); if (map->mmaped) { @@ -10345,7 +10345,7 @@ int bpf_map__set_initial_value(struct bpf_map *map, { size_t actual_sz; - if (map->obj->loaded || map->reused) + if (map_is_created(map)) return libbpf_err(-EBUSY); if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) From 8ca8f6d1a2b48de10373852697be1c25b712768f Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 3 Mar 2025 13:57:50 +0000 Subject: [PATCH 38/80] libbpf: Introduce more granular state for bpf_object We are going to split bpf_object loading into 2 stages: preparation and loading. This will increase flexibility when working with bpf_object and unlock some optimizations and use cases. This patch substitutes a boolean flag (loaded) by more finely-grained state for bpf_object. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250303135752.158343-3-mykyta.yatsenko5@gmail.com --- tools/lib/bpf/libbpf.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 4895c7ae64226..7210278ecdcfe 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -670,11 +670,18 @@ struct elf_state { struct usdt_manager; +enum bpf_object_state { + OBJ_OPEN, + OBJ_PREPARED, + OBJ_LOADED, +}; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; __u32 kern_version; + enum bpf_object_state state; struct bpf_program *programs; size_t nr_programs; struct bpf_map *maps; @@ -686,7 +693,6 @@ struct bpf_object { int nr_extern; int kconfig_map_idx; - bool loaded; bool has_subcalls; bool has_rodata; @@ -1511,7 +1517,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); - obj->loaded = false; + obj->state = OBJ_OPEN; return obj; } @@ -4847,7 +4853,7 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) static bool map_is_created(const struct bpf_map *map) { - return map->obj->loaded || map->reused; + return map->obj->state >= OBJ_PREPARED || map->reused; } bool bpf_map__autocreate(const struct bpf_map *map) @@ -8550,7 +8556,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch if (!obj) return libbpf_err(-EINVAL); - if (obj->loaded) { + if (obj->state >= OBJ_LOADED) { pr_warn("object '%s': load can't be attempted twice\n", obj->name); return libbpf_err(-EINVAL); } @@ -8602,8 +8608,7 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch btf__free(obj->btf_vmlinux); obj->btf_vmlinux = NULL; - obj->loaded = true; /* doesn't matter if successfully or not */ - + obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */ if (err) goto out; @@ -8866,7 +8871,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (!obj) return libbpf_err(-ENOENT); - if (!obj->loaded) { + if (obj->state < OBJ_PREPARED) { pr_warn("object not yet loaded; load it first\n"); return libbpf_err(-ENOENT); } @@ -8945,7 +8950,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) if (!obj) return libbpf_err(-ENOENT); - if (!obj->loaded) { + if (obj->state < OBJ_LOADED) { pr_warn("object not yet loaded; load it first\n"); return libbpf_err(-ENOENT); } @@ -9132,7 +9137,7 @@ int bpf_object__btf_fd(const struct bpf_object *obj) int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) { - if (obj->loaded) + if (obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); obj->kern_version = kern_version; @@ -9229,7 +9234,7 @@ bool bpf_program__autoload(const struct bpf_program *prog) int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); prog->autoload = autoload; @@ -9261,7 +9266,7 @@ int bpf_program__set_insns(struct bpf_program *prog, { struct bpf_insn *insns; - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); @@ -9304,7 +9309,7 @@ static int last_custom_sec_def_handler_id; int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); /* if type is not changed, do nothing */ @@ -9335,7 +9340,7 @@ enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program int bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->expected_attach_type = type; @@ -9349,7 +9354,7 @@ __u32 bpf_program__flags(const struct bpf_program *prog) int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->prog_flags = flags; @@ -9363,7 +9368,7 @@ __u32 bpf_program__log_level(const struct bpf_program *prog) int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->log_level = log_level; @@ -9382,7 +9387,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log return libbpf_err(-EINVAL); if (prog->log_size > UINT_MAX) return libbpf_err(-EINVAL); - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->log_buf = log_buf; @@ -13666,7 +13671,7 @@ int bpf_program__set_attach_target(struct bpf_program *prog, if (!prog || attach_prog_fd < 0) return libbpf_err(-EINVAL); - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); if (attach_prog_fd && !attach_func_name) { From da755540c6f8fdc3ba24091cea58f3fa9754ee7e Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 3 Mar 2025 13:57:51 +0000 Subject: [PATCH 39/80] libbpf: Split bpf object load into prepare/load Introduce bpf_object__prepare API: additional intermediate preparation step that performs ELF processing, relocations, prepares final state of BPF program instructions (accessible with bpf_program__insns()), creates and (potentially) pins maps, and stops short of loading BPF programs. We anticipate few use cases for this API, such as: * Use prepare to initialize bpf_token, without loading freplace programs, unlocking possibility to lookup BTF of other programs. * Execute prepare to obtain finalized BPF program instructions without loading programs, enabling tools like veristat to process one program at a time, without incurring cost of ELF parsing and processing. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250303135752.158343-4-mykyta.yatsenko5@gmail.com --- tools/lib/bpf/libbpf.c | 146 +++++++++++++++++++++++++++------------ tools/lib/bpf/libbpf.h | 13 ++++ tools/lib/bpf/libbpf.map | 1 + 3 files changed, 117 insertions(+), 43 deletions(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 7210278ecdcfe..8e32286854ef3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7901,13 +7901,6 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) size_t i; int err; - for (i = 0; i < obj->nr_programs; i++) { - prog = &obj->programs[i]; - err = bpf_object__sanitize_prog(obj, prog); - if (err) - return err; - } - for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) @@ -7933,6 +7926,21 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) return 0; } +static int bpf_object_prepare_progs(struct bpf_object *obj) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + err = bpf_object__sanitize_prog(obj, prog); + if (err) + return err; + } + return 0; +} + static const struct bpf_sec_def *find_sec_def(const char *sec_name); static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts) @@ -8549,9 +8557,72 @@ static int bpf_object_prepare_struct_ops(struct bpf_object *obj) return 0; } +static void bpf_object_unpin(struct bpf_object *obj) +{ + int i; + + /* unpin any maps that were auto-pinned during load */ + for (i = 0; i < obj->nr_maps; i++) + if (obj->maps[i].pinned && !obj->maps[i].reused) + bpf_map__unpin(&obj->maps[i], NULL); +} + +static void bpf_object_post_load_cleanup(struct bpf_object *obj) +{ + int i; + + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + obj->btf_module_cnt = 0; + zfree(&obj->btf_modules); + + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; +} + +static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path) +{ + int err; + + if (obj->state >= OBJ_PREPARED) { + pr_warn("object '%s': prepare loading can't be attempted twice\n", obj->name); + return -EINVAL; + } + + err = bpf_object_prepare_token(obj); + err = err ? : bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); + err = err ? : bpf_object_adjust_struct_ops_autoload(obj); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object_prepare_progs(obj); + + if (err) { + bpf_object_unpin(obj); + bpf_object_unload(obj); + obj->state = OBJ_LOADED; + return err; + } + + obj->state = OBJ_PREPARED; + return 0; +} + static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) { - int err, i; + int err; if (!obj) return libbpf_err(-EINVAL); @@ -8571,17 +8642,12 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch return libbpf_err(-LIBBPF_ERRNO__ENDIAN); } - err = bpf_object_prepare_token(obj); - err = err ? : bpf_object__probe_loading(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj, false); - err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); - err = err ? : bpf_object__sanitize_maps(obj); - err = err ? : bpf_object__init_kern_struct_ops_maps(obj); - err = err ? : bpf_object_adjust_struct_ops_autoload(obj); - err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); - err = err ? : bpf_object__sanitize_and_load_btf(obj); - err = err ? : bpf_object__create_maps(obj); - err = err ? : bpf_object__load_progs(obj, extra_log_level); + if (obj->state < OBJ_PREPARED) { + err = bpf_object_prepare(obj, target_btf_path); + if (err) + return libbpf_err(err); + } + err = bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); err = err ? : bpf_object_prepare_struct_ops(obj); @@ -8593,35 +8659,22 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } - /* clean up fd_array */ - zfree(&obj->fd_array); + bpf_object_post_load_cleanup(obj); + obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */ - /* clean up module BTFs */ - for (i = 0; i < obj->btf_module_cnt; i++) { - close(obj->btf_modules[i].fd); - btf__free(obj->btf_modules[i].btf); - free(obj->btf_modules[i].name); + if (err) { + bpf_object_unpin(obj); + bpf_object_unload(obj); + pr_warn("failed to load object '%s'\n", obj->path); + return libbpf_err(err); } - free(obj->btf_modules); - - /* clean up vmlinux BTF */ - btf__free(obj->btf_vmlinux); - obj->btf_vmlinux = NULL; - - obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */ - if (err) - goto out; return 0; -out: - /* unpin any maps that were auto-pinned during load */ - for (i = 0; i < obj->nr_maps; i++) - if (obj->maps[i].pinned && !obj->maps[i].reused) - bpf_map__unpin(&obj->maps[i], NULL); +} - bpf_object_unload(obj); - pr_warn("failed to load object '%s'\n", obj->path); - return libbpf_err(err); +int bpf_object__prepare(struct bpf_object *obj) +{ + return libbpf_err(bpf_object_prepare(obj, NULL)); } int bpf_object__load(struct bpf_object *obj) @@ -9069,6 +9122,13 @@ void bpf_object__close(struct bpf_object *obj) if (IS_ERR_OR_NULL(obj)) return; + /* + * if user called bpf_object__prepare() without ever getting to + * bpf_object__load(), we need to clean up stuff that is normally + * cleaned up at the end of loading step + */ + bpf_object_post_load_cleanup(obj); + usdt_manager_free(obj->usdt_man); obj->usdt_man = NULL; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3020ee45303a0..e0605403f9773 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -241,6 +241,19 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); +/** + * @brief **bpf_object__prepare()** prepares BPF object for loading: + * performs ELF processing, relocations, prepares final state of BPF program + * instructions (accessible with bpf_program__insns()), creates and + * (potentially) pins maps. Leaves BPF object in the state ready for program + * loading. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** API + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ +int bpf_object__prepare(struct bpf_object *obj); + /** * @brief **bpf_object__load()** loads BPF object into kernel. * @param obj Pointer to a valid BPF object instance returned by diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b5a838de6f47c..d8b71f22f197c 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -436,6 +436,7 @@ LIBBPF_1.6.0 { bpf_linker__add_buf; bpf_linker__add_fd; bpf_linker__new_fd; + bpf_object__prepare; btf__add_decl_attr; btf__add_type_attr; } LIBBPF_1.5.0; From 68b61a823aaba65a931a8182ac0f30b82dabb7a5 Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 3 Mar 2025 13:57:52 +0000 Subject: [PATCH 40/80] selftests/bpf: Add tests for bpf_object__prepare Add selftests, checking that running bpf_object__prepare successfully creates maps before load step. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250303135752.158343-5-mykyta.yatsenko5@gmail.com --- .../selftests/bpf/prog_tests/prepare.c | 99 +++++++++++++++++++ tools/testing/selftests/bpf/progs/prepare.c | 28 ++++++ 2 files changed, 127 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/prepare.c create mode 100644 tools/testing/selftests/bpf/progs/prepare.c diff --git a/tools/testing/selftests/bpf/prog_tests/prepare.c b/tools/testing/selftests/bpf/prog_tests/prepare.c new file mode 100644 index 0000000000000..fb5cdad971168 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/prepare.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta */ + +#include +#include +#include "prepare.skel.h" + +static bool check_prepared(struct bpf_object *obj) +{ + bool is_prepared = true; + const struct bpf_map *map; + + bpf_object__for_each_map(map, obj) { + if (bpf_map__fd(map) < 0) + is_prepared = false; + } + + return is_prepared; +} + +static void test_prepare_no_load(void) +{ + struct prepare *skel; + int err; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + ); + + skel = prepare__open(); + if (!ASSERT_OK_PTR(skel, "prepare__open")) + return; + + if (!ASSERT_FALSE(check_prepared(skel->obj), "not check_prepared")) + goto cleanup; + + err = bpf_object__prepare(skel->obj); + + if (!ASSERT_TRUE(check_prepared(skel->obj), "check_prepared")) + goto cleanup; + + if (!ASSERT_OK(err, "bpf_object__prepare")) + goto cleanup; + +cleanup: + prepare__destroy(skel); +} + +static void test_prepare_load(void) +{ + struct prepare *skel; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + ); + + skel = prepare__open(); + if (!ASSERT_OK_PTR(skel, "prepare__open")) + return; + + if (!ASSERT_FALSE(check_prepared(skel->obj), "not check_prepared")) + goto cleanup; + + err = bpf_object__prepare(skel->obj); + if (!ASSERT_OK(err, "bpf_object__prepare")) + goto cleanup; + + err = prepare__load(skel); + if (!ASSERT_OK(err, "prepare__load")) + goto cleanup; + + if (!ASSERT_TRUE(check_prepared(skel->obj), "check_prepared")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.program); + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) + goto cleanup; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + goto cleanup; + + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + goto cleanup; + + ASSERT_EQ(skel->bss->err, 0, "err"); + +cleanup: + prepare__destroy(skel); +} + +void test_prepare(void) +{ + if (test__start_subtest("prepare_load")) + test_prepare_load(); + if (test__start_subtest("prepare_no_load")) + test_prepare_no_load(); +} diff --git a/tools/testing/selftests/bpf/progs/prepare.c b/tools/testing/selftests/bpf/progs/prepare.c new file mode 100644 index 0000000000000..1f1dd547e4ee2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/prepare.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta */ +#include +#include +//#include + +char _license[] SEC("license") = "GPL"; + +int err; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} ringbuf SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} array_map SEC(".maps"); + +SEC("cgroup_skb/egress") +int program(struct __sk_buff *skb) +{ + err = 0; + return 0; +} From 13a664f46e345b7de54bce4c1adf0ae97356a414 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 3 Mar 2025 16:32:38 -0800 Subject: [PATCH 41/80] bpf: Add verifier support for timed may_goto Implement support in the verifier for replacing may_goto implementation from a counter-based approach to one which samples time on the local CPU to have a bigger loop bound. We implement it by maintaining 16-bytes per-stack frame, and using 8 bytes for maintaining the count for amortizing time sampling, and 8 bytes for the starting timestamp. To minimize overhead, we need to avoid spilling and filling of registers around this sequence, so we push this cost into the time sampling function 'arch_bpf_timed_may_goto'. This is a JIT-specific wrapper around bpf_check_timed_may_goto which returns us the count to store into the stack through BPF_REG_AX. All caller-saved registers (r0-r5) are guaranteed to remain untouched. The loop can be broken by returning count as 0, otherwise we dispatch into the function when the count drops to 0, and the runtime chooses to refresh it (by returning count as BPF_MAX_TIMED_LOOPS) or returning 0 and aborting the loop on next iteration. Since the check for 0 is done right after loading the count from the stack, all subsequent cond_break sequences should immediately break as well, of the same loop or subsequent loops in the program. We pass in the stack_depth of the count (and thus the timestamp, by adding 8 to it) to the arch_bpf_timed_may_goto call so that it can be passed in to bpf_check_timed_may_goto as an argument after r1 is saved, by adding the offset to r10/fp. This adjustment will be arch specific, and the next patch will introduce support for x86. Note that depending on loop complexity, time spent in the loop can be more than the current limit (250 ms), but imposing an upper bound on program runtime is an orthogonal problem which will be addressed when program cancellations are supported. The current time afforded by cond_break may not be enough for cases where BPF programs want to implement locking algorithms inline, and use cond_break as a promise to the verifier that they will eventually terminate. Below are some benchmarking numbers on the time taken per-iteration for an empty loop that counts the number of iterations until cond_break fires. For comparison, we compare it against bpf_for/bpf_repeat which is another way to achieve the same number of spins (BPF_MAX_LOOPS). The hardware used for benchmarking was a Sapphire Rapids Intel server with performance governor enabled, mitigations were enabled. +-----------------------------+--------------+--------------+------------------+ | Loop type | Iterations | Time (ms) | Time/iter (ns) | +-----------------------------|--------------+--------------+------------------+ | may_goto | 8388608 | 3 | 0.36 | | timed_may_goto (count=65535)| 589674932 | 250 | 0.42 | | bpf_for | 8388608 | 10 | 1.19 | +-----------------------------+--------------+--------------+------------------+ This gives a good approximation at low overhead while staying close to the current implementation. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250304003239.2390751-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/filter.h | 8 +++++ kernel/bpf/core.c | 26 ++++++++++++++++ kernel/bpf/verifier.c | 69 +++++++++++++++++++++++++++++++++++++----- 4 files changed, 96 insertions(+), 8 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4c4028d865ee4..dae3872c301d9 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1987,6 +1987,7 @@ struct bpf_array { */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, + BPF_MAX_TIMED_LOOPS = 0xffff, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ diff --git a/include/linux/filter.h b/include/linux/filter.h index 3ed6eb9e7c733..02dda5c53d914 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -669,6 +669,11 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +struct bpf_timed_may_goto { + u64 count; + u64 timestamp; +}; + struct sk_filter { refcount_t refcnt; struct rcu_head rcu; @@ -1130,8 +1135,11 @@ bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); +bool bpf_jit_supports_timed_may_goto(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); +u64 arch_bpf_timed_may_goto(void); +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *); bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a0200fbbace99..e583c19a0291f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -3069,6 +3069,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. */ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 22c4edc8695c0..4ec1d1aa25eaf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21572,7 +21572,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. + */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -22113,23 +22156,33 @@ static int do_misc_fixups(struct bpf_verifier_env *env) env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { + /* We need two slots in case timed may_goto is supported. */ + if (stack_slots > slots) { verbose(env, "verifier bug: stack_slots supports may_goto only\n"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -22139,7 +22192,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. */ From 2cb0a521527461c5519b2115b42709062a6a63f1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 3 Mar 2025 16:32:39 -0800 Subject: [PATCH 42/80] bpf, x86: Add x86 JIT support for timed may_goto Implement the arch_bpf_timed_may_goto function using inline assembly to have control over which registers are spilled, and use our special protocol of using BPF_REG_AX as an argument into the function, and as the return value when going back. Emit call depth accounting for the call made from this stub, and ensure we don't have naked returns (when rethunk mitigations are enabled) by falling back to the RET macro (instead of retq). After popping all saved registers, the return address into the BPF program should be on top of the stack. Since the JIT support is now enabled, ensure selftests which are checking the produced may_goto sequences do not break by adjusting them. Make sure we still test the old may_goto sequence on other architectures, while testing the new sequence on x86_64. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250304003239.2390751-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/Makefile | 2 +- arch/x86/net/bpf_jit_comp.c | 5 ++ arch/x86/net/bpf_timed_may_goto.S | 44 ++++++++++++++ .../bpf/progs/verifier_bpf_fastcall.c | 58 +++++++++++++++---- .../selftests/bpf/progs/verifier_may_goto_1.c | 34 ++++++++++- 5 files changed, 130 insertions(+), 13 deletions(-) create mode 100644 arch/x86/net/bpf_timed_may_goto.S diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index 383c87300b0d3..dddbefc0f4398 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -6,5 +6,5 @@ ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o else - obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d2..f3e9ef6b5329c 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3791,3 +3791,8 @@ u64 bpf_arch_uaddress_limit(void) { return 0; } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/x86/net/bpf_timed_may_goto.S b/arch/x86/net/bpf_timed_may_goto.S new file mode 100644 index 0000000000000..20de4a14dc4cf --- /dev/null +++ b/arch/x86/net/bpf_timed_may_goto.S @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + + .code64 + .section .text, "ax" + +SYM_FUNC_START(arch_bpf_timed_may_goto) + ANNOTATE_NOENDBR + + /* Save r0-r5. */ + pushq %rax + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %r8 + + /* + * r10 passes us stack depth, load the pointer to count and timestamp as + * first argument to the call below. + */ + leaq (%rbp, %r10, 1), %rdi + + /* Emit call depth accounting for call below. */ + CALL_DEPTH_ACCOUNT + call bpf_check_timed_may_goto + + /* BPF_REG_AX=r10 will be stored into count, so move return value to it. */ + movq %rax, %r10 + + /* Restore r5-r0. */ + popq %r8 + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rax + + RET +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index 5094c288cfd7b..a9be6ae494545 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -620,23 +620,61 @@ __naked void helper_call_does_not_prevent_bpf_fastcall(void) SEC("raw_tp") __arch_x86_64 +__log_level(4) __msg("stack depth 24") +/* may_goto counter at -24 */ +__xlated("0: *(u64 *)(r10 -24) =") +/* may_goto timestamp at -16 */ +__xlated("1: *(u64 *)(r10 -16) =") +__xlated("2: r1 = 1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +/* may_goto expansion starts */ +__xlated("6: r11 = *(u64 *)(r10 -24)") +__xlated("7: if r11 == 0x0 goto pc+6") +__xlated("8: r11 -= 1") +__xlated("9: if r11 != 0x0 goto pc+2") +__xlated("10: r11 = -24") +__xlated("11: call unknown") +__xlated("12: *(u64 *)(r10 -24) = r11") +/* may_goto expansion ends */ +__xlated("13: *(u64 *)(r10 -8) = r1") +__xlated("14: exit") +__success +__naked void may_goto_interaction_x86_64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +SEC("raw_tp") +__arch_arm64 __log_level(4) __msg("stack depth 16") /* may_goto counter at -16 */ __xlated("0: *(u64 *)(r10 -16) =") __xlated("1: r1 = 1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") +__xlated("2: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("5: r11 = *(u64 *)(r10 -16)") -__xlated("6: if r11 == 0x0 goto pc+3") -__xlated("7: r11 -= 1") -__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("3: r11 = *(u64 *)(r10 -16)") +__xlated("4: if r11 == 0x0 goto pc+3") +__xlated("5: r11 -= 1") +__xlated("6: *(u64 *)(r10 -16) = r11") /* may_goto expansion ends */ -__xlated("9: *(u64 *)(r10 -8) = r1") -__xlated("10: exit") +__xlated("7: *(u64 *)(r10 -8) = r1") +__xlated("8: exit") __success -__naked void may_goto_interaction(void) +__naked void may_goto_interaction_arm64(void) { asm volatile ( "r1 = 1;" diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index e81097c96fe27..3966d827f2889 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -69,8 +69,38 @@ __naked void may_goto_batch_1(void) } SEC("raw_tp") -__description("may_goto batch with offsets 2/0") +__description("may_goto batch with offsets 2/0 - x86_64") __arch_x86_64 +__xlated("0: *(u64 *)(r10 -16) = 65535") +__xlated("1: *(u64 *)(r10 -8) = 0") +__xlated("2: r11 = *(u64 *)(r10 -16)") +__xlated("3: if r11 == 0x0 goto pc+6") +__xlated("4: r11 -= 1") +__xlated("5: if r11 != 0x0 goto pc+2") +__xlated("6: r11 = -16") +__xlated("7: call unknown") +__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("9: r0 = 1") +__xlated("10: r0 = 2") +__xlated("11: exit") +__success +__naked void may_goto_batch_2_x86_64(void) +{ + asm volatile ( + ".8byte %[may_goto1];" + ".8byte %[may_goto3];" + "r0 = 1;" + "r0 = 2;" + "exit;" + : + : __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)), + __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0)) + : __clobber_all); +} + +SEC("raw_tp") +__description("may_goto batch with offsets 2/0 - arm64") +__arch_arm64 __xlated("0: *(u64 *)(r10 -8) = 8388608") __xlated("1: r11 = *(u64 *)(r10 -8)") __xlated("2: if r11 == 0x0 goto pc+3") @@ -80,7 +110,7 @@ __xlated("5: r0 = 1") __xlated("6: r0 = 2") __xlated("7: exit") __success -__naked void may_goto_batch_2(void) +__naked void may_goto_batch_2_arm64(void) { asm volatile ( ".8byte %[may_goto1];" From e24bbad29a8de70bb33c1cabc85bb40e6707572a Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:13 +0000 Subject: [PATCH 43/80] bpf: Introduce load-acquire and store-release instructions Introduce BPF instructions with load-acquire and store-release semantics, as discussed in [1]. Define 2 new flags: #define BPF_LOAD_ACQ 0x100 #define BPF_STORE_REL 0x110 A "load-acquire" is a BPF_STX | BPF_ATOMIC instruction with the 'imm' field set to BPF_LOAD_ACQ (0x100). Similarly, a "store-release" is a BPF_STX | BPF_ATOMIC instruction with the 'imm' field set to BPF_STORE_REL (0x110). Unlike existing atomic read-modify-write operations that only support BPF_W (32-bit) and BPF_DW (64-bit) size modifiers, load-acquires and store-releases also support BPF_B (8-bit) and BPF_H (16-bit). As an exception, however, 64-bit load-acquires/store-releases are not supported on 32-bit architectures (to fix a build error reported by the kernel test robot). An 8- or 16-bit load-acquire zero-extends the value before writing it to a 32-bit register, just like ARM64 instruction LDARH and friends. Similar to existing atomic read-modify-write operations, misaligned load-acquires/store-releases are not allowed (even if BPF_F_ANY_ALIGNMENT is set). As an example, consider the following 64-bit load-acquire BPF instruction (assuming little-endian): db 10 00 00 00 01 00 00 r0 = load_acquire((u64 *)(r1 + 0x0)) opcode (0xdb): BPF_ATOMIC | BPF_DW | BPF_STX imm (0x00000100): BPF_LOAD_ACQ Similarly, a 16-bit BPF store-release: cb 21 00 00 10 01 00 00 store_release((u16 *)(r1 + 0x0), w2) opcode (0xcb): BPF_ATOMIC | BPF_H | BPF_STX imm (0x00000110): BPF_STORE_REL In arch/{arm64,s390,x86}/net/bpf_jit_comp.c, have bpf_jit_supports_insn(..., /*in_arena=*/true) return false for the new instructions, until the corresponding JIT compiler supports them in arena. [1] https://lore.kernel.org/all/20240729183246.4110549-1-yepeilin@google.com/ Acked-by: Eduard Zingerman Acked-by: Ilya Leoshkevich Cc: kernel test robot Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/a217f46f0e445fbd573a1a024be5c6bf1d5fe716.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 4 ++ arch/s390/net/bpf_jit_comp.c | 14 +++++-- arch/x86/net/bpf_jit_comp.c | 4 ++ include/linux/bpf.h | 15 ++++++++ include/linux/filter.h | 2 + include/uapi/linux/bpf.h | 3 ++ kernel/bpf/core.c | 67 +++++++++++++++++++++++++++++++--- kernel/bpf/disasm.c | 12 ++++++ kernel/bpf/verifier.c | 55 ++++++++++++++++++++++++++-- tools/include/uapi/linux/bpf.h | 3 ++ 10 files changed, 166 insertions(+), 13 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7409c8acbde35..bdda5a77bb16c 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2667,8 +2667,12 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!in_arena) return true; switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9d440a0b729eb..0776dfde2dba9 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2919,10 +2919,16 @@ bool bpf_jit_supports_arena(void) bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { - /* - * Currently the verifier uses this function only to check which - * atomic stores to arena are supported, and they all are. - */ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + } return true; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index f3e9ef6b5329c..fb541e3b17625 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -3771,8 +3771,12 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!in_arena) return true; switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index dae3872c301d9..7d55553de3fce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -991,6 +991,21 @@ static inline bool bpf_pseudo_func(const struct bpf_insn *insn) return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } +/* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an + * atomic load or store, and false if it is a read-modify-write instruction. + */ +static inline bool +bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn) +{ + switch (atomic_insn->imm) { + case BPF_LOAD_ACQ: + case BPF_STORE_REL: + return true; + default: + return false; + } +} + struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); diff --git a/include/linux/filter.h b/include/linux/filter.h index 02dda5c53d914..590476743f7a3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -364,6 +364,8 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + * BPF_LOAD_ACQ dst_reg = smp_load_acquire(src_reg + off16) + * BPF_STORE_REL smp_store_release(dst_reg + off16, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index beac5cdf2d2c6..bb37897c03939 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -51,6 +51,9 @@ #define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ #define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_LOAD_ACQ 0x100 /* load-acquire */ +#define BPF_STORE_REL 0x110 /* store-release */ + enum bpf_cond_pseudo_jmp { BPF_MAY_GOTO = 0, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index e583c19a0291f..62cb9557ad3be 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. + */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026a..974d172d6735a 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4ec1d1aa25eaf..b6664d0f6914c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -579,6 +579,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -3567,7 +3574,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -4181,7 +4188,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -7766,6 +7773,32 @@ static int check_atomic_rmw(struct bpf_verifier_env *env, return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return check_load_mem(env, insn, true, false, false, "atomic_load"); +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return check_store_reg(env, insn, true); +} + static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) { switch (insn->imm) { @@ -7780,6 +7813,20 @@ static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) case BPF_XCHG: case BPF_CMPXCHG: return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); @@ -20605,7 +20652,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index beac5cdf2d2c6..bb37897c03939 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -51,6 +51,9 @@ #define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ #define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_LOAD_ACQ 0x100 /* load-acquire */ +#define BPF_STORE_REL 0x110 /* store-release */ + enum bpf_cond_pseudo_jmp { BPF_MAY_GOTO = 0, }; From 4170a60c473dd0770c3de47134b6140f547939f1 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:19 +0000 Subject: [PATCH 44/80] arm64: insn: Add BIT(23) to {load,store}_ex's mask We are planning to add load-acquire (LDAR{,B,H}) and store-release (STLR{,B,H}) instructions to insn.{c,h}; add BIT(23) to mask of load_ex and store_ex to prevent aarch64_insn_is_{load,store}_ex() from returning false-positives for load-acquire and store-release instructions. Reference: Arm Architecture Reference Manual (ARM DDI 0487K.a, ID032224), * C6.2.228 LDXR * C6.2.165 LDAXR * C6.2.161 LDAR * C6.2.393 STXR * C6.2.360 STLXR * C6.2.353 STLR Acked-by: Xu Kuohai Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/5a4d2a52b2cc022bf86d0b572789f0b3bc3d5162.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/insn.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index e390c432f546e..2d8316b3abafb 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -351,8 +351,8 @@ __AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) -__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) -__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(load_ex, 0x3FC00000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3FC00000, 0x08000000) __AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400) __AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) __AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) From 248b1900dd952ccc7c7371b399ca797974f188eb Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:27 +0000 Subject: [PATCH 45/80] arm64: insn: Add load-acquire and store-release instructions Add load-acquire ("load_acq", LDAR{,B,H}) and store-release ("store_rel", STLR{,B,H}) instructions. Breakdown of encoding: size L (Rs) o0 (Rt2) Rn Rt mask (0x3fdffc00): 00 111111 1 1 0 11111 1 11111 00000 00000 value, load_acq (0x08dffc00): 00 001000 1 1 0 11111 1 11111 00000 00000 value, store_rel (0x089ffc00): 00 001000 1 0 0 11111 1 11111 00000 00000 As suggested by Xu [1], include all Should-Be-One (SBO) bits ("Rs" and "Rt2" fields) in the "mask" and "value" numbers. It is worth noting that we are adding the "no offset" variant of STLR instead of the "pre-index" variant, which has a different encoding. Reference: Arm Architecture Reference Manual (ARM DDI 0487K.a, ID032224), * C6.2.161 LDAR * C6.2.353 STLR [1] https://lore.kernel.org/bpf/4e6641ce-3f1e-4251-8daf-4dd4b77d08c4@huaweicloud.com/ Acked-by: Xu Kuohai Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/ba92057b7502ce4c9c9b03b7d637abe5e178134e.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/include/asm/insn.h | 8 ++++++++ arch/arm64/lib/insn.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 2d8316b3abafb..39577f1d079a9 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -188,8 +188,10 @@ enum aarch64_insn_ldst_type { AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_ACQ, AARCH64_INSN_LDST_LOAD_EX, AARCH64_INSN_LDST_LOAD_ACQ_EX, + AARCH64_INSN_LDST_STORE_REL, AARCH64_INSN_LDST_STORE_EX, AARCH64_INSN_LDST_STORE_REL_EX, AARCH64_INSN_LDST_SIGNED_LOAD_IMM_OFFSET, @@ -351,6 +353,8 @@ __AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) +__AARCH64_INSN_FUNCS(load_acq, 0x3FDFFC00, 0x08DFFC00) +__AARCH64_INSN_FUNCS(store_rel, 0x3FDFFC00, 0x089FFC00) __AARCH64_INSN_FUNCS(load_ex, 0x3FC00000, 0x08400000) __AARCH64_INSN_FUNCS(store_ex, 0x3FC00000, 0x08000000) __AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400) @@ -602,6 +606,10 @@ u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, int offset, enum aarch64_insn_variant variant, enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_acq_store_rel(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, enum aarch64_insn_register base, enum aarch64_insn_register state, diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index b008a9b46a7ff..9bef696e2230b 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -540,6 +540,35 @@ u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, offset >> shift); } +u32 aarch64_insn_gen_load_acq_store_rel(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_ACQ: + insn = aarch64_insn_get_load_acq_value(); + break; + case AARCH64_INSN_LDST_STORE_REL: + insn = aarch64_insn_get_store_rel_value(); + break; + default: + pr_err("%s: unknown load-acquire/store-release encoding %d\n", + __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); +} + u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, enum aarch64_insn_register base, enum aarch64_insn_register state, From 1bfe7f657cf4f67e1806d0bf8921b434969cde74 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:33 +0000 Subject: [PATCH 46/80] bpf, arm64: Support load-acquire and store-release instructions Support BPF load-acquire (BPF_LOAD_ACQ) and store-release (BPF_STORE_REL) instructions in the arm64 JIT compiler. For example (assuming little-endian): db 10 00 00 00 01 00 00 r0 = load_acquire((u64 *)(r1 + 0x0)) 95 00 00 00 00 00 00 00 exit opcode (0xdb): BPF_ATOMIC | BPF_DW | BPF_STX imm (0x00000100): BPF_LOAD_ACQ The JIT compiler would emit an LDAR instruction for the above, e.g.: ldar x7, [x0] Similarly, consider the following 16-bit store-release: cb 21 00 00 10 01 00 00 store_release((u16 *)(r1 + 0x0), w2) 95 00 00 00 00 00 00 00 exit opcode (0xcb): BPF_ATOMIC | BPF_H | BPF_STX imm (0x00000110): BPF_STORE_REL An STLRH instruction would be emitted, e.g.: stlrh w1, [x0] For a complete mapping: load-acquire 8-bit LDARB (BPF_LOAD_ACQ) 16-bit LDARH 32-bit LDAR (32-bit) 64-bit LDAR (64-bit) store-release 8-bit STLRB (BPF_STORE_REL) 16-bit STLRH 32-bit STLR (32-bit) 64-bit STLR (64-bit) Arena accesses are supported. bpf_jit_supports_insn(..., /*in_arena=*/true) always returns true for BPF_LOAD_ACQ and BPF_STORE_REL instructions, as they don't depend on ARM64_HAS_LSE_ATOMICS. Acked-by: Xu Kuohai Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/51664a1300710238ba2d4d95142b57a52c4f0cae.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit.h | 20 ++++++++ arch/arm64/net/bpf_jit_comp.c | 90 ++++++++++++++++++++++++++++++++--- 2 files changed, 104 insertions(+), 6 deletions(-) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index b22ab2f97a300..a3b0e693a125c 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -119,6 +119,26 @@ aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \ AARCH64_INSN_LDST_STORE_REL_EX) +/* Load-acquire & store-release */ +#define A64_LDAR(Rt, Rn, size) \ + aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_LOAD_ACQ) +#define A64_STLR(Rt, Rn, size) \ + aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_STORE_REL) + +/* Rt = [Rn] (load acquire) */ +#define A64_LDARB(Wt, Xn) A64_LDAR(Wt, Xn, 8) +#define A64_LDARH(Wt, Xn) A64_LDAR(Wt, Xn, 16) +#define A64_LDAR32(Wt, Xn) A64_LDAR(Wt, Xn, 32) +#define A64_LDAR64(Xt, Xn) A64_LDAR(Xt, Xn, 64) + +/* [Rn] = Rt (store release) */ +#define A64_STLRB(Wt, Xn) A64_STLR(Wt, Xn, 8) +#define A64_STLRH(Wt, Xn) A64_STLR(Wt, Xn, 16) +#define A64_STLR32(Wt, Xn) A64_STLR(Wt, Xn, 32) +#define A64_STLR64(Xt, Xn) A64_STLR(Xt, Xn, 64) + /* * LSE atomics * diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index bdda5a77bb16c..70d7c89d3ac90 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -647,6 +647,81 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) return 0; } +static int emit_atomic_ld_st(const struct bpf_insn *insn, struct jit_ctx *ctx) +{ + const s32 imm = insn->imm; + const s16 off = insn->off; + const u8 code = insn->code; + const bool arena = BPF_MODE(code) == BPF_PROBE_ATOMIC; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; + const u8 dst = bpf2a64[insn->dst_reg]; + const u8 src = bpf2a64[insn->src_reg]; + const u8 tmp = bpf2a64[TMP_REG_1]; + u8 reg; + + switch (imm) { + case BPF_LOAD_ACQ: + reg = src; + break; + case BPF_STORE_REL: + reg = dst; + break; + default: + pr_err_once("unknown atomic load/store op code %02x\n", imm); + return -EINVAL; + } + + if (off) { + emit_a64_add_i(1, tmp, reg, tmp, off, ctx); + reg = tmp; + } + if (arena) { + emit(A64_ADD(1, tmp, reg, arena_vm_base), ctx); + reg = tmp; + } + + switch (imm) { + case BPF_LOAD_ACQ: + switch (BPF_SIZE(code)) { + case BPF_B: + emit(A64_LDARB(dst, reg), ctx); + break; + case BPF_H: + emit(A64_LDARH(dst, reg), ctx); + break; + case BPF_W: + emit(A64_LDAR32(dst, reg), ctx); + break; + case BPF_DW: + emit(A64_LDAR64(dst, reg), ctx); + break; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(code)) { + case BPF_B: + emit(A64_STLRB(src, reg), ctx); + break; + case BPF_H: + emit(A64_STLRH(src, reg), ctx); + break; + case BPF_W: + emit(A64_STLR32(src, reg), ctx); + break; + case BPF_DW: + emit(A64_STLR64(src, reg), ctx); + break; + } + break; + default: + pr_err_once("unexpected atomic load/store op code %02x\n", + imm); + return -EINVAL; + } + + return 0; +} + #ifdef CONFIG_ARM64_LSE_ATOMICS static int emit_lse_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) { @@ -1641,11 +1716,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, return ret; break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: - if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + if (bpf_atomic_is_load_store(insn)) + ret = emit_atomic_ld_st(insn, ctx); + else if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) ret = emit_lse_atomic(insn, ctx); else ret = emit_ll_sc_atomic(insn, ctx); @@ -2667,13 +2748,10 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!in_arena) return true; switch (insn->code) { - case BPF_STX | BPF_ATOMIC | BPF_B: - case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - if (bpf_atomic_is_load_store(insn)) - return false; - if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + if (!bpf_atomic_is_load_store(insn) && + !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; } return true; From 14c0427b8a098046f80dad896076ab27631c25ba Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:40 +0000 Subject: [PATCH 47/80] bpf, x86: Support load-acquire and store-release instructions Recently we introduced BPF load-acquire (BPF_LOAD_ACQ) and store-release (BPF_STORE_REL) instructions. For x86-64, simply implement them as regular BPF_LDX/BPF_STX loads and stores. The verifier always rejects misaligned load-acquires/store-releases (even if BPF_F_ANY_ALIGNMENT is set), so emitted MOV* instructions are guaranteed to be atomic. Arena accesses are supported. 8- and 16-bit load-acquires are zero-extending (i.e., MOVZBQ, MOVZWQ). Rename emit_atomic{,_index}() to emit_atomic_rmw{,_index}() to make it clear that they only handle read-modify-write atomics, and extend their @atomic_op parameter from u8 to u32, since we are starting to use more than the lowest 8 bits of the 'imm' field. Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/d22bb3c69f126af1d962b7314f3489eff606a3b7.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- arch/x86/net/bpf_jit_comp.c | 99 ++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 17 deletions(-) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fb541e3b17625..d3491cc0898bf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1242,8 +1242,8 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); } -static int emit_atomic(u8 **pprog, u8 atomic_op, - u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; @@ -1283,8 +1283,9 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, - u32 dst_reg, u32 src_reg, u32 index_reg, int off) +static int emit_atomic_rmw_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) { u8 *prog = *pprog; @@ -1297,7 +1298,7 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); break; default: - pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); return -EFAULT; } @@ -1331,6 +1332,49 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, return 0; } +static int emit_atomic_ld_st(u8 **pprog, u32 atomic_op, u32 dst_reg, + u32 src_reg, s16 off, u8 bpf_size) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + off16) */ + emit_ldx(pprog, bpf_size, dst_reg, src_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + off16, src_reg) */ + emit_stx(pprog, bpf_size, dst_reg, src_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + +static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + idx_reg + off16) */ + emit_ldx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + idx_reg + off16, src_reg) */ + emit_stx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -2113,6 +2157,13 @@ st: if (is_imm8(insn->off)) } break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == (BPF_AND | BPF_FETCH) || @@ -2148,10 +2199,10 @@ st: if (is_imm8(insn->off)) EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ - err = emit_atomic(&prog, BPF_CMPXCHG, - real_dst_reg, AUX_REG, - insn->off, - BPF_SIZE(insn->code)); + err = emit_atomic_rmw(&prog, BPF_CMPXCHG, + real_dst_reg, AUX_REG, + insn->off, + BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; /* @@ -2166,17 +2217,35 @@ st: if (is_imm8(insn->off)) break; } - err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + else + err = emit_atomic_rmw(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: start_of_ldx = prog; - err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), - dst_reg, src_reg, X86_REG_R12, insn->off); + + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st_index(&prog, insn->imm, + BPF_SIZE(insn->code), dst_reg, + src_reg, X86_REG_R12, insn->off); + else + err = emit_atomic_rmw_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, + insn->off); if (err) return err; goto populate_extable; @@ -3771,12 +3840,8 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) if (!in_arena) return true; switch (insn->code) { - case BPF_STX | BPF_ATOMIC | BPF_B: - case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - if (bpf_atomic_is_load_store(insn)) - return false; if (insn->imm == (BPF_AND | BPF_FETCH) || insn->imm == (BPF_OR | BPF_FETCH) || insn->imm == (BPF_XOR | BPF_FETCH)) From 953df09aa42c754f2f38d8a1193be614b23ca850 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 4 Mar 2025 01:06:46 +0000 Subject: [PATCH 48/80] selftests/bpf: Add selftests for load-acquire and store-release instructions Add several ./test_progs tests: - arena_atomics/load_acquire - arena_atomics/store_release - verifier_load_acquire/* - verifier_store_release/* - verifier_precision/bpf_load_acquire - verifier_precision/bpf_store_release The last two tests are added to check if backtrack_insn() handles the new instructions correctly. Additionally, the last test also makes sure that the verifier "remembers" the value (in src_reg) we store-release into e.g. a stack slot. For example, if we take a look at the test program: #0: r1 = 8; /* store_release((u64 *)(r10 - 8), r1); */ #1: .8byte %[store_release]; #2: r1 = *(u64 *)(r10 - 8); #3: r2 = r10; #4: r2 += r1; #5: r0 = 0; #6: exit; At #1, if the verifier doesn't remember that we wrote 8 to the stack, then later at #4 we would be adding an unbounded scalar value to the stack pointer, which would cause the program to be rejected: VERIFIER LOG: ============= ... math between fp pointer and register with unbounded min value is not allowed For easier CI integration, instead of using built-ins like __atomic_{load,store}_n() which depend on the new __BPF_FEATURE_LOAD_ACQ_STORE_REL pre-defined macro, manually craft load-acquire/store-release instructions using __imm_insn(), as suggested by Eduard. All new tests depend on: (1) Clang major version >= 18, and (2) ENABLE_ATOMICS_TESTS is defined (currently implies -mcpu=v3 or v4), and (3) JIT supports load-acquire/store-release (currently arm64 and x86-64) In .../progs/arena_atomics.c: /* 8-byte-aligned */ __u8 __arena_global load_acquire8_value = 0x12; /* 1-byte hole */ __u16 __arena_global load_acquire16_value = 0x1234; That 1-byte hole in the .addr_space.1 ELF section caused clang-17 to crash: fatal error: error in backend: unable to write nop sequence of 1 bytes To work around such llvm-17 CI job failures, conditionally define __arena_global variables as 64-bit if __clang_major__ < 18, to make sure .addr_space.1 has no holes. Ideally we should avoid compiling this file using clang-17 at all (arena tests depend on __BPF_FEATURE_ADDR_SPACE_CAST, and are skipped for llvm-17 anyway), but that is a separate topic. Acked-by: Eduard Zingerman Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/1b46c6feaf0f1b6984d9ec80e500cc7383e9da1a.1741049567.git.yepeilin@google.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/arena_atomics.c | 66 ++++- .../selftests/bpf/prog_tests/verifier.c | 4 + .../selftests/bpf/progs/arena_atomics.c | 121 +++++++- .../bpf/progs/verifier_load_acquire.c | 197 +++++++++++++ .../selftests/bpf/progs/verifier_precision.c | 49 ++++ .../bpf/progs/verifier_store_release.c | 264 ++++++++++++++++++ 6 files changed, 698 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/verifier_load_acquire.c create mode 100644 tools/testing/selftests/bpf/progs/verifier_store_release.c diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c index 26e7c06c6cb4d..d98577a6babc0 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -162,6 +162,66 @@ static void test_uaf(struct arena_atomics *skel) ASSERT_EQ(skel->arena->uaf_recovery_fails, 0, "uaf_recovery_fails"); } +static void test_load_acquire(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + if (skel->data->skip_lacq_srel_tests) { + printf("%s:SKIP: ENABLE_ATOMICS_TESTS not defined, Clang doesn't support addr_space_cast, and/or JIT doesn't support load-acquire\n", + __func__); + test__skip(); + return; + } + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.load_acquire); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->load_acquire8_result, 0x12, + "load_acquire8_result"); + ASSERT_EQ(skel->arena->load_acquire16_result, 0x1234, + "load_acquire16_result"); + ASSERT_EQ(skel->arena->load_acquire32_result, 0x12345678, + "load_acquire32_result"); + ASSERT_EQ(skel->arena->load_acquire64_result, 0x1234567890abcdef, + "load_acquire64_result"); +} + +static void test_store_release(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + if (skel->data->skip_lacq_srel_tests) { + printf("%s:SKIP: ENABLE_ATOMICS_TESTS not defined, Clang doesn't support addr_space_cast, and/or JIT doesn't support store-release\n", + __func__); + test__skip(); + return; + } + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.store_release); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->store_release8_result, 0x12, + "store_release8_result"); + ASSERT_EQ(skel->arena->store_release16_result, 0x1234, + "store_release16_result"); + ASSERT_EQ(skel->arena->store_release32_result, 0x12345678, + "store_release32_result"); + ASSERT_EQ(skel->arena->store_release64_result, 0x1234567890abcdef, + "store_release64_result"); +} + void test_arena_atomics(void) { struct arena_atomics *skel; @@ -171,7 +231,7 @@ void test_arena_atomics(void) if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) return; - if (skel->data->skip_tests) { + if (skel->data->skip_all_tests) { printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", __func__); test__skip(); @@ -198,6 +258,10 @@ void test_arena_atomics(void) test_xchg(skel); if (test__start_subtest("uaf")) test_uaf(skel); + if (test__start_subtest("load_acquire")) + test_load_acquire(skel); + if (test__start_subtest("store_release")) + test_store_release(skel); cleanup: arena_atomics__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 8a0e1ff8a2dc6..cfe47b529e01a 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -45,6 +45,7 @@ #include "verifier_ldsx.skel.h" #include "verifier_leak_ptr.skel.h" #include "verifier_linked_scalars.skel.h" +#include "verifier_load_acquire.skel.h" #include "verifier_loops1.skel.h" #include "verifier_lwt.skel.h" #include "verifier_map_in_map.skel.h" @@ -80,6 +81,7 @@ #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" +#include "verifier_store_release.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" #include "verifier_tailcall_jit.skel.h" @@ -173,6 +175,7 @@ void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } void test_verifier_jit_convergence(void) { RUN(verifier_jit_convergence); } +void test_verifier_load_acquire(void) { RUN(verifier_load_acquire); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); } @@ -211,6 +214,7 @@ void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } +void test_verifier_store_release(void) { RUN(verifier_store_release); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } void test_verifier_tailcall_jit(void) { RUN(verifier_tailcall_jit); } diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index 40dd57fca5cc2..a52feff981126 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -6,6 +6,8 @@ #include #include #include "bpf_arena_common.h" +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" struct { __uint(type, BPF_MAP_TYPE_ARENA); @@ -19,9 +21,17 @@ struct { } arena SEC(".maps"); #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) -bool skip_tests __attribute((__section__(".data"))) = false; +bool skip_all_tests __attribute((__section__(".data"))) = false; #else -bool skip_tests = true; +bool skip_all_tests = true; +#endif + +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +bool skip_lacq_srel_tests __attribute((__section__(".data"))) = false; +#else +bool skip_lacq_srel_tests = true; #endif __u32 pid = 0; @@ -274,4 +284,111 @@ int uaf(const void *ctx) return 0; } +#if __clang_major__ >= 18 +__u8 __arena_global load_acquire8_value = 0x12; +__u16 __arena_global load_acquire16_value = 0x1234; +__u32 __arena_global load_acquire32_value = 0x12345678; +__u64 __arena_global load_acquire64_value = 0x1234567890abcdef; + +__u8 __arena_global load_acquire8_result = 0; +__u16 __arena_global load_acquire16_result = 0; +__u32 __arena_global load_acquire32_result = 0; +__u64 __arena_global load_acquire64_result = 0; +#else +/* clang-17 crashes if the .addr_space.1 ELF section has holes. Work around + * this issue by defining the below variables as 64-bit. + */ +__u64 __arena_global load_acquire8_value; +__u64 __arena_global load_acquire16_value; +__u64 __arena_global load_acquire32_value; +__u64 __arena_global load_acquire64_value; + +__u64 __arena_global load_acquire8_result; +__u64 __arena_global load_acquire16_result; +__u64 __arena_global load_acquire32_result; +__u64 __arena_global load_acquire64_result; +#endif + +SEC("raw_tp/sys_enter") +int load_acquire(const void *ctx) +{ +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +#define LOAD_ACQUIRE_ARENA(SIZEOP, SIZE, SRC, DST) \ + { asm volatile ( \ + "r1 = %[" #SRC "] ll;" \ + "r1 = addr_space_cast(r1, 0x0, 0x1);" \ + ".8byte %[load_acquire_insn];" \ + "r3 = %[" #DST "] ll;" \ + "r3 = addr_space_cast(r3, 0x0, 0x1);" \ + "*(" #SIZE " *)(r3 + 0) = r2;" \ + : \ + : __imm_addr(SRC), \ + __imm_insn(load_acquire_insn, \ + BPF_ATOMIC_OP(BPF_##SIZEOP, BPF_LOAD_ACQ, \ + BPF_REG_2, BPF_REG_1, 0)), \ + __imm_addr(DST) \ + : __clobber_all); } \ + + LOAD_ACQUIRE_ARENA(B, u8, load_acquire8_value, load_acquire8_result) + LOAD_ACQUIRE_ARENA(H, u16, load_acquire16_value, + load_acquire16_result) + LOAD_ACQUIRE_ARENA(W, u32, load_acquire32_value, + load_acquire32_result) + LOAD_ACQUIRE_ARENA(DW, u64, load_acquire64_value, + load_acquire64_result) +#undef LOAD_ACQUIRE_ARENA + +#endif + return 0; +} + +#if __clang_major__ >= 18 +__u8 __arena_global store_release8_result = 0; +__u16 __arena_global store_release16_result = 0; +__u32 __arena_global store_release32_result = 0; +__u64 __arena_global store_release64_result = 0; +#else +/* clang-17 crashes if the .addr_space.1 ELF section has holes. Work around + * this issue by defining the below variables as 64-bit. + */ +__u64 __arena_global store_release8_result; +__u64 __arena_global store_release16_result; +__u64 __arena_global store_release32_result; +__u64 __arena_global store_release64_result; +#endif + +SEC("raw_tp/sys_enter") +int store_release(const void *ctx) +{ +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +#define STORE_RELEASE_ARENA(SIZEOP, DST, VAL) \ + { asm volatile ( \ + "r1 = " VAL ";" \ + "r2 = %[" #DST "] ll;" \ + "r2 = addr_space_cast(r2, 0x0, 0x1);" \ + ".8byte %[store_release_insn];" \ + : \ + : __imm_addr(DST), \ + __imm_insn(store_release_insn, \ + BPF_ATOMIC_OP(BPF_##SIZEOP, BPF_STORE_REL, \ + BPF_REG_2, BPF_REG_1, 0)) \ + : __clobber_all); } \ + + STORE_RELEASE_ARENA(B, store_release8_result, "0x12") + STORE_RELEASE_ARENA(H, store_release16_result, "0x1234") + STORE_RELEASE_ARENA(W, store_release32_result, "0x12345678") + STORE_RELEASE_ARENA(DW, store_release64_result, + "0x1234567890abcdef ll") +#undef STORE_RELEASE_ARENA + +#endif + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c new file mode 100644 index 0000000000000..ff308f24d7450 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Google LLC. */ + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +SEC("socket") +__description("load-acquire, 8-bit") +__success __success_unpriv __retval(0x12) +__naked void load_acquire_8(void) +{ + asm volatile ( + "w1 = 0x12;" + "*(u8 *)(r10 - 1) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r10 - 1)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -1)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 16-bit") +__success __success_unpriv __retval(0x1234) +__naked void load_acquire_16(void) +{ + asm volatile ( + "w1 = 0x1234;" + "*(u16 *)(r10 - 2) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u16 *)(r10 - 2)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_H, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -2)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 32-bit") +__success __success_unpriv __retval(0x12345678) +__naked void load_acquire_32(void) +{ + asm volatile ( + "w1 = 0x12345678;" + "*(u32 *)(r10 - 4) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u32 *)(r10 - 4)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -4)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 64-bit") +__success __success_unpriv __retval(0x1234567890abcdef) +__naked void load_acquire_64(void) +{ + asm volatile ( + "r1 = 0x1234567890abcdef ll;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r10 - 8)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -8)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire with uninitialized src_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void load_acquire_with_uninitialized_src_reg(void) +{ + asm volatile ( + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r2 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire with non-pointer src_reg") +__failure __failure_unpriv __msg("R1 invalid mem access 'scalar'") +__naked void load_acquire_with_non_pointer_src_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r1 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("socket") +__description("misaligned load-acquire") +__failure __failure_unpriv __msg("misaligned stack access off") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void load_acquire_misaligned(void) +{ + asm volatile ( + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u32 *)(r10 - 5)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -5)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire from ctx pointer") +__failure __failure_unpriv __msg("BPF_ATOMIC loads from R1 ctx is not allowed") +__naked void load_acquire_from_ctx_pointer(void) +{ + asm volatile ( + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r1 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("xdp") +__description("load-acquire from pkt pointer") +__failure __msg("BPF_ATOMIC loads from R2 pkt is not allowed") +__naked void load_acquire_from_pkt_pointer(void) +{ + asm volatile ( + "r2 = *(u32 *)(r1 + %[xdp_md_data]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("load-acquire from flow_keys pointer") +__failure __msg("BPF_ATOMIC loads from R2 flow_keys is not allowed") +__naked void load_acquire_from_flow_keys_pointer(void) +{ + asm volatile ( + "r2 = *(u64 *)(r1 + %[__sk_buff_flow_keys]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(__sk_buff_flow_keys, + offsetof(struct __sk_buff, flow_keys)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("sk_reuseport") +__description("load-acquire from sock pointer") +__failure __msg("BPF_ATOMIC loads from R2 sock is not allowed") +__naked void load_acquire_from_sock_pointer(void) +{ + asm volatile ( + "r2 = *(u64 *)(r1 + %[sk_reuseport_md_sk]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(sk_reuseport_md_sk, offsetof(struct sk_reuseport_md, sk)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +#else + +SEC("socket") +__description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support load-acquire, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 6b564d4c09866..6662d4b39969a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -2,6 +2,7 @@ /* Copyright (C) 2023 SUSE LLC */ #include #include +#include "../../../include/linux/filter.h" #include "bpf_misc.h" SEC("?raw_tp") @@ -90,6 +91,54 @@ __naked int bpf_end_bswap(void) ::: __clobber_all); } +#if defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r3 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (db) r2 = load_acquire((u64 *)(r10 -8))") +__msg("mark_precise: frame0: regs= stack=-8 before 1: (7b) *(u64 *)(r10 -8) = r1") +__msg("mark_precise: frame0: regs=r1 stack= before 0: (b7) r1 = 8") +__naked int bpf_load_acquire(void) +{ + asm volatile ( + "r1 = 8;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" /* r2 = load_acquire((u64 *)(r10 - 8)); */ + "r3 = r10;" + "r3 += r2;" /* mark_precise */ + "r0 = 0;" + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -8)) + : __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r1 stack= before 3: (bf) r2 = r10") +__msg("mark_precise: frame0: regs=r1 stack= before 2: (79) r1 = *(u64 *)(r10 -8)") +__msg("mark_precise: frame0: regs= stack=-8 before 1: (db) store_release((u64 *)(r10 -8), r1)") +__msg("mark_precise: frame0: regs=r1 stack= before 0: (b7) r1 = 8") +__naked int bpf_store_release(void) +{ + asm volatile ( + "r1 = 8;" + ".8byte %[store_release_insn];" /* store_release((u64 *)(r10 - 8), r1); */ + "r1 = *(u64 *)(r10 - 8);" + "r2 = r10;" + "r2 += r1;" /* mark_precise */ + "r0 = 0;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +#endif /* load-acquire, store-release */ #endif /* v4 instruction */ SEC("?raw_tp") diff --git a/tools/testing/selftests/bpf/progs/verifier_store_release.c b/tools/testing/selftests/bpf/progs/verifier_store_release.c new file mode 100644 index 0000000000000..f1c64c08f25fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_store_release.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Google LLC. */ + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +SEC("socket") +__description("store-release, 8-bit") +__success __success_unpriv __retval(0x12) +__naked void store_release_8(void) +{ + asm volatile ( + "w1 = 0x12;" + ".8byte %[store_release_insn];" // store_release((u8 *)(r10 - 1), w1); + "w0 = *(u8 *)(r10 - 1);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -1)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 16-bit") +__success __success_unpriv __retval(0x1234) +__naked void store_release_16(void) +{ + asm volatile ( + "w1 = 0x1234;" + ".8byte %[store_release_insn];" // store_release((u16 *)(r10 - 2), w1); + "w0 = *(u16 *)(r10 - 2);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_H, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -2)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 32-bit") +__success __success_unpriv __retval(0x12345678) +__naked void store_release_32(void) +{ + asm volatile ( + "w1 = 0x12345678;" + ".8byte %[store_release_insn];" // store_release((u32 *)(r10 - 4), w1); + "w0 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_W, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -4)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 64-bit") +__success __success_unpriv __retval(0x1234567890abcdef) +__naked void store_release_64(void) +{ + asm volatile ( + "r1 = 0x1234567890abcdef ll;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r1); + "r0 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with uninitialized src_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void store_release_with_uninitialized_src_reg(void) +{ + asm volatile ( + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r2); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_2, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with uninitialized dst_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void store_release_with_uninitialized_dst_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r2 - 8), r1); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_2, BPF_REG_1, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with non-pointer dst_reg") +__failure __failure_unpriv __msg("R1 invalid mem access 'scalar'") +__naked void store_release_with_non_pointer_dst_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r1 + 0), r1); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_1, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("socket") +__description("misaligned store-release") +__failure __failure_unpriv __msg("misaligned stack access off") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void store_release_misaligned(void) +{ + asm volatile ( + "w0 = 0;" + ".8byte %[store_release_insn];" // store_release((u32 *)(r10 - 5), w0); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_W, BPF_STORE_REL, BPF_REG_10, BPF_REG_0, -5)) + : __clobber_all); +} + +SEC("socket") +__description("store-release to ctx pointer") +__failure __failure_unpriv __msg("BPF_ATOMIC stores into R1 ctx is not allowed") +__naked void store_release_to_ctx_pointer(void) +{ + asm volatile ( + "w0 = 0;" + ".8byte %[store_release_insn];" // store_release((u8 *)(r1 + 0), w0); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_1, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("xdp") +__description("store-release to pkt pointer") +__failure __msg("BPF_ATOMIC stores into R2 pkt is not allowed") +__naked void store_release_to_pkt_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u32 *)(r1 + %[xdp_md_data]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("store-release to flow_keys pointer") +__failure __msg("BPF_ATOMIC stores into R2 flow_keys is not allowed") +__naked void store_release_to_flow_keys_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u64 *)(r1 + %[__sk_buff_flow_keys]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(__sk_buff_flow_keys, + offsetof(struct __sk_buff, flow_keys)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("sk_reuseport") +__description("store-release to sock pointer") +__failure __msg("BPF_ATOMIC stores into R2 sock is not allowed") +__naked void store_release_to_sock_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u64 *)(r1 + %[sk_reuseport_md_sk]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(sk_reuseport_md_sk, offsetof(struct sk_reuseport_md, sk)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, leak pointer to stack") +__success __success_unpriv __retval(0) +__naked void store_release_leak_pointer_to_stack(void) +{ + asm volatile ( + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r1); + "r0 = 0;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, long long); +} map_hash_8b SEC(".maps"); + +SEC("socket") +__description("store-release, leak pointer to map") +__success __retval(0) +__failure_unpriv __msg_unpriv("R6 leaks addr into map") +__naked void store_release_leak_pointer_to_map(void) +{ + asm volatile ( + "r6 = r1;" + "r1 = %[map_hash_8b] ll;" + "r2 = 0;" + "*(u64 *)(r10 - 8) = r2;" + "r2 = r10;" + "r2 += -8;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto l0_%=;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r0 + 0), r6); +"l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm_addr(map_hash_8b), + __imm(bpf_map_lookup_elem), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_0, BPF_REG_6, 0)) + : __clobber_all); +} + +#else + +SEC("socket") +__description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support store-release, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; From 1c152572be596d90c72b5cd9c1490fcd27a46153 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 4 Mar 2025 11:50:20 -0800 Subject: [PATCH 49/80] bpf: jmp_offset() and verbose_insn() utility functions Extract two utility functions: - One BPF jump instruction uses .imm field to encode jump offset, while the rest use .off. Encapsulate this detail as jmp_offset() function. - Avoid duplicating instruction printing callback definitions by defining a verbose_insn() function, which disassembles an instruction into the verifier log while hiding this detail. These functions will be used in the next patch. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250304195024.2478889-2-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b6664d0f6914c..25910b740bbcb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3360,6 +3360,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3386,10 +3395,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3919,6 +3925,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -4119,11 +4136,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4141,7 +4153,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -19273,19 +19285,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } From 0ae958eca164191708d64fa705a602422d2b48b0 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 4 Mar 2025 11:50:21 -0800 Subject: [PATCH 50/80] bpf: get_call_summary() utility function Refactor mark_fastcall_pattern_for_call() to extract a utility function get_call_summary(). For a helper or kfunc call this function fills the following information: {num_params, is_void, fastcall}. This function would be used in the next patch in order to get number of parameters of a helper or kfunc call. Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250304195024.2478889-3-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 121 ++++++++++++++++++++---------------------- 1 file changed, 57 insertions(+), 64 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 25910b740bbcb..5cc1b6ed0e92c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17019,27 +17019,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -17056,24 +17035,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. + */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. @@ -17156,39 +17165,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -17246,7 +17239,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. */ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; From 7dad03653567bb4eae1c4d89c22e9382388eb4c1 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 4 Mar 2025 11:50:22 -0800 Subject: [PATCH 51/80] bpf: simple DFA-based live registers analysis Compute may-live registers before each instruction in the program. The register is live before the instruction I if it is read by I or some instruction S following I during program execution and is not overwritten between I and S. This information would be used in the next patch as a hint in func_states_equal(). Use a simple algorithm described in [1] to compute this information: - define the following: - I.use : a set of all registers read by instruction I; - I.def : a set of all registers written by instruction I; - I.in : a set of all registers that may be alive before I execution; - I.out : a set of all registers that may be alive after I execution; - I.successors : a set of instructions S that might immediately follow I for some program execution; - associate separate empty sets 'I.in' and 'I.out' with each instruction; - visit each instruction in a postorder and update corresponding 'I.in' and 'I.out' sets as follows: I.out = U [S.in for S in I.successors] I.in = (I.out / I.def) U I.use (where U stands for set union, / stands for set difference) - repeat the computation while I.{in,out} changes for any instruction. On implementation side keep things as simple, as possible: - check_cfg() already marks instructions EXPLORED in post-order, modify it to save the index of each EXPLORED instruction in a vector; - represent I.{in,out,use,def} as bitmasks; - don't split the program into basic blocks and don't maintain the work queue, instead: - do fixed-point computation by visiting each instruction; - maintain a simple 'changed' flag if I.{in,out} for any instruction change; Measurements show that even such simplistic implementation does not add measurable verification time overhead (for selftests, at-least). Note on check_cfg() ex_insn_beg/ex_done change: To avoid out of bounds access to env->cfg.insn_postorder array, it should be guaranteed that instruction transitions to EXPLORED state only once. Previously this was not the fact for incorrect programs with direct calls to exception callbacks. The 'align' selftest needs adjustment to skip computed insn/live registers printout. Otherwise it matches lines from the live registers printout. [1] https://en.wikipedia.org/wiki/Live-variable_analysis Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250304195024.2478889-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 + kernel/bpf/verifier.c | 320 +++++++++++++++++- .../testing/selftests/bpf/prog_tests/align.c | 11 +- 3 files changed, 330 insertions(+), 7 deletions(-) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index d338f2a96bbae..d6cfc4ee6820c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -591,6 +591,8 @@ struct bpf_insn_aux_data { * accepts callback function as a parameter. */ bool calls_callback; + /* registers alive before this instruction. */ + u16 live_regs_before; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -748,7 +750,11 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; + /* vector of instruction indexes sorted in post-order */ + int *insn_postorder; int cur_stack; + /* current position in the insn_postorder vector */ + int cur_postorder; } cfg; struct backtrack_state bt; struct bpf_insn_hist_entry *insn_hist; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5cc1b6ed0e92c..b434251979b1a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -17402,9 +17402,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17416,6 +17415,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17429,6 +17439,7 @@ static int check_cfg(struct bpf_verifier_env *env) case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17447,13 +17458,10 @@ static int check_cfg(struct bpf_verifier_env *env) goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -23379,6 +23387,301 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. + */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. + */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23500,6 +23803,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23638,6 +23945,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 4ebd0da898f5c..1d53a8561ee2f 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -610,9 +610,11 @@ static int do_test_single(struct bpf_align_test *test) .log_size = sizeof(bpf_vlog), .log_level = 2, ); + const char *main_pass_start = "0: R1=ctx() R10=fp0"; const char *line_ptr; int cur_line = -1; int prog_len, i; + char *start; int fd_prog; int ret; @@ -632,7 +634,13 @@ static int do_test_single(struct bpf_align_test *test) ret = 0; /* We make a local copy so that we can strtok() it */ strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy)); - line_ptr = strtok(bpf_vlog_copy, "\n"); + start = strstr(bpf_vlog_copy, main_pass_start); + if (!start) { + ret = 1; + printf("Can't find initial line '%s'\n", main_pass_start); + goto out; + } + line_ptr = strtok(start, "\n"); for (i = 0; i < MAX_MATCHES; i++) { struct bpf_reg_match m = test->matches[i]; const char *p; @@ -682,6 +690,7 @@ static int do_test_single(struct bpf_align_test *test) break; } } +out: if (fd_prog >= 0) close(fd_prog); } From 994a876a076a3374da51b2c33f5e582029418104 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 4 Mar 2025 11:50:23 -0800 Subject: [PATCH 52/80] bpf: use register liveness information for func_states_equal Liveness analysis DFA computes a set of registers live before each instruction. Leverage this information to skip comparison of dead registers in func_states_equal(). This helps with convergance of iterator processing loops, as bpf_reg_state->live marks can't be used when loops are processed. This has certain performance impact for selftests, here is a veristat listing using `-f "insns_pct>5" -f "!insns<200"` selftests: File Program States (A) States (B) States (DIFF) -------------------- ----------------------------- ---------- ---------- -------------- arena_htab.bpf.o arena_htab_llvm 37 35 -2 (-5.41%) arena_htab_asm.bpf.o arena_htab_asm 37 33 -4 (-10.81%) arena_list.bpf.o arena_list_add 37 22 -15 (-40.54%) dynptr_success.bpf.o test_dynptr_copy 22 16 -6 (-27.27%) dynptr_success.bpf.o test_dynptr_copy_xdp 68 58 -10 (-14.71%) iters.bpf.o checkpoint_states_deletion 918 40 -878 (-95.64%) iters.bpf.o clean_live_states 136 66 -70 (-51.47%) iters.bpf.o iter_nested_deeply_iters 43 37 -6 (-13.95%) iters.bpf.o iter_nested_iters 72 62 -10 (-13.89%) iters.bpf.o iter_pass_iter_ptr_to_subprog 30 26 -4 (-13.33%) iters.bpf.o iter_subprog_iters 68 59 -9 (-13.24%) iters.bpf.o loop_state_deps2 35 32 -3 (-8.57%) iters_css.bpf.o iter_css_for_each 32 29 -3 (-9.38%) pyperf600_iter.bpf.o on_event 286 192 -94 (-32.87%) Total progs: 3578 Old success: 2061 New success: 2061 States diff min: -95.64% States diff max: 0.00% -100 .. -90 %: 1 -55 .. -45 %: 3 -45 .. -35 %: 2 -35 .. -25 %: 5 -20 .. -10 %: 12 -10 .. 0 %: 6 sched_ext: File Program States (A) States (B) States (DIFF) ----------------- ---------------------- ---------- ---------- --------------- bpf.bpf.o lavd_dispatch 8950 7065 -1885 (-21.06%) bpf.bpf.o lavd_init 516 480 -36 (-6.98%) bpf.bpf.o layered_dispatch 662 501 -161 (-24.32%) bpf.bpf.o layered_dump 298 237 -61 (-20.47%) bpf.bpf.o layered_init 523 423 -100 (-19.12%) bpf.bpf.o layered_init_task 24 22 -2 (-8.33%) bpf.bpf.o layered_runnable 151 125 -26 (-17.22%) bpf.bpf.o p2dq_dispatch 66 53 -13 (-19.70%) bpf.bpf.o p2dq_init 170 142 -28 (-16.47%) bpf.bpf.o refresh_layer_cpumasks 120 78 -42 (-35.00%) bpf.bpf.o rustland_init 37 34 -3 (-8.11%) bpf.bpf.o rustland_init 37 34 -3 (-8.11%) bpf.bpf.o rusty_select_cpu 125 108 -17 (-13.60%) scx_central.bpf.o central_dispatch 59 43 -16 (-27.12%) scx_central.bpf.o central_init 39 28 -11 (-28.21%) scx_nest.bpf.o nest_init 58 51 -7 (-12.07%) scx_pair.bpf.o pair_dispatch 142 111 -31 (-21.83%) scx_qmap.bpf.o qmap_dispatch 174 141 -33 (-18.97%) scx_qmap.bpf.o qmap_init 768 654 -114 (-14.84%) Total progs: 216 Old success: 186 New success: 186 States diff min: -35.00% States diff max: 0.00% -35 .. -25 %: 3 -25 .. -20 %: 6 -20 .. -15 %: 6 -15 .. -5 %: 7 -5 .. 0 %: 6 Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250304195024.2478889-5-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b434251979b1a..4edb2db0f889c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -18500,15 +18500,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18529,6 +18531,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18552,9 +18555,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; From 8a3fc22ddec7dcfd29c40a247d38f45c596b9bf6 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Tue, 4 Mar 2025 11:50:24 -0800 Subject: [PATCH 53/80] selftests/bpf: test cases for compute_live_registers() Cover instructions from each kind: - assignment - arithmetic - store/load - endian conversion - atomics - branches, conditional branches, may_goto, calls - LD_ABS/LD_IND - address_space_cast Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250304195024.2478889-6-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/compute_live_registers.c | 9 + tools/testing/selftests/bpf/progs/bpf_misc.h | 17 + .../bpf/progs/compute_live_registers.c | 424 ++++++++++++++++++ .../selftests/bpf/progs/verifier_gotol.c | 6 +- .../bpf/progs/verifier_iterating_callbacks.c | 6 +- .../bpf/progs/verifier_load_acquire.c | 7 +- 6 files changed, 455 insertions(+), 14 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/compute_live_registers.c create mode 100644 tools/testing/selftests/bpf/progs/compute_live_registers.c diff --git a/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c b/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c new file mode 100644 index 0000000000000..285f20241fe14 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "compute_live_registers.skel.h" +#include "test_progs.h" + +void test_compute_live_registers(void) +{ + RUN_TESTS(compute_live_registers); +} diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 34f555da546fb..13a2e22f54658 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -213,4 +213,21 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif +#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ + defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ + defined(__TARGET_ARCH_loongarch)) && \ + __clang_major__ >= 18 +#define CAN_USE_GOTOL +#endif + +#if _clang_major__ >= 18 +#define CAN_USE_BPF_ST +#endif + +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#define CAN_USE_LOAD_ACQ_STORE_REL +#endif + #endif diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c new file mode 100644 index 0000000000000..14df92949e81a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} test_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); +} arena SEC(".maps"); + +SEC("socket") +__log_level(2) +__msg(" 0: .......... (b7) r0 = 42") +__msg(" 1: 0......... (bf) r1 = r0") +__msg(" 2: .1........ (bf) r2 = r1") +__msg(" 3: ..2....... (bf) r3 = r2") +__msg(" 4: ...3...... (bf) r4 = r3") +__msg(" 5: ....4..... (bf) r5 = r4") +__msg(" 6: .....5.... (bf) r6 = r5") +__msg(" 7: ......6... (bf) r7 = r6") +__msg(" 8: .......7.. (bf) r8 = r7") +__msg(" 9: ........8. (bf) r9 = r8") +__msg("10: .........9 (bf) r0 = r9") +__msg("11: 0......... (95) exit") +__naked void assign_chain(void) +{ + asm volatile ( + "r0 = 42;" + "r1 = r0;" + "r2 = r1;" + "r3 = r2;" + "r4 = r3;" + "r5 = r4;" + "r6 = r5;" + "r7 = r6;" + "r8 = r7;" + "r9 = r8;" + "r0 = r9;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .......... (b7) r1 = 7") +__msg("1: .1........ (07) r1 += 7") +__msg("2: .......... (b7) r2 = 7") +__msg("3: ..2....... (b7) r3 = 42") +__msg("4: ..23...... (0f) r2 += r3") +__msg("5: .......... (b7) r0 = 0") +__msg("6: 0......... (95) exit") +__naked void arithmetics(void) +{ + asm volatile ( + "r1 = 7;" + "r1 += 7;" + "r2 = 7;" + "r3 = 42;" + "r2 += r3;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#ifdef CAN_USE_BPF_ST +SEC("socket") +__log_level(2) +__msg(" 1: .1........ (07) r1 += -8") +__msg(" 2: .1........ (7a) *(u64 *)(r1 +0) = 7") +__msg(" 3: .1........ (b7) r2 = 42") +__msg(" 4: .12....... (7b) *(u64 *)(r1 +0) = r2") +__msg(" 5: .12....... (7b) *(u64 *)(r1 +0) = r2") +__msg(" 6: .......... (b7) r0 = 0") +__naked void store(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 +0) = 7;" + "r2 = 42;" + "*(u64 *)(r1 +0) = r2;" + "*(u64 *)(r1 +0) = r2;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} +#endif + +SEC("socket") +__log_level(2) +__msg("1: ....4..... (07) r4 += -8") +__msg("2: ....4..... (79) r5 = *(u64 *)(r4 +0)") +__msg("3: ....45.... (07) r4 += -8") +__naked void load(void) +{ + asm volatile ( + "r4 = r10;" + "r4 += -8;" + "r5 = *(u64 *)(r4 +0);" + "r4 += -8;" + "r0 = r5;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .1........ (61) r2 = *(u32 *)(r1 +0)") +__msg("1: ..2....... (d4) r2 = le64 r2") +__msg("2: ..2....... (bf) r0 = r2") +__naked void endian(void) +{ + asm volatile ( + "r2 = *(u32 *)(r1 +0);" + "r2 = le64 r2;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg(" 8: 0......... (b7) r1 = 1") +__msg(" 9: 01........ (db) r1 = atomic64_fetch_add((u64 *)(r0 +0), r1)") +__msg("10: 01........ (c3) lock *(u32 *)(r0 +0) += r1") +__msg("11: 01........ (db) r1 = atomic64_xchg((u64 *)(r0 +0), r1)") +__msg("12: 01........ (bf) r2 = r0") +__msg("13: .12....... (bf) r0 = r1") +__msg("14: 012....... (db) r0 = atomic64_cmpxchg((u64 *)(r2 +0), r0, r1)") +__naked void atomic(void) +{ + asm volatile ( + "r2 = r10;" + "r2 += -8;" + "r1 = 0;" + "*(u64 *)(r2 +0) = r1;" + "r1 = %[test_map] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto 1f;" + "r1 = 1;" + "r1 = atomic_fetch_add((u64 *)(r0 +0), r1);" + ".8byte %[add_nofetch];" /* same as "lock *(u32 *)(r0 +0) += r1;" */ + "r1 = xchg_64(r0 + 0, r1);" + "r2 = r0;" + "r0 = r1;" + "r0 = cmpxchg_64(r2 + 0, r0, r1);" + "1: exit;" + : + : __imm(bpf_map_lookup_elem), + __imm_addr(test_map), + __imm_insn(add_nofetch, BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +#ifdef CAN_USE_LOAD_ACQ_STORE_REL + +SEC("socket") +__log_level(2) +__msg("2: .12....... (db) store_release((u64 *)(r2 -8), r1)") +__msg("3: .......... (bf) r3 = r10") +__msg("4: ...3...... (db) r4 = load_acquire((u64 *)(r3 -8))") +__naked void atomic_load_acq_store_rel(void) +{ + asm volatile ( + "r1 = 42;" + "r2 = r10;" + ".8byte %[store_release_insn];" /* store_release((u64 *)(r2 - 8), r1); */ + "r3 = r10;" + ".8byte %[load_acquire_insn];" /* r4 = load_acquire((u64 *)(r3 + 0)); */ + "r0 = r4;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_2, BPF_REG_1, -8)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_4, BPF_REG_3, -8)) + : __clobber_all); +} + +#endif /* CAN_USE_LOAD_ACQ_STORE_REL */ + +SEC("socket") +__log_level(2) +__msg("4: .12....7.. (85) call bpf_trace_printk#6") +__msg("5: 0......7.. (0f) r0 += r7") +__naked void regular_call(void) +{ + asm volatile ( + "r7 = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "call %[bpf_trace_printk];" + "r0 += r7;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("2: 012....... (25) if r1 > 0x7 goto pc+1") +__msg("3: ..2....... (bf) r0 = r2") +__naked void if1(void) +{ + asm volatile ( + "r0 = 1;" + "r2 = 2;" + "if r1 > 0x7 goto +1;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("3: 0123...... (2d) if r1 > r3 goto pc+1") +__msg("4: ..2....... (bf) r0 = r2") +__naked void if2(void) +{ + asm volatile ( + "r0 = 1;" + "r2 = 2;" + "r3 = 7;" + "if r1 > r3 goto +1;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .......... (b7) r1 = 0") +__msg("1: .1........ (b7) r2 = 7") +__msg("2: .12....... (25) if r1 > 0x7 goto pc+4") +__msg("3: .12....... (07) r1 += 1") +__msg("4: .12....... (27) r2 *= 2") +__msg("5: .12....... (05) goto pc+0") +__msg("6: .12....... (05) goto pc-5") +__msg("7: .......... (b7) r0 = 0") +__msg("8: 0......... (95) exit") +__naked void loop(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 7;" + "if r1 > 0x7 goto +4;" + "r1 += 1;" + "r2 *= 2;" + "goto +0;" + "goto -5;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} + +#ifdef CAN_USE_GOTOL +SEC("socket") +__log_level(2) +__msg("2: .123...... (25) if r1 > 0x7 goto pc+2") +__msg("3: ..2....... (bf) r0 = r2") +__msg("4: 0......... (06) gotol pc+1") +__msg("5: ...3...... (bf) r0 = r3") +__msg("6: 0......... (95) exit") +__naked void gotol(void) +{ + asm volatile ( + "r2 = 42;" + "r3 = 24;" + "if r1 > 0x7 goto +2;" + "r0 = r2;" + "gotol +1;" + "r0 = r3;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} +#endif + +SEC("socket") +__log_level(2) +__msg("0: 0......... (b7) r1 = 1") +__msg("1: 01........ (e5) may_goto pc+1") +__msg("2: 0......... (05) goto pc-3") +__msg("3: .1........ (bf) r0 = r1") +__msg("4: 0......... (95) exit") +__naked void may_goto(void) +{ + asm volatile ( + "1: r1 = 1;" + ".8byte %[may_goto];" + "goto 1b;" + "r0 = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("1: 0......... (18) r2 = 0x7") +__msg("3: 0.2....... (0f) r0 += r2") +__naked void ldimm64(void) +{ + asm volatile ( + "r0 = 0;" + "r2 = 0x7 ll;" + "r0 += r2;" + "exit;" + : + :: __clobber_all); +} + +/* No rules specific for LD_ABS/LD_IND, default behaviour kicks in */ +SEC("socket") +__log_level(2) +__msg("2: 0123456789 (30) r0 = *(u8 *)skb[42]") +__msg("3: 012.456789 (0f) r7 += r0") +__msg("4: 012.456789 (b7) r3 = 42") +__msg("5: 0123456789 (50) r0 = *(u8 *)skb[r3 + 0]") +__msg("6: 0......7.. (0f) r7 += r0") +__naked void ldabs(void) +{ + asm volatile ( + "r6 = r1;" + "r7 = 0;" + "r0 = *(u8 *)skb[42];" + "r7 += r0;" + "r3 = 42;" + ".8byte %[ld_ind];" /* same as "r0 = *(u8 *)skb[r3];" */ + "r7 += r0;" + "r0 = r7;" + "exit;" + : + : __imm_insn(ld_ind, BPF_LD_IND(BPF_B, BPF_REG_3, 0)) + : __clobber_all); +} + + +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__log_level(2) +__msg(" 6: .12345.... (85) call bpf_arena_alloc_pages") +__msg(" 7: 0......... (bf) r1 = addr_space_cast(r0, 0, 1)") +__msg(" 8: .1........ (b7) r2 = 42") +__naked void addr_space_cast(void) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = 0;" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r1 = addr_space_cast(r0, 0, 1);" + "r2 = 42;" + "*(u64 *)(r1 +0) = r2;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_arena_alloc_pages), + __imm_addr(arena) + : __clobber_all); +} +#endif + +static __used __naked int aux1(void) +{ + asm volatile ( + "r0 = r1;" + "r0 += r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: ....45.... (b7) r1 = 1") +__msg("1: .1..45.... (b7) r2 = 2") +__msg("2: .12.45.... (b7) r3 = 3") +/* Conservative liveness for subprog parameters. */ +__msg("3: .12345.... (85) call pc+2") +__msg("4: .......... (b7) r0 = 0") +__msg("5: 0......... (95) exit") +__msg("6: .12....... (bf) r0 = r1") +__msg("7: 0.2....... (0f) r0 += r2") +/* Conservative liveness for subprog return value. */ +__msg("8: 0......... (95) exit") +__naked void subprog1(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "call aux1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +/* to retain debug info for BTF generation */ +void kfunc_root(void) +{ + bpf_arena_alloc_pages(0, 0, 0, 0, 0); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_gotol.c b/tools/testing/selftests/bpf/progs/verifier_gotol.c index 05a329ee45ee9..d5d8f24df3949 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotol.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotol.c @@ -4,11 +4,7 @@ #include #include "bpf_misc.h" -#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ - defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ - defined(__TARGET_ARCH_loongarch)) && \ - __clang_major__ >= 18 +#ifdef CAN_USE_GOTOL SEC("socket") __description("gotol, small_imm") diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index e54bb5385bc1b..75dd922e4e9f8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -407,11 +407,7 @@ l0_%=: call %[bpf_jiffies64]; \ : __clobber_all); } -#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ - defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ - defined(__TARGET_ARCH_loongarch)) && \ - __clang_major__ >= 18 +#ifdef CAN_USE_GOTOL SEC("socket") __success __retval(0) __naked void gotol_and_may_goto(void) diff --git a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c index ff308f24d7450..6080974538327 100644 --- a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c +++ b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c @@ -6,8 +6,7 @@ #include "../../../include/linux/filter.h" #include "bpf_misc.h" -#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ - (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#ifdef CAN_USE_LOAD_ACQ_STORE_REL SEC("socket") __description("load-acquire, 8-bit") @@ -182,7 +181,7 @@ __naked void load_acquire_from_sock_pointer(void) : __clobber_all); } -#else +#else /* CAN_USE_LOAD_ACQ_STORE_REL */ SEC("socket") __description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support load-acquire, use a dummy test") @@ -192,6 +191,6 @@ int dummy_test(void) return 0; } -#endif +#endif /* CAN_USE_LOAD_ACQ_STORE_REL */ char _license[] SEC("license") = "GPL"; From aae1add9053ecc0d2f2898de24e968811c412720 Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Wed, 5 Mar 2025 00:54:36 -0800 Subject: [PATCH 54/80] bpf: correct use/def for may_goto instruction may_goto instruction does not use any registers, but in compute_insn_live_regs() it was treated as a regular conditional jump of kind BPF_K with r0 as source register. Thus unnecessarily marking r0 as used. Fixes: 7dad03653567 ("bpf: simple DFA-based live registers analysis") Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250305085436.2731464-1-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 1 + tools/testing/selftests/bpf/progs/compute_live_registers.c | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4edb2db0f889c..3303a3605ee80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -23567,6 +23567,7 @@ static void compute_insn_live_regs(struct bpf_verifier_env *env, case BPF_JMP32: switch (code) { case BPF_JA: + case BPF_JCOND: def = 0; use = 0; break; diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c index 14df92949e81a..f3d79aecbf935 100644 --- a/tools/testing/selftests/bpf/progs/compute_live_registers.c +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -294,9 +294,9 @@ __naked void gotol(void) SEC("socket") __log_level(2) -__msg("0: 0......... (b7) r1 = 1") -__msg("1: 01........ (e5) may_goto pc+1") -__msg("2: 0......... (05) goto pc-3") +__msg("0: .......... (b7) r1 = 1") +__msg("1: .1........ (e5) may_goto pc+1") +__msg("2: .......... (05) goto pc-3") __msg("3: .1........ (bf) r0 = r1") __msg("4: 0......... (95) exit") __naked void may_goto(void) From 7781fd0ddeb43d078b750ff27a54dd3f85a26481 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Tue, 4 Mar 2025 20:45:19 +0000 Subject: [PATCH 55/80] bpf, docs: Fix broken link to renamed bpf_iter_task_vmas.c This file was renamed from bpf_iter_task_vma.c. Fixes: 45b38941c81f ("selftests/bpf: Rename bpf_iter_task_vma.c to bpf_iter_task_vmas.c") Signed-off-by: T.J. Mercier Acked-by: Song Liu Link: https://lore.kernel.org/r/20250304204520.201115-1-tjmercier@google.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/bpf_iterators.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/bpf/bpf_iterators.rst b/Documentation/bpf/bpf_iterators.rst index 07433915aa414..7f514cb6b0525 100644 --- a/Documentation/bpf/bpf_iterators.rst +++ b/Documentation/bpf/bpf_iterators.rst @@ -86,7 +86,7 @@ following steps: The following are a few examples of selftest BPF iterator programs: * `bpf_iter_tcp4.c `_ -* `bpf_iter_task_vma.c `_ +* `bpf_iter_task_vmas.c `_ * `bpf_iter_task_file.c `_ Let us look at ``bpf_iter_task_file.c``, which runs in kernel space: From 994fd3eae1b563a04458c665ae26cd06754ed96d Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Mar 2025 19:54:29 -0800 Subject: [PATCH 56/80] selftests/bpf: Introduce cond_break_label Add a new cond_break_label macro that jumps to the specified label when the cond_break termination check fires, and allows us to better handle the uncontrolled termination of the loop. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250306035431.2186189-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/bpf_experimental.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index cd8ecd39c3f3c..6535c8ae3c469 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -368,12 +368,12 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("may_goto %l[l_break]" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #else @@ -392,7 +392,7 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ .byte 0; \ @@ -400,7 +400,7 @@ l_true: \ .short 0" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #else @@ -418,7 +418,7 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ .byte 0; \ @@ -426,12 +426,15 @@ l_true: \ .short 0" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #endif #endif +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) From 0201027a026c7afcd00cbeb697197f142313090e Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Mar 2025 19:54:30 -0800 Subject: [PATCH 57/80] selftests/bpf: Introduce arena spin lock Implement queued spin lock algorithm as BPF program for lock words living in BPF arena. The algorithm is copied from kernel/locking/qspinlock.c and adapted for BPF use. We first implement abstract helpers for portable atomics and acquire/release load instructions, by relying on X86_64 presence to elide expensive barriers and rely on implementation details of the JIT, and fall back to slow but correct implementations elsewhere. When support for acquire/release load/stores lands, we can improve this state. Then, the qspinlock algorithm is adapted to remove dependence on multi-word atomics due to lack of support in BPF ISA. For instance, xchg_tail cannot use 16-bit xchg, and needs to be a implemented as a 32-bit try_cmpxchg loop. Loops which are seemingly infinite from verifier PoV are annotated with cond_break_label macro to return an error. Only 1024 NR_CPUs are supported. Note that the slow path is a global function, hence the verifier doesn't know the return value's precision. The recommended way of usage is to always test against zero for success, and not ret < 0 for error, as the verifier would assume ret > 0 has not been accounted for. Add comments in the function documentation about this quirk. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250306035431.2186189-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/bpf_arena_spin_lock.h | 512 ++++++++++++++++++ tools/testing/selftests/bpf/bpf_atomic.h | 140 +++++ 2 files changed, 652 insertions(+) create mode 100644 tools/testing/selftests/bpf/bpf_arena_spin_lock.h create mode 100644 tools/testing/selftests/bpf/bpf_atomic.h diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h new file mode 100644 index 0000000000000..3aca389ce424a --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h @@ -0,0 +1,512 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include +#include +#include "bpf_atomic.h" + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +#define arena_spinlock_t struct qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { + struct arena_mcs_spinlock __arena *next; + int locked; + int count; +}; + +struct arena_qnode { + struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES 4 +#define _Q_PENDING_LOOPS 1 + +/* + * Bitfields in the atomic value: + * + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS 1024 + +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 8 +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS 8 +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS 2 +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ + u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ + return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + /* These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = old | _Q_PENDING_VAL; + /* + * These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ + int val = atomic_read(&lock->val); + + if (unlikely(val)) + return 0; + + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; + int ret = -ETIMEDOUT; + u32 old, tail; + int idx; + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed_label(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--, + release_err); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = arena_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; + idx = node0->count++; + tail = encode_tail(bpf_get_smp_processor_id(), idx); + + /* + * 4 nodes are allocated based on the assumption that there will not be + * nested NMIs taking spinlocks. That may not be true in some + * architectures even though the chance of needing more than 4 nodes + * will still be extremely unlikely. When that happens, we simply return + * an error. Original qspinlock has a trylock fallback in this case. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + ret = -EBUSY; + goto release_node_err; + } + + node = grab_mcs_node(node0, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (arena_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We cannot prefetch here + * due to lack of equivalent instruction in BPF ISA. + */ + next = READ_ONCE(node->next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + */ + val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + release_node_err); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + + arch_mcs_spin_unlock_contended(&next->locked); + +release:; + /* + * release the node + * + * Doing a normal dec vs this_cpu_dec is fine. An upper context always + * decrements count it incremented before returning, thus we're fine. + * For contexts interrupting us, they either observe our dec or not. + * Just ensure the compiler doesn't reorder this statement, as a + * this_cpu_dec implicitly implied that. + */ + barrier(); + node0->count--; + return 0; +release_node_err: + barrier(); + node0->count--; + goto release_err; +release_err: + return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + * if (arena_spin_lock(...) != 0) // failure + * or + * if (arena_spin_lock(...) == 0) // success + * or + * if (arena_spin_lock(...)) // failure + * or + * if (!arena_spin_lock(...)) // success + * instead of: + * if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. + */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ + int val = 0; + + if (CONFIG_NR_CPUS > 1024) + return -EOPNOTSUPP; + + bpf_preempt_disable(); + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return 0; + + val = arena_spin_lock_slowpath(lock, val); + /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ + if (val) + bpf_preempt_enable(); + return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ + /* + * unlock() needs release semantics: + */ + smp_store_release(&lock->locked, 0); + bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + bpf_local_irq_save(&(flags)); \ + __ret = arena_spin_lock((lock)); \ + if (__ret) \ + bpf_local_irq_restore(&(flags)); \ + (__ret); \ + }) + +#define arena_spin_unlock_irqrestore(lock, flags) \ + ({ \ + arena_spin_unlock((lock)); \ + bpf_local_irq_restore(&(flags)); \ + }) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h new file mode 100644 index 0000000000000..a9674e5443222 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_atomic.h @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ATOMIC_H +#define BPF_ATOMIC_H + +#include +#include +#include "bpf_experimental.h" + +extern bool CONFIG_X86_64 __kconfig __weak; + +/* + * __unqual_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged, + * + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case. + * + * This is copied verbatim from kernel's include/linux/compiler_types.h, but + * with default expression (for pointers) changed from (x) to (typeof(x)0). + * + * This is because LLVM has a bug where for lvalue (x), it does not get rid of + * an extra address_space qualifier, but does in case of rvalue (typeof(x)0). + * Hence, for pointers, we need to create an rvalue expression to get the + * desired type. See https://github.com/llvm/llvm-project/issues/53400. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type : (unsigned type)0, signed type : (signed type)0 + +#define __unqual_typeof(x) \ + typeof(_Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (typeof(x))0)) + +/* No-op for BPF */ +#define cpu_relax() ({}) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) + +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) + +#define try_cmpxchg(p, pold, new) \ + ({ \ + __unqual_typeof(*(pold)) __o = *(pold); \ + __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \ + if (__r != __o) \ + *(pold) = __r; \ + __r == __o; \ + }) + +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new) + +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new) + +#define smp_mb() \ + ({ \ + unsigned long __val; \ + __sync_fetch_and_add(&__val, 0); \ + }) + +#define smp_rmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +#define smp_wmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */ +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); }) + +#define smp_load_acquire(p) \ + ({ \ + __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + __v; \ + }) + +#define smp_store_release(p, val) \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + WRITE_ONCE(*(p), val); \ + }) + +#define smp_cond_load_relaxed_label(p, cond_expr, label) \ + ({ \ + typeof(p) __ptr = (p); \ + __unqual_typeof(*(p)) VAL; \ + for (;;) { \ + VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \ + if (cond_expr) \ + break; \ + cond_break_label(label); \ + cpu_relax(); \ + } \ + (typeof(*(p)))VAL; \ + }) + +#define smp_cond_load_acquire_label(p, cond_expr, label) \ + ({ \ + __unqual_typeof(*p) __val = \ + smp_cond_load_relaxed_label(p, cond_expr, label); \ + smp_acquire__after_ctrl_dep(); \ + (typeof(*(p)))__val; \ + }) + +#define atomic_read(p) READ_ONCE((p)->counter) + +#define atomic_cond_read_relaxed_label(p, cond_expr, label) \ + smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label) + +#define atomic_cond_read_acquire_label(p, cond_expr, label) \ + smp_cond_load_acquire_label(&(p)->counter, cond_expr, label) + +#define atomic_try_cmpxchg_relaxed(p, pold, new) \ + try_cmpxchg_relaxed(&(p)->counter, pold, new) + +#define atomic_try_cmpxchg_acquire(p, pold, new) \ + try_cmpxchg_acquire(&(p)->counter, pold, new) + +#endif /* BPF_ATOMIC_H */ From 313149fd0a0772b7e5b8a689277e1b21e69093b0 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Wed, 5 Mar 2025 19:54:31 -0800 Subject: [PATCH 58/80] selftests/bpf: Add tests for arena spin lock Add some basic selftests for qspinlock built over BPF arena using cond_break_label macro. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20250306035431.2186189-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../bpf/prog_tests/arena_spin_lock.c | 108 ++++++++++++++++++ .../selftests/bpf/progs/arena_spin_lock.c | 51 +++++++++ 2 files changed, 159 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c create mode 100644 tools/testing/selftests/bpf/progs/arena_spin_lock.c diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c new file mode 100644 index 0000000000000..bc3616ba891c2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +struct qspinlock { int val; }; +typedef struct qspinlock arena_spinlock_t; + +struct arena_qnode { + unsigned long next; + int count; + int locked; +}; + +#include "arena_spin_lock.skel.h" + +static long cpu; +static int repeat; + +pthread_barrier_t barrier; + +static void *spin_lock_thread(void *arg) +{ + int err, prog_fd = *(u32 *)arg; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = repeat, + ); + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset), "cpu affinity"); + + err = pthread_barrier_wait(&barrier); + if (err != PTHREAD_BARRIER_SERIAL_THREAD && err != 0) + ASSERT_FALSE(true, "pthread_barrier"); + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run err"); + ASSERT_EQ((int)topts.retval, 0, "test_run retval"); + + pthread_exit(arg); +} + +static void test_arena_spin_lock_size(int size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct arena_spin_lock *skel; + pthread_t thread_id[16]; + int prog_fd, i, err; + void *ret; + + if (get_nprocs() < 2) { + test__skip(); + return; + } + + skel = arena_spin_lock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_spin_lock__open_and_load")) + return; + if (skel->data->test_skip == 2) { + test__skip(); + goto end; + } + skel->bss->cs_count = size; + skel->bss->limit = repeat * 16; + + ASSERT_OK(pthread_barrier_init(&barrier, NULL, 16), "barrier init"); + + prog_fd = bpf_program__fd(skel->progs.prog); + for (i = 0; i < 16; i++) { + err = pthread_create(&thread_id[i], NULL, &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + goto end_barrier; + } + + for (i = 0; i < 16; i++) { + if (!ASSERT_OK(pthread_join(thread_id[i], &ret), "pthread_join")) + goto end_barrier; + if (!ASSERT_EQ(ret, &prog_fd, "ret == prog_fd")) + goto end_barrier; + } + + ASSERT_EQ(skel->bss->counter, repeat * 16, "check counter value"); + +end_barrier: + pthread_barrier_destroy(&barrier); +end: + arena_spin_lock__destroy(skel); + return; +} + +void test_arena_spin_lock(void) +{ + repeat = 1000; + if (test__start_subtest("arena_spin_lock_1")) + test_arena_spin_lock_size(1); + cpu = 0; + if (test__start_subtest("arena_spin_lock_1000")) + test_arena_spin_lock_size(1000); + cpu = 0; + repeat = 100; + if (test__start_subtest("arena_spin_lock_50000")) + test_arena_spin_lock_size(50000); +} diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c new file mode 100644 index 0000000000000..c4500c37f85e0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_arena_spin_lock.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +int cs_count; + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +arena_spinlock_t __arena lock; +int test_skip = 1; +#else +int test_skip = 2; +#endif + +int counter; +int limit; + +SEC("tc") +int prog(void *ctx) +{ + int ret = -2; + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + unsigned long flags; + + if ((ret = arena_spin_lock_irqsave(&lock, flags))) + return ret; + if (counter != limit) + counter++; + bpf_repeat(cs_count); + ret = 0; + arena_spin_unlock_irqrestore(&lock, flags); +#endif + return ret; +} + +char _license[] SEC("license") = "GPL"; From 88b1c429671f475ee5bc11a00f5a31983ab98960 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Tue, 4 Mar 2025 10:24:51 +0100 Subject: [PATCH 59/80] selftests/bpf: Move test_lwt_ip_encap to test_progs test_lwt_ip_encap.sh isn't used by the BPF CI. Add a new file in the test_progs framework to migrate the tests done by test_lwt_ip_encap.sh. It uses the same network topology and the same BPF programs located in progs/test_lwt_ip_encap.c. Rework the GSO part to avoid using nc and dd. Remove test_lwt_ip_encap.sh and its Makefile entry. Signed-off-by: Bastien Curutchet (eBPF Foundation) Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250304-lwt_ip-v1-1-8fdeb9e79a56@bootlin.com --- tools/testing/selftests/bpf/Makefile | 3 +- .../selftests/bpf/prog_tests/lwt_ip_encap.c | 540 ++++++++++++++++++ .../selftests/bpf/test_lwt_ip_encap.sh | 476 --------------- 3 files changed, 541 insertions(+), 478 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c delete mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e6a02d5b87d12..df4814b5200a5 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -95,7 +95,7 @@ TEST_GEN_PROGS += test_progs-cpuv4 TEST_INST_SUBDIRS += cpuv4 endif -TEST_GEN_FILES = test_lwt_ip_encap.bpf.o test_tc_edt.bpf.o +TEST_GEN_FILES = test_tc_edt.bpf.o TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order @@ -104,7 +104,6 @@ TEST_PROGS := test_kmod.sh \ test_lirc_mode2.sh \ test_xdp_vlan_mode_generic.sh \ test_xdp_vlan_mode_native.sh \ - test_lwt_ip_encap.sh \ test_tc_tunnel.sh \ test_tc_edt.sh \ test_xdping.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c new file mode 100644 index 0000000000000..b6391af5f6f96 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include "network_helpers.h" +#include "test_progs.h" + +#define BPF_FILE "test_lwt_ip_encap.bpf.o" + +#define NETNS_NAME_SIZE 32 +#define NETNS_BASE "ns-lwt-ip-encap" + +#define IP4_ADDR_1 "172.16.1.100" +#define IP4_ADDR_2 "172.16.2.100" +#define IP4_ADDR_3 "172.16.3.100" +#define IP4_ADDR_4 "172.16.4.100" +#define IP4_ADDR_5 "172.16.5.100" +#define IP4_ADDR_6 "172.16.6.100" +#define IP4_ADDR_7 "172.16.7.100" +#define IP4_ADDR_8 "172.16.8.100" +#define IP4_ADDR_GRE "172.16.16.100" + +#define IP4_ADDR_SRC IP4_ADDR_1 +#define IP4_ADDR_DST IP4_ADDR_4 + +#define IP6_ADDR_1 "fb01::1" +#define IP6_ADDR_2 "fb02::1" +#define IP6_ADDR_3 "fb03::1" +#define IP6_ADDR_4 "fb04::1" +#define IP6_ADDR_5 "fb05::1" +#define IP6_ADDR_6 "fb06::1" +#define IP6_ADDR_7 "fb07::1" +#define IP6_ADDR_8 "fb08::1" +#define IP6_ADDR_GRE "fb10::1" + +#define IP6_ADDR_SRC IP6_ADDR_1 +#define IP6_ADDR_DST IP6_ADDR_4 + +/* Setup/topology: + * + * NS1 NS2 NS3 + * veth1 <---> veth2 veth3 <---> veth4 (the top route) + * veth5 <---> veth6 veth7 <---> veth8 (the bottom route) + * + * Each vethN gets IP[4|6]_ADDR_N address. + * + * IP*_ADDR_SRC = IP*_ADDR_1 + * IP*_ADDR_DST = IP*_ADDR_4 + * + * All tests test pings from IP*_ADDR__SRC to IP*_ADDR_DST. + * + * By default, routes are configured to allow packets to go + * IP*_ADDR_1 <=> IP*_ADDR_2 <=> IP*_ADDR_3 <=> IP*_ADDR_4 (the top route). + * + * A GRE device is installed in NS3 with IP*_ADDR_GRE, and + * NS1/NS2 are configured to route packets to IP*_ADDR_GRE via IP*_ADDR_8 + * (the bottom route). + * + * Tests: + * + * 1. Routes NS2->IP*_ADDR_DST are brought down, so the only way a ping + * from IP*_ADDR_SRC to IP*_ADDR_DST can work is via IP*_ADDR_GRE. + * + * 2a. In an egress test, a bpf LWT_XMIT program is installed on veth1 + * that encaps the packets with an IP/GRE header to route to IP*_ADDR_GRE. + * + * ping: SRC->[encap at veth1:egress]->GRE:decap->DST + * ping replies go DST->SRC directly + * + * 2b. In an ingress test, a bpf LWT_IN program is installed on veth2 + * that encaps the packets with an IP/GRE header to route to IP*_ADDR_GRE. + * + * ping: SRC->[encap at veth2:ingress]->GRE:decap->DST + * ping replies go DST->SRC directly + */ + +static int create_ns(char *name, size_t name_sz) +{ + if (!name) + goto fail; + + if (!ASSERT_OK(append_tid(name, name_sz), "append TID")) + goto fail; + + SYS(fail, "ip netns add %s", name); + + /* rp_filter gets confused by what these tests are doing, so disable it */ + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.conf.all.rp_filter=0", name); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.conf.default.rp_filter=0", name); + /* Disable IPv6 DAD because it sometimes takes too long and fails tests */ + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.all.accept_dad=0", name); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.default.accept_dad=0", name); + + return 0; +fail: + return -1; +} + +static int set_top_addr(const char *ns1, const char *ns2, const char *ns3) +{ + SYS(fail, "ip -n %s a add %s/24 dev veth1", ns1, IP4_ADDR_1); + SYS(fail, "ip -n %s a add %s/24 dev veth2", ns2, IP4_ADDR_2); + SYS(fail, "ip -n %s a add %s/24 dev veth3", ns2, IP4_ADDR_3); + SYS(fail, "ip -n %s a add %s/24 dev veth4", ns3, IP4_ADDR_4); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth1", ns1, IP6_ADDR_1); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth2", ns2, IP6_ADDR_2); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth3", ns2, IP6_ADDR_3); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth4", ns3, IP6_ADDR_4); + + SYS(fail, "ip -n %s link set dev veth1 up", ns1); + SYS(fail, "ip -n %s link set dev veth2 up", ns2); + SYS(fail, "ip -n %s link set dev veth3 up", ns2); + SYS(fail, "ip -n %s link set dev veth4 up", ns3); + + return 0; +fail: + return 1; +} + +static int set_bottom_addr(const char *ns1, const char *ns2, const char *ns3) +{ + SYS(fail, "ip -n %s a add %s/24 dev veth5", ns1, IP4_ADDR_5); + SYS(fail, "ip -n %s a add %s/24 dev veth6", ns2, IP4_ADDR_6); + SYS(fail, "ip -n %s a add %s/24 dev veth7", ns2, IP4_ADDR_7); + SYS(fail, "ip -n %s a add %s/24 dev veth8", ns3, IP4_ADDR_8); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth5", ns1, IP6_ADDR_5); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth6", ns2, IP6_ADDR_6); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth7", ns2, IP6_ADDR_7); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth8", ns3, IP6_ADDR_8); + + SYS(fail, "ip -n %s link set dev veth5 up", ns1); + SYS(fail, "ip -n %s link set dev veth6 up", ns2); + SYS(fail, "ip -n %s link set dev veth7 up", ns2); + SYS(fail, "ip -n %s link set dev veth8 up", ns3); + + return 0; +fail: + return 1; +} + +static int configure_vrf(const char *ns1, const char *ns2) +{ + if (!ns1 || !ns2) + goto fail; + + SYS(fail, "ip -n %s link add red type vrf table 1001", ns1); + SYS(fail, "ip -n %s link set red up", ns1); + SYS(fail, "ip -n %s route add table 1001 unreachable default metric 8192", ns1); + SYS(fail, "ip -n %s -6 route add table 1001 unreachable default metric 8192", ns1); + SYS(fail, "ip -n %s link set veth1 vrf red", ns1); + SYS(fail, "ip -n %s link set veth5 vrf red", ns1); + + SYS(fail, "ip -n %s link add red type vrf table 1001", ns2); + SYS(fail, "ip -n %s link set red up", ns2); + SYS(fail, "ip -n %s route add table 1001 unreachable default metric 8192", ns2); + SYS(fail, "ip -n %s -6 route add table 1001 unreachable default metric 8192", ns2); + SYS(fail, "ip -n %s link set veth2 vrf red", ns2); + SYS(fail, "ip -n %s link set veth3 vrf red", ns2); + SYS(fail, "ip -n %s link set veth6 vrf red", ns2); + SYS(fail, "ip -n %s link set veth7 vrf red", ns2); + + return 0; +fail: + return -1; +} + +static int configure_ns1(const char *ns1, const char *vrf) +{ + struct nstoken *nstoken = NULL; + + if (!ns1 || !vrf) + goto fail; + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto fail; + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth1 %s", IP4_ADDR_2, vrf); + SYS(fail, "ip route add default dev veth1 via %s %s", IP4_ADDR_2, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth1 %s", IP6_ADDR_2, vrf); + SYS(fail, "ip -6 route add default dev veth1 via %s %s", IP6_ADDR_2, vrf); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth5 %s", IP4_ADDR_6, vrf); + SYS(fail, "ip route add %s/32 dev veth5 via %s %s", IP4_ADDR_7, IP4_ADDR_6, vrf); + SYS(fail, "ip route add %s/32 dev veth5 via %s %s", IP4_ADDR_8, IP4_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 %s", IP6_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 via %s %s", IP6_ADDR_7, IP6_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 via %s %s", IP6_ADDR_8, IP6_ADDR_6, vrf); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int configure_ns2(const char *ns2, const char *vrf) +{ + struct nstoken *nstoken = NULL; + + if (!ns2 || !vrf) + goto fail; + + nstoken = open_netns(ns2); + if (!ASSERT_OK_PTR(nstoken, "open ns2")) + goto fail; + + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.ip_forward=1", ns2); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.all.forwarding=1", ns2); + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth2 %s", IP4_ADDR_1, vrf); + SYS(fail, "ip route add %s/32 dev veth3 %s", IP4_ADDR_4, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth2 %s", IP6_ADDR_1, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth3 %s", IP6_ADDR_4, vrf); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth6 %s", IP4_ADDR_5, vrf); + SYS(fail, "ip route add %s/32 dev veth7 %s", IP4_ADDR_8, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth6 %s", IP6_ADDR_5, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth7 %s", IP6_ADDR_8, vrf); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int configure_ns3(const char *ns3) +{ + struct nstoken *nstoken = NULL; + + if (!ns3) + goto fail; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3")) + goto fail; + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth4", IP4_ADDR_3); + SYS(fail, "ip route add %s/32 dev veth4 via %s", IP4_ADDR_1, IP4_ADDR_3); + SYS(fail, "ip route add %s/32 dev veth4 via %s", IP4_ADDR_2, IP4_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4", IP6_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4 via %s", IP6_ADDR_1, IP6_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4 via %s", IP6_ADDR_2, IP6_ADDR_3); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth8", IP4_ADDR_7); + SYS(fail, "ip route add %s/32 dev veth8 via %s", IP4_ADDR_5, IP4_ADDR_7); + SYS(fail, "ip route add %s/32 dev veth8 via %s", IP4_ADDR_6, IP4_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8", IP6_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8 via %s", IP6_ADDR_5, IP6_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8 via %s", IP6_ADDR_6, IP6_ADDR_7); + + /* Configure IPv4 GRE device */ + SYS(fail, "ip tunnel add gre_dev mode gre remote %s local %s ttl 255", + IP4_ADDR_1, IP4_ADDR_GRE); + SYS(fail, "ip link set gre_dev up"); + SYS(fail, "ip a add %s dev gre_dev", IP4_ADDR_GRE); + + /* Configure IPv6 GRE device */ + SYS(fail, "ip tunnel add gre6_dev mode ip6gre remote %s local %s ttl 255", + IP6_ADDR_1, IP6_ADDR_GRE); + SYS(fail, "ip link set gre6_dev up"); + SYS(fail, "ip a add %s dev gre6_dev", IP6_ADDR_GRE); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int setup_network(char *ns1, char *ns2, char *ns3, const char *vrf) +{ + if (!ns1 || !ns2 || !ns3 || !vrf) + goto fail; + + SYS(fail, "ip -n %s link add veth1 type veth peer name veth2 netns %s", ns1, ns2); + SYS(fail, "ip -n %s link add veth3 type veth peer name veth4 netns %s", ns2, ns3); + SYS(fail, "ip -n %s link add veth5 type veth peer name veth6 netns %s", ns1, ns2); + SYS(fail, "ip -n %s link add veth7 type veth peer name veth8 netns %s", ns2, ns3); + + if (vrf[0]) { + if (!ASSERT_OK(configure_vrf(ns1, ns2), "configure vrf")) + goto fail; + } + if (!ASSERT_OK(set_top_addr(ns1, ns2, ns3), "set top addresses")) + goto fail; + + if (!ASSERT_OK(set_bottom_addr(ns1, ns2, ns3), "set bottom addresses")) + goto fail; + + if (!ASSERT_OK(configure_ns1(ns1, vrf), "configure ns1 routes")) + goto fail; + + if (!ASSERT_OK(configure_ns2(ns2, vrf), "configure ns2 routes")) + goto fail; + + if (!ASSERT_OK(configure_ns3(ns3), "configure ns3 routes")) + goto fail; + + /* Link bottom route to the GRE tunnels */ + SYS(fail, "ip -n %s route add %s/32 dev veth5 via %s %s", + ns1, IP4_ADDR_GRE, IP4_ADDR_6, vrf); + SYS(fail, "ip -n %s route add %s/32 dev veth7 via %s %s", + ns2, IP4_ADDR_GRE, IP4_ADDR_8, vrf); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth5 via %s %s", + ns1, IP6_ADDR_GRE, IP6_ADDR_6, vrf); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth7 via %s %s", + ns2, IP6_ADDR_GRE, IP6_ADDR_8, vrf); + + return 0; +fail: + return -1; +} + +static int remove_routes_to_gredev(const char *ns1, const char *ns2, const char *vrf) +{ + SYS(fail, "ip -n %s route del %s dev veth5 %s", ns1, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s route del %s dev veth7 %s", ns2, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route del %s/128 dev veth5 %s", ns1, IP6_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route del %s/128 dev veth7 %s", ns2, IP6_ADDR_GRE, vrf); + + return 0; +fail: + return -1; +} + +static int add_unreachable_routes_to_gredev(const char *ns1, const char *ns2, const char *vrf) +{ + SYS(fail, "ip -n %s route add unreachable %s/32 %s", ns1, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s route add unreachable %s/32 %s", ns2, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route add unreachable %s/128 %s", ns1, IP6_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route add unreachable %s/128 %s", ns2, IP6_ADDR_GRE, vrf); + + return 0; +fail: + return -1; +} + +#define GSO_SIZE 5000 +#define GSO_TCP_PORT 9000 +/* This tests the fix from commit ea0371f78799 ("net: fix GSO in bpf_lwt_push_ip_encap") */ +static int test_gso_fix(const char *ns1, const char *ns3, int family) +{ + const char *ip_addr = family == AF_INET ? IP4_ADDR_DST : IP6_ADDR_DST; + char gso_packet[GSO_SIZE] = {}; + struct nstoken *nstoken = NULL; + int sfd, cfd, afd; + ssize_t bytes; + int ret = -1; + + if (!ns1 || !ns3) + return ret; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3")) + return ret; + + sfd = start_server_str(family, SOCK_STREAM, ip_addr, GSO_TCP_PORT, NULL); + if (!ASSERT_OK_FD(sfd, "start server")) + goto close_netns; + + close_netns(nstoken); + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto close_server; + + cfd = connect_to_addr_str(family, SOCK_STREAM, ip_addr, GSO_TCP_PORT, NULL); + if (!ASSERT_OK_FD(cfd, "connect to server")) + goto close_server; + + close_netns(nstoken); + nstoken = NULL; + + afd = accept(sfd, NULL, NULL); + if (!ASSERT_OK_FD(afd, "accept")) + goto close_client; + + /* Send a packet larger than MTU */ + bytes = send(cfd, gso_packet, GSO_SIZE, 0); + if (!ASSERT_EQ(bytes, GSO_SIZE, "send packet")) + goto close_accept; + + /* Verify we received all expected bytes */ + bytes = read(afd, gso_packet, GSO_SIZE); + if (!ASSERT_EQ(bytes, GSO_SIZE, "receive packet")) + goto close_accept; + + ret = 0; + +close_accept: + close(afd); +close_client: + close(cfd); +close_server: + close(sfd); +close_netns: + close_netns(nstoken); + + return ret; +} + +static int check_ping_ok(const char *ns1) +{ + SYS(fail, "ip netns exec %s ping -c 1 -W1 -I veth1 %s > /dev/null", ns1, IP4_ADDR_DST); + SYS(fail, "ip netns exec %s ping6 -c 1 -W1 -I veth1 %s > /dev/null", ns1, IP6_ADDR_DST); + return 0; +fail: + return -1; +} + +static int check_ping_fails(const char *ns1) +{ + int ret; + + ret = SYS_NOFAIL("ip netns exec %s ping -c 1 -W1 -I veth1 %s", ns1, IP4_ADDR_DST); + if (!ret) + return -1; + + ret = SYS_NOFAIL("ip netns exec %s ping6 -c 1 -W1 -I veth1 %s", ns1, IP6_ADDR_DST); + if (!ret) + return -1; + + return 0; +} + +#define EGRESS true +#define INGRESS false +#define IPV4_ENCAP true +#define IPV6_ENCAP false +static void lwt_ip_encap(bool ipv4_encap, bool egress, const char *vrf) +{ + char ns1[NETNS_NAME_SIZE] = NETNS_BASE "-1-"; + char ns2[NETNS_NAME_SIZE] = NETNS_BASE "-2-"; + char ns3[NETNS_NAME_SIZE] = NETNS_BASE "-3-"; + char *sec = ipv4_encap ? "encap_gre" : "encap_gre6"; + + if (!vrf) + return; + + if (!ASSERT_OK(create_ns(ns1, NETNS_NAME_SIZE), "create ns1")) + goto out; + if (!ASSERT_OK(create_ns(ns2, NETNS_NAME_SIZE), "create ns2")) + goto out; + if (!ASSERT_OK(create_ns(ns3, NETNS_NAME_SIZE), "create ns3")) + goto out; + + if (!ASSERT_OK(setup_network(ns1, ns2, ns3, vrf), "setup network")) + goto out; + + /* By default, pings work */ + if (!ASSERT_OK(check_ping_ok(ns1), "ping OK")) + goto out; + + /* Remove NS2->DST routes, ping fails */ + SYS(out, "ip -n %s route del %s/32 dev veth3 %s", ns2, IP4_ADDR_DST, vrf); + SYS(out, "ip -n %s -6 route del %s/128 dev veth3 %s", ns2, IP6_ADDR_DST, vrf); + if (!ASSERT_OK(check_ping_fails(ns1), "ping expected fail")) + goto out; + + /* Install replacement routes (LWT/eBPF), pings succeed */ + if (egress) { + SYS(out, "ip -n %s route add %s encap bpf xmit obj %s sec %s dev veth1 %s", + ns1, IP4_ADDR_DST, BPF_FILE, sec, vrf); + SYS(out, "ip -n %s -6 route add %s encap bpf xmit obj %s sec %s dev veth1 %s", + ns1, IP6_ADDR_DST, BPF_FILE, sec, vrf); + } else { + SYS(out, "ip -n %s route add %s encap bpf in obj %s sec %s dev veth2 %s", + ns2, IP4_ADDR_DST, BPF_FILE, sec, vrf); + SYS(out, "ip -n %s -6 route add %s encap bpf in obj %s sec %s dev veth2 %s", + ns2, IP6_ADDR_DST, BPF_FILE, sec, vrf); + } + + if (!ASSERT_OK(check_ping_ok(ns1), "ping OK")) + goto out; + + /* Skip GSO tests with VRF: VRF routing needs properly assigned + * source IP/device, which is easy to do with ping but hard with TCP. + */ + if (egress && !vrf[0]) { + if (!ASSERT_OK(test_gso_fix(ns1, ns3, AF_INET), "test GSO")) + goto out; + } + + /* Negative test: remove routes to GRE devices: ping fails */ + if (!ASSERT_OK(remove_routes_to_gredev(ns1, ns2, vrf), "remove routes to gredev")) + goto out; + if (!ASSERT_OK(check_ping_fails(ns1), "ping expected fail")) + goto out; + + /* Another negative test */ + if (!ASSERT_OK(add_unreachable_routes_to_gredev(ns1, ns2, vrf), + "add unreachable routes")) + goto out; + ASSERT_OK(check_ping_fails(ns1), "ping expected fail"); + +out: + SYS_NOFAIL("ip netns del %s", ns1); + SYS_NOFAIL("ip netns del %s", ns2); + SYS_NOFAIL("ip netns del %s", ns3); +} + +void test_lwt_ip_encap_vrf_ipv6(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV6_ENCAP, EGRESS, "vrf red"); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV6_ENCAP, INGRESS, "vrf red"); +} + +void test_lwt_ip_encap_vrf_ipv4(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV4_ENCAP, EGRESS, "vrf red"); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV4_ENCAP, INGRESS, "vrf red"); +} + +void test_lwt_ip_encap_ipv6(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV6_ENCAP, EGRESS, ""); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV6_ENCAP, INGRESS, ""); +} + +void test_lwt_ip_encap_ipv4(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV4_ENCAP, EGRESS, ""); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV4_ENCAP, INGRESS, ""); +} diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh deleted file mode 100755 index 1e565f47aca94..0000000000000 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ /dev/null @@ -1,476 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Setup/topology: -# -# NS1 NS2 NS3 -# veth1 <---> veth2 veth3 <---> veth4 (the top route) -# veth5 <---> veth6 veth7 <---> veth8 (the bottom route) -# -# each vethN gets IPv[4|6]_N address -# -# IPv*_SRC = IPv*_1 -# IPv*_DST = IPv*_4 -# -# all tests test pings from IPv*_SRC to IPv*_DST -# -# by default, routes are configured to allow packets to go -# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route) -# -# a GRE device is installed in NS3 with IPv*_GRE, and -# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8 -# (the bottom route) -# -# Tests: -# -# 1. routes NS2->IPv*_DST are brought down, so the only way a ping -# from IP*_SRC to IP*_DST can work is via IPv*_GRE -# -# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1 -# that encaps the packets with an IP/GRE header to route to IPv*_GRE -# -# ping: SRC->[encap at veth1:egress]->GRE:decap->DST -# ping replies go DST->SRC directly -# -# 2b. in an ingress test, a bpf LWT_IN program is installed on veth2 -# that encaps the packets with an IP/GRE header to route to IPv*_GRE -# -# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST -# ping replies go DST->SRC directly - -BPF_FILE="test_lwt_ip_encap.bpf.o" -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" - -readonly IPv4_1="172.16.1.100" -readonly IPv4_2="172.16.2.100" -readonly IPv4_3="172.16.3.100" -readonly IPv4_4="172.16.4.100" -readonly IPv4_5="172.16.5.100" -readonly IPv4_6="172.16.6.100" -readonly IPv4_7="172.16.7.100" -readonly IPv4_8="172.16.8.100" -readonly IPv4_GRE="172.16.16.100" - -readonly IPv4_SRC=$IPv4_1 -readonly IPv4_DST=$IPv4_4 - -readonly IPv6_1="fb01::1" -readonly IPv6_2="fb02::1" -readonly IPv6_3="fb03::1" -readonly IPv6_4="fb04::1" -readonly IPv6_5="fb05::1" -readonly IPv6_6="fb06::1" -readonly IPv6_7="fb07::1" -readonly IPv6_8="fb08::1" -readonly IPv6_GRE="fb10::1" - -readonly IPv6_SRC=$IPv6_1 -readonly IPv6_DST=$IPv6_4 - -TEST_STATUS=0 -TESTS_SUCCEEDED=0 -TESTS_FAILED=0 - -TMPFILE="" - -process_test_results() -{ - if [[ "${TEST_STATUS}" -eq 0 ]] ; then - echo "PASS" - TESTS_SUCCEEDED=$((TESTS_SUCCEEDED+1)) - else - echo "FAIL" - TESTS_FAILED=$((TESTS_FAILED+1)) - fi -} - -print_test_summary_and_exit() -{ - echo "passed tests: ${TESTS_SUCCEEDED}" - echo "failed tests: ${TESTS_FAILED}" - if [ "${TESTS_FAILED}" -eq "0" ] ; then - exit 0 - else - exit 1 - fi -} - -setup() -{ - set -e # exit on error - TEST_STATUS=0 - - # create devices and namespaces - ip netns add "${NS1}" - ip netns add "${NS2}" - ip netns add "${NS3}" - - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 - - # disable IPv6 DAD because it sometimes takes too long and fails tests - ip netns exec ${NS1} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS3} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS1} sysctl -wq net.ipv6.conf.default.accept_dad=0 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.default.accept_dad=0 - ip netns exec ${NS3} sysctl -wq net.ipv6.conf.default.accept_dad=0 - - ip link add veth1 type veth peer name veth2 - ip link add veth3 type veth peer name veth4 - ip link add veth5 type veth peer name veth6 - ip link add veth7 type veth peer name veth8 - - ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1 - - ip link set veth1 netns ${NS1} - ip link set veth2 netns ${NS2} - ip link set veth3 netns ${NS2} - ip link set veth4 netns ${NS3} - ip link set veth5 netns ${NS1} - ip link set veth6 netns ${NS2} - ip link set veth7 netns ${NS2} - ip link set veth8 netns ${NS3} - - if [ ! -z "${VRF}" ] ; then - ip -netns ${NS1} link add red type vrf table 1001 - ip -netns ${NS1} link set red up - ip -netns ${NS1} route add table 1001 unreachable default metric 8192 - ip -netns ${NS1} -6 route add table 1001 unreachable default metric 8192 - ip -netns ${NS1} link set veth1 vrf red - ip -netns ${NS1} link set veth5 vrf red - - ip -netns ${NS2} link add red type vrf table 1001 - ip -netns ${NS2} link set red up - ip -netns ${NS2} route add table 1001 unreachable default metric 8192 - ip -netns ${NS2} -6 route add table 1001 unreachable default metric 8192 - ip -netns ${NS2} link set veth2 vrf red - ip -netns ${NS2} link set veth3 vrf red - ip -netns ${NS2} link set veth6 vrf red - ip -netns ${NS2} link set veth7 vrf red - fi - - # configure addesses: the top route (1-2-3-4) - ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1 - ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2 - ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3 - ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4 - ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1 - ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2 - ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3 - ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4 - - # configure addresses: the bottom route (5-6-7-8) - ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5 - ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6 - ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7 - ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8 - ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5 - ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6 - ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7 - ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8 - - ip -netns ${NS1} link set dev veth1 up - ip -netns ${NS2} link set dev veth2 up - ip -netns ${NS2} link set dev veth3 up - ip -netns ${NS3} link set dev veth4 up - ip -netns ${NS1} link set dev veth5 up - ip -netns ${NS2} link set dev veth6 up - ip -netns ${NS2} link set dev veth7 up - ip -netns ${NS3} link set dev veth8 up - - # configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default; - # the bottom route to specific bottom addresses - - # NS1 - # top route - ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1 ${VRF} - ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} ${VRF} # go top by default - ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1 ${VRF} - ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2} ${VRF} # go top by default - # bottom route - ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5 ${VRF} - ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6} ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6} ${VRF} - - # NS2 - # top route - ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2 ${VRF} - ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3 ${VRF} - # bottom route - ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6 ${VRF} - ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7 ${VRF} - - # NS3 - # top route - ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4 - ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3} - ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3} - ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4 - ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3} - ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3} - # bottom route - ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8 - ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7} - ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7} - ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8 - ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7} - ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7} - - # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route - ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255 - ip -netns ${NS3} link set gre_dev up - ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev - ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8} ${VRF} - - - # configure IPv6 GRE device in NS3, and a route to it via the "bottom" route - ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255 - ip -netns ${NS3} link set gre6_dev up - ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev - ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) - - sleep 1 # reduce flakiness - set +e -} - -cleanup() -{ - if [ -f ${TMPFILE} ] ; then - rm ${TMPFILE} - fi - - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null -} - -trap cleanup EXIT - -remove_routes_to_gredev() -{ - ip -netns ${NS1} route del ${IPv4_GRE} dev veth5 ${VRF} - ip -netns ${NS2} route del ${IPv4_GRE} dev veth7 ${VRF} - ip -netns ${NS1} -6 route del ${IPv6_GRE}/128 dev veth5 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_GRE}/128 dev veth7 ${VRF} -} - -add_unreachable_routes_to_gredev() -{ - ip -netns ${NS1} route add unreachable ${IPv4_GRE}/32 ${VRF} - ip -netns ${NS2} route add unreachable ${IPv4_GRE}/32 ${VRF} - ip -netns ${NS1} -6 route add unreachable ${IPv6_GRE}/128 ${VRF} - ip -netns ${NS2} -6 route add unreachable ${IPv6_GRE}/128 ${VRF} -} - -test_ping() -{ - local readonly PROTO=$1 - local readonly EXPECTED=$2 - local RET=0 - - if [ "${PROTO}" == "IPv4" ] ; then - ip netns exec ${NS1} ping -c 1 -W 1 -I veth1 ${IPv4_DST} 2>&1 > /dev/null - RET=$? - elif [ "${PROTO}" == "IPv6" ] ; then - ip netns exec ${NS1} ping6 -c 1 -W 1 -I veth1 ${IPv6_DST} 2>&1 > /dev/null - RET=$? - else - echo " test_ping: unknown PROTO: ${PROTO}" - TEST_STATUS=1 - fi - - if [ "0" != "${RET}" ]; then - RET=1 - fi - - if [ "${EXPECTED}" != "${RET}" ] ; then - echo " test_ping failed: expected: ${EXPECTED}; got ${RET}" - TEST_STATUS=1 - fi -} - -test_gso() -{ - local readonly PROTO=$1 - local readonly PKT_SZ=5000 - local IP_DST="" - : > ${TMPFILE} # trim the capture file - - # check that nc is present - command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available: skipping TSO tests"; return; } - - # listen on port 9000, capture TCP into $TMPFILE - if [ "${PROTO}" == "IPv4" ] ; then - IP_DST=${IPv4_DST} - ip netns exec ${NS3} bash -c \ - "nc -4 -l -p 9000 > ${TMPFILE} &" - elif [ "${PROTO}" == "IPv6" ] ; then - IP_DST=${IPv6_DST} - ip netns exec ${NS3} bash -c \ - "nc -6 -l -p 9000 > ${TMPFILE} &" - RET=$? - else - echo " test_gso: unknown PROTO: ${PROTO}" - TEST_STATUS=1 - fi - sleep 1 # let nc start listening - - # send a packet larger than MTU - ip netns exec ${NS1} bash -c \ - "dd if=/dev/zero bs=$PKT_SZ count=1 > /dev/tcp/${IP_DST}/9000 2>/dev/null" - sleep 2 # let the packet get delivered - - # verify we received all expected bytes - SZ=$(stat -c %s ${TMPFILE}) - if [ "$SZ" != "$PKT_SZ" ] ; then - echo " test_gso failed: ${PROTO}" - TEST_STATUS=1 - fi -} - -test_egress() -{ - local readonly ENCAP=$1 - echo "starting egress ${ENCAP} encap test ${VRF}" - setup - - # by default, pings work - test_ping IPv4 0 - test_ping IPv6 0 - - # remove NS2->DST routes, ping fails - ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF} - test_ping IPv4 1 - test_ping IPv6 1 - - # install replacement routes (LWT/eBPF), pings succeed - if [ "${ENCAP}" == "IPv4" ] ; then - ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre dev veth1 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre dev veth1 ${VRF} - elif [ "${ENCAP}" == "IPv6" ] ; then - ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre6 dev veth1 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre6 dev veth1 ${VRF} - else - echo " unknown encap ${ENCAP}" - TEST_STATUS=1 - fi - test_ping IPv4 0 - test_ping IPv6 0 - - # skip GSO tests with VRF: VRF routing needs properly assigned - # source IP/device, which is easy to do with ping and hard with dd/nc. - if [ -z "${VRF}" ] ; then - test_gso IPv4 - test_gso IPv6 - fi - - # a negative test: remove routes to GRE devices: ping fails - remove_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - # another negative test - add_unreachable_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - cleanup - process_test_results -} - -test_ingress() -{ - local readonly ENCAP=$1 - echo "starting ingress ${ENCAP} encap test ${VRF}" - setup - - # need to wait a bit for IPv6 to autoconf, otherwise - # ping6 sometimes fails with "unable to bind to address" - - # by default, pings work - test_ping IPv4 0 - test_ping IPv6 0 - - # remove NS2->DST routes, pings fail - ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF} - test_ping IPv4 1 - test_ping IPv6 1 - - # install replacement routes (LWT/eBPF), pings succeed - if [ "${ENCAP}" == "IPv4" ] ; then - ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre dev veth2 ${VRF} - elif [ "${ENCAP}" == "IPv6" ] ; then - ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre6 dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre6 dev veth2 ${VRF} - else - echo "FAIL: unknown encap ${ENCAP}" - TEST_STATUS=1 - fi - test_ping IPv4 0 - test_ping IPv6 0 - - # a negative test: remove routes to GRE devices: ping fails - remove_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - # another negative test - add_unreachable_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - cleanup - process_test_results -} - -VRF="" -test_egress IPv4 -test_egress IPv6 -test_ingress IPv4 -test_ingress IPv6 - -VRF="vrf red" -test_egress IPv4 -test_egress IPv6 -test_ingress IPv4 -test_ingress IPv6 - -print_test_summary_and_exit From 5cb4077d3ae8a41249633364214e1784faab34db Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 5 Mar 2025 10:20:55 -0800 Subject: [PATCH 60/80] selftests/bpf: Clean up call sites of stdio_restore() reset_affinity() and save_ns() are only called in run_one_test(). There is no need to call stdio_restore() in reset_affinity() and save_ns() if stdio_restore() is moved right after a test finishes in run_one_test(). Also remove an unnecessary check of env.stdout_saved in crash_handler() by moving env.stdout_saved assignment to the beginning of main(). Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250305182057.2802606-1-ameryhung@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 0cb759632225d..ab0f2fed3c580 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -474,8 +474,6 @@ static void dump_test_log(const struct prog_test_def *test, print_test_result(test, test_state); } -static void stdio_restore(void); - /* A bunch of tests set custom affinity per-thread and/or per-process. Reset * it after each test/sub-test. */ @@ -490,13 +488,11 @@ static void reset_affinity(void) err = sched_setaffinity(0, sizeof(cpuset), &cpuset); if (err < 0) { - stdio_restore(); fprintf(stderr, "Failed to reset process affinity: %d!\n", err); exit(EXIT_ERR_SETUP_INFRA); } err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); if (err < 0) { - stdio_restore(); fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); exit(EXIT_ERR_SETUP_INFRA); } @@ -514,7 +510,6 @@ static void save_netns(void) static void restore_netns(void) { if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) { - stdio_restore(); perror("setns(CLONE_NEWNS)"); exit(EXIT_ERR_SETUP_INFRA); } @@ -1270,8 +1265,7 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - if (env.stdout_saved) - stdio_restore(); + stdio_restore(); if (env.test) { env.test_state->error_cnt++; dump_test_log(env.test, env.test_state, true, false, NULL); @@ -1400,6 +1394,8 @@ static void run_one_test(int test_num) state->tested = true; + stdio_restore(); + if (verbose() && env.worker_id == -1) print_test_result(test, state); @@ -1408,7 +1404,6 @@ static void run_one_test(int test_num) if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); - stdio_restore(); free(stop_libbpf_log_capture()); dump_test_log(test, state, false, false, NULL); @@ -1943,6 +1938,9 @@ int main(int argc, char **argv) sigaction(SIGSEGV, &sigact, NULL); + env.stdout_saved = stdout; + env.stderr_saved = stderr; + env.secs_till_notify = 10; env.secs_till_kill = 120; err = argp_parse(&argp, argc, argv, 0, NULL, &env); @@ -1969,9 +1967,6 @@ int main(int argc, char **argv) return -1; } - env.stdout_saved = stdout; - env.stderr_saved = stderr; - env.has_testmod = true; if (!env.list_test_names) { /* ensure previous instance of the module is unloaded */ From 6d54a02c50c1bb1ad3972d413f5dc7eaaab17e13 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 5 Mar 2025 10:20:56 -0800 Subject: [PATCH 61/80] selftests/bpf: Allow assigning traffic monitor print function Allow users to change traffic monitor's print function. If not provided, traffic monitor will print to stdout by default. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Link: https://patch.msgid.link/20250305182057.2802606-2-ameryhung@gmail.com --- tools/testing/selftests/bpf/network_helpers.c | 69 ++++++++++++++----- tools/testing/selftests/bpf/network_helpers.h | 9 +++ 2 files changed, 59 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 737a952dcf80b..97fb7caf95f45 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -750,6 +750,36 @@ struct tmonitor_ctx { int pcap_fd; }; +static int __base_pr(const char *format, va_list args) +{ + return vfprintf(stdout, format, args); +} + +static tm_print_fn_t __tm_pr = __base_pr; + +tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) +{ + tm_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__tm_pr, fn, __ATOMIC_RELAXED); + + return old_print_fn; +} + +void tm_print(const char *format, ...) +{ + tm_print_fn_t print_fn; + va_list args; + + print_fn = __atomic_load_n(&__tm_pr, __ATOMIC_RELAXED); + if (!print_fn) + return; + + va_start(args, format); + print_fn(format, args); + va_end(args); +} + /* Is this packet captured with a Ethernet protocol type? */ static bool is_ethernet(const u_char *packet) { @@ -767,7 +797,7 @@ static bool is_ethernet(const u_char *packet) case 770: /* ARPHRD_FRAD */ case 778: /* ARPHDR_IPGRE */ case 803: /* ARPHRD_IEEE80211_RADIOTAP */ - printf("Packet captured: arphdr_type=%d\n", arphdr_type); + tm_print("Packet captured: arphdr_type=%d\n", arphdr_type); return false; } return true; @@ -817,19 +847,19 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, dst_port = ntohs(tcp->dest); transport_str = "TCP"; } else if (proto == IPPROTO_ICMP) { - printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", - ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, - packet[0], packet[1]); + tm_print("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); return; } else if (proto == IPPROTO_ICMPV6) { - printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", - ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, - packet[0], packet[1]); + tm_print("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); return; } else { - printf("%-7s %-3s %s %s > %s: protocol %d\n", - ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", - src_addr, dst_addr, proto); + tm_print("%-7s %-3s %s %s > %s: protocol %d\n", + ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", + src_addr, dst_addr, proto); return; } @@ -843,13 +873,13 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, tcp->ack ? ", ACK" : ""); if (ipv6) - printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", - ifname, pkt_type_str(pkt_type), src_addr, src_port, - dst_addr, dst_port, transport_str, len, flags); + tm_print("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len, flags); else - printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", - ifname, pkt_type_str(pkt_type), src_addr, src_port, - dst_addr, dst_port, transport_str, len, flags); + tm_print("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len, flags); } static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) @@ -964,8 +994,8 @@ static void *traffic_monitor_thread(void *arg) ifname = _ifname; } - printf("%-7s %-3s Unknown network protocol type 0x%x\n", - ifname, pkt_type_str(ptype), proto); + tm_print("%-7s %-3s Unknown network protocol type 0x%x\n", + ifname, pkt_type_str(ptype), proto); } } @@ -1165,8 +1195,9 @@ void traffic_monitor_stop(struct tmonitor_ctx *ctx) write(ctx->wake_fd, &w, sizeof(w)); pthread_join(ctx->thread, NULL); - printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + tm_print("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); traffic_monitor_release(ctx); } + #endif /* TRAFFIC_MONITOR */ diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 9f6e05d886c54..a4e20c12c57ef 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -17,6 +17,7 @@ typedef __u16 __sum16; #include #include #include +#include #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 @@ -249,10 +250,13 @@ static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, struct tmonitor_ctx; +typedef int (*tm_print_fn_t)(const char *format, va_list args); + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name); void traffic_monitor_stop(struct tmonitor_ctx *ctx); +tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn); #else static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name) @@ -263,6 +267,11 @@ static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, cons static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) { } + +static inline tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) +{ + return NULL; +} #endif #endif From 15bfc10814b8ee09d0f542ce8de3c93d48d72013 Mon Sep 17 00:00:00 2001 From: Amery Hung Date: Wed, 5 Mar 2025 10:20:57 -0800 Subject: [PATCH 62/80] selftests/bpf: Fix dangling stdout seen by traffic monitor thread Traffic monitor thread may see dangling stdout as the main thread closes and reassigns stdout without protection. This happens when the main thread finishes one subtest and moves to another one in the same netns_new() scope. The issue can be reproduced by running test_progs repeatedly with traffic monitor enabled: for ((i=1;i<=100;i++)); do ./test_progs -a flow_dissector_skb* -m '*' done For restoring stdout in crash_handler(), since it does not really care about closing stdout, simlpy flush stdout and restore it to the original one. Then, Fix the issue by consolidating stdio_restore_cleanup() and stdio_restore(), and protecting the use/close/assignment of stdout with a lock. The locking in the main thread is always performed regradless of whether traffic monitor is running or not for simplicity. It won't have any side-effect. Signed-off-by: Amery Hung Signed-off-by: Martin KaFai Lau Acked-by: Eduard Zingerman Link: https://patch.msgid.link/20250305182057.2802606-3-ameryhung@gmail.com --- tools/testing/selftests/bpf/test_progs.c | 39 +++++++++++++----------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index ab0f2fed3c580..d4ec9586b98ca 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) #endif } -static void stdio_restore_cleanup(void) +static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER; + +static void stdio_restore(void) { #ifdef __GLIBC__ if (verbose() && env.worker_id == -1) { @@ -98,6 +100,8 @@ static void stdio_restore_cleanup(void) fflush(stdout); + pthread_mutex_lock(&stdout_lock); + if (env.subtest_state) { fclose(env.subtest_state->stdout_saved); env.subtest_state->stdout_saved = NULL; @@ -106,26 +110,21 @@ static void stdio_restore_cleanup(void) } else { fclose(env.test_state->stdout_saved); env.test_state->stdout_saved = NULL; + stdout = env.stdout_saved; + stderr = env.stderr_saved; } + + pthread_mutex_unlock(&stdout_lock); #endif } -static void stdio_restore(void) +static int traffic_monitor_print_fn(const char *format, va_list args) { -#ifdef __GLIBC__ - if (verbose() && env.worker_id == -1) { - /* nothing to do, output to stdout by default */ - return; - } - - if (stdout == env.stdout_saved) - return; - - stdio_restore_cleanup(); + pthread_mutex_lock(&stdout_lock); + vfprintf(stdout, format, args); + pthread_mutex_unlock(&stdout_lock); - stdout = env.stdout_saved; - stderr = env.stderr_saved; -#endif + return 0; } /* Adapted from perf/util/string.c */ @@ -536,7 +535,8 @@ void test__end_subtest(void) test_result(subtest_state->error_cnt, subtest_state->skipped)); - stdio_restore_cleanup(); + stdio_restore(); + env.subtest_state = NULL; } @@ -1265,7 +1265,10 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - stdio_restore(); + fflush(stdout); + stdout = env.stdout_saved; + stderr = env.stderr_saved; + if (env.test) { env.test_state->error_cnt++; dump_test_log(env.test, env.test_state, true, false, NULL); @@ -1957,6 +1960,8 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); + traffic_monitor_set_print(traffic_monitor_print_fn); + srand(time(NULL)); env.jit_enabled = is_jit_enabled(); From 7e437dcd3975d13674b943b5fb6c6d4764fa68dc Mon Sep 17 00:00:00 2001 From: Feng Yang Date: Wed, 5 Mar 2025 10:22:34 +0800 Subject: [PATCH 63/80] selftests/bpf: Fix cap_enable_effective() return code The caller of cap_enable_effective() expects negative error code. Fix it. Before: failed to restore CAP_SYS_ADMIN: -1, Unknown error -1 After: failed to restore CAP_SYS_ADMIN: -3, No such process failed to restore CAP_SYS_ADMIN: -22, Invalid argument Signed-off-by: Feng Yang Acked-by: Eduard Zingerman Link: https://lore.kernel.org/r/20250305022234.44932-1-yangfeng59949@163.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/cap_helpers.c | 8 ++++---- tools/testing/selftests/bpf/cap_helpers.h | 1 + tools/testing/selftests/bpf/prog_tests/verifier.c | 4 ++-- tools/testing/selftests/bpf/test_loader.c | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/cap_helpers.c b/tools/testing/selftests/bpf/cap_helpers.c index d5ac507401d7c..98f840c3a38f7 100644 --- a/tools/testing/selftests/bpf/cap_helpers.c +++ b/tools/testing/selftests/bpf/cap_helpers.c @@ -19,7 +19,7 @@ int cap_enable_effective(__u64 caps, __u64 *old_caps) err = capget(&hdr, data); if (err) - return err; + return -errno; if (old_caps) *old_caps = (__u64)(data[1].effective) << 32 | data[0].effective; @@ -32,7 +32,7 @@ int cap_enable_effective(__u64 caps, __u64 *old_caps) data[1].effective |= cap1; err = capset(&hdr, data); if (err) - return err; + return -errno; return 0; } @@ -49,7 +49,7 @@ int cap_disable_effective(__u64 caps, __u64 *old_caps) err = capget(&hdr, data); if (err) - return err; + return -errno; if (old_caps) *old_caps = (__u64)(data[1].effective) << 32 | data[0].effective; @@ -61,7 +61,7 @@ int cap_disable_effective(__u64 caps, __u64 *old_caps) data[1].effective &= ~cap1; err = capset(&hdr, data); if (err) - return err; + return -errno; return 0; } diff --git a/tools/testing/selftests/bpf/cap_helpers.h b/tools/testing/selftests/bpf/cap_helpers.h index 6d163530cb0fd..8dcb28557f762 100644 --- a/tools/testing/selftests/bpf/cap_helpers.h +++ b/tools/testing/selftests/bpf/cap_helpers.h @@ -4,6 +4,7 @@ #include #include +#include #ifndef CAP_PERFMON #define CAP_PERFMON 38 diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index cfe47b529e01a..e66a57970d28c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -123,7 +123,7 @@ static void run_tests_aux(const char *skel_name, /* test_verifier tests are executed w/o CAP_SYS_ADMIN, do the same here */ err = cap_disable_effective(1ULL << CAP_SYS_ADMIN, &old_caps); if (err) { - PRINT_FAIL("failed to drop CAP_SYS_ADMIN: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to drop CAP_SYS_ADMIN: %i, %s\n", err, strerror(-err)); return; } @@ -133,7 +133,7 @@ static void run_tests_aux(const char *skel_name, err = cap_enable_effective(old_caps, NULL); if (err) - PRINT_FAIL("failed to restore CAP_SYS_ADMIN: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to restore CAP_SYS_ADMIN: %i, %s\n", err, strerror(-err)); } #define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL) diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 4d23a9c463ee2..49f2fc61061f5 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -793,7 +793,7 @@ static int drop_capabilities(struct cap_state *caps) err = cap_disable_effective(caps_to_drop, &caps->old_caps); if (err) { - PRINT_FAIL("failed to drop capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to drop capabilities: %i, %s\n", err, strerror(-err)); return err; } @@ -810,7 +810,7 @@ static int restore_capabilities(struct cap_state *caps) err = cap_enable_effective(caps->old_caps, NULL); if (err) - PRINT_FAIL("failed to restore capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to restore capabilities: %i, %s\n", err, strerror(-err)); caps->initialized = false; return err; } @@ -985,7 +985,7 @@ void run_subtest(struct test_loader *tester, if (subspec->caps) { err = cap_enable_effective(subspec->caps, NULL); if (err) { - PRINT_FAIL("failed to set capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to set capabilities: %i, %s\n", err, strerror(-err)); goto subtest_cleanup; } } From 359d07044dd5d5ba1012437b06f9b36469bd687a Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Fri, 7 Mar 2025 10:18:23 +0100 Subject: [PATCH 64/80] selftests/bpf: lwt_seg6local: Remove unused routes Some routes in fb00:: are initialized during setup, even though they aren't needed by the test as the UDP packets will travel through the lightweight tunnels. Remove these unnecessary routes. Signed-off-by: Bastien Curutchet (eBPF Foundation) Link: https://lore.kernel.org/r/20250307-seg6local-v1-1-990fff8f180d@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/test_lwt_seg6local.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh index 0efea2292d6aa..9c74b88730ffd 100755 --- a/tools/testing/selftests/bpf/test_lwt_seg6local.sh +++ b/tools/testing/selftests/bpf/test_lwt_seg6local.sh @@ -99,19 +99,14 @@ ip netns exec ${NS6} ip link set dev lo up # All link scope addresses and routes required between veths ip netns exec ${NS1} ip -6 addr add fb00::12/16 dev veth1 scope link -ip netns exec ${NS1} ip -6 route add fb00::21 dev veth1 scope link ip netns exec ${NS2} ip -6 addr add fb00::21/16 dev veth2 scope link ip netns exec ${NS2} ip -6 addr add fb00::34/16 dev veth3 scope link -ip netns exec ${NS2} ip -6 route add fb00::43 dev veth3 scope link -ip netns exec ${NS3} ip -6 route add fb00::65 dev veth5 scope link ip netns exec ${NS3} ip -6 addr add fb00::43/16 dev veth4 scope link ip netns exec ${NS3} ip -6 addr add fb00::56/16 dev veth5 scope link ip netns exec ${NS4} ip -6 addr add fb00::65/16 dev veth6 scope link ip netns exec ${NS4} ip -6 addr add fb00::78/16 dev veth7 scope link -ip netns exec ${NS4} ip -6 route add fb00::87 dev veth7 scope link ip netns exec ${NS5} ip -6 addr add fb00::87/16 dev veth8 scope link ip netns exec ${NS5} ip -6 addr add fb00::910/16 dev veth9 scope link -ip netns exec ${NS5} ip -6 route add fb00::109 dev veth9 scope link ip netns exec ${NS5} ip -6 route add fb00::109 table 117 dev veth9 scope link ip netns exec ${NS6} ip -6 addr add fb00::109/16 dev veth10 scope link From 3fb97a2b2f2d3f8c7fe2f447ac1b3fc31c12b579 Mon Sep 17 00:00:00 2001 From: "Bastien Curutchet (eBPF Foundation)" Date: Fri, 7 Mar 2025 10:18:24 +0100 Subject: [PATCH 65/80] selftests/bpf: lwt_seg6local: Move test to test_progs test_lwt_seg6local.sh isn't used by the BPF CI. Add a new file in the test_progs framework to migrate the tests done by test_lwt_seg6local.sh. It uses the same network topology and the same BPF programs located in progs/test_lwt_seg6local.c. Use the network helpers instead of `nc` to exchange the final packet. Remove test_lwt_seg6local.sh and its Makefile entry. Signed-off-by: Bastien Curutchet (eBPF Foundation) Link: https://lore.kernel.org/r/20250307-seg6local-v1-2-990fff8f180d@bootlin.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/Makefile | 1 - .../selftests/bpf/prog_tests/lwt_seg6local.c | 176 ++++++++++++++++++ .../selftests/bpf/test_lwt_seg6local.sh | 151 --------------- 3 files changed, 176 insertions(+), 152 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c delete mode 100755 tools/testing/selftests/bpf/test_lwt_seg6local.sh diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index df4814b5200a5..7393050648390 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -100,7 +100,6 @@ TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ - test_lwt_seg6local.sh \ test_lirc_mode2.sh \ test_xdp_vlan_mode_generic.sh \ test_xdp_vlan_mode_native.sh \ diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c new file mode 100644 index 0000000000000..3bc730b7c7fa0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Connects 6 network namespaces through veths. + * Each NS may have different IPv6 global scope addresses : + * + * NS1 NS2 NS3 NS4 NS5 NS6 + * lo veth1 <-> veth2 veth3 <-> veth4 veth5 <-> veth6 lo veth7 <-> veth8 veth9 <-> veth10 lo + * fb00 ::1 ::12 ::21 ::34 ::43 ::56 ::65 ::78 ::87 ::910 ::109 ::6 + * fd00 ::4 + * fc42 ::1 + * + * All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a + * IPv6 header with a Segment Routing Header, with segments : + * fd00::1 -> fd00::2 -> fd00::3 -> fd00::4 + * + * 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions : + * - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1 + * - fd00::2 : remove the TLV, change the flags, add a tag + * - fd00::3 : apply an End.T action to fd00::4, through routing table 117 + * + * fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet. + * Each End.BPF action will validate the operations applied on the SRH by the + * previous BPF program in the chain, otherwise the packet is dropped. + * + * An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this + * datagram can be read on NS6 when binding to fb00::6. + */ + +#include "network_helpers.h" +#include "test_progs.h" + +#define NETNS_BASE "lwt-seg6local-" +#define BPF_FILE "test_lwt_seg6local.bpf.o" + +static void cleanup(void) +{ + int ns; + + for (ns = 1; ns < 7; ns++) + SYS_NOFAIL("ip netns del %s%d", NETNS_BASE, ns); +} + +static int setup(void) +{ + int ns; + + for (ns = 1; ns < 7; ns++) + SYS(fail, "ip netns add %s%d", NETNS_BASE, ns); + + SYS(fail, "ip -n %s6 link set dev lo up", NETNS_BASE); + + for (ns = 1; ns < 6; ns++) { + int local_id = ns * 2 - 1; + int peer_id = ns * 2; + int next_ns = ns + 1; + + SYS(fail, "ip -n %s%d link add veth%d type veth peer name veth%d netns %s%d", + NETNS_BASE, ns, local_id, peer_id, NETNS_BASE, next_ns); + + SYS(fail, "ip -n %s%d link set dev veth%d up", NETNS_BASE, ns, local_id); + SYS(fail, "ip -n %s%d link set dev veth%d up", NETNS_BASE, next_ns, peer_id); + + /* All link scope addresses to veths */ + SYS(fail, "ip -n %s%d -6 addr add fb00::%d%d/16 dev veth%d scope link", + NETNS_BASE, ns, local_id, peer_id, local_id); + SYS(fail, "ip -n %s%d -6 addr add fb00::%d%d/16 dev veth%d scope link", + NETNS_BASE, next_ns, peer_id, local_id, peer_id); + } + + + SYS(fail, "ip -n %s5 -6 route add fb00::109 table 117 dev veth9 scope link", NETNS_BASE); + + SYS(fail, "ip -n %s1 -6 addr add fb00::1/16 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s1 -6 route add fb00::6 dev veth1 via fb00::21", NETNS_BASE); + + SYS(fail, "ip -n %s2 -6 route add fb00::6 encap bpf in obj %s sec encap_srh dev veth2", + NETNS_BASE, BPF_FILE); + SYS(fail, "ip -n %s2 -6 route add fd00::1 dev veth3 via fb00::43 scope link", NETNS_BASE); + + SYS(fail, "ip -n %s3 -6 route add fc42::1 dev veth5 via fb00::65", NETNS_BASE); + SYS(fail, + "ip -n %s3 -6 route add fd00::1 encap seg6local action End.BPF endpoint obj %s sec add_egr_x dev veth4", + NETNS_BASE, BPF_FILE); + + SYS(fail, + "ip -n %s4 -6 route add fd00::2 encap seg6local action End.BPF endpoint obj %s sec pop_egr dev veth6", + NETNS_BASE, BPF_FILE); + SYS(fail, "ip -n %s4 -6 addr add fc42::1 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s4 -6 route add fd00::3 dev veth7 via fb00::87", NETNS_BASE); + + SYS(fail, "ip -n %s5 -6 route add fd00::4 table 117 dev veth9 via fb00::109", NETNS_BASE); + SYS(fail, + "ip -n %s5 -6 route add fd00::3 encap seg6local action End.BPF endpoint obj %s sec inspect_t dev veth8", + NETNS_BASE, BPF_FILE); + + SYS(fail, "ip -n %s6 -6 addr add fb00::6/16 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s6 -6 addr add fd00::4/16 dev lo", NETNS_BASE); + + for (ns = 1; ns < 6; ns++) + SYS(fail, "ip netns exec %s%d sysctl -wq net.ipv6.conf.all.forwarding=1", + NETNS_BASE, ns); + + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.all.seg6_enabled=1", NETNS_BASE); + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.lo.seg6_enabled=1", NETNS_BASE); + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.veth10.seg6_enabled=1", NETNS_BASE); + + return 0; +fail: + return -1; +} + +#define SERVER_PORT 7330 +#define CLIENT_PORT 2121 +void test_lwt_seg6local(void) +{ + struct sockaddr_in6 server_addr = {}; + const char *ns1 = NETNS_BASE "1"; + const char *ns6 = NETNS_BASE "6"; + struct nstoken *nstoken = NULL; + const char *foobar = "foobar"; + ssize_t bytes; + int sfd, cfd; + char buf[7]; + + if (!ASSERT_OK(setup(), "setup")) + goto out; + + nstoken = open_netns(ns6); + if (!ASSERT_OK_PTR(nstoken, "open ns6")) + goto out; + + sfd = start_server_str(AF_INET6, SOCK_DGRAM, "fb00::6", SERVER_PORT, NULL); + if (!ASSERT_OK_FD(sfd, "start server")) + goto close_netns; + + close_netns(nstoken); + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto close_server; + + cfd = start_server_str(AF_INET6, SOCK_DGRAM, "fb00::1", CLIENT_PORT, NULL); + if (!ASSERT_OK_FD(cfd, "start client")) + goto close_server; + + close_netns(nstoken); + nstoken = NULL; + + /* Send a packet larger than MTU */ + server_addr.sin6_family = AF_INET6; + server_addr.sin6_port = htons(SERVER_PORT); + if (!ASSERT_EQ(inet_pton(AF_INET6, "fb00::6", &server_addr.sin6_addr), 1, + "build target addr")) + goto close_client; + + bytes = sendto(cfd, foobar, sizeof(foobar), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + if (!ASSERT_EQ(bytes, sizeof(foobar), "send packet")) + goto close_client; + + /* Verify we received all expected bytes */ + bytes = read(sfd, buf, sizeof(buf)); + if (!ASSERT_EQ(bytes, sizeof(buf), "receive packet")) + goto close_client; + ASSERT_STREQ(buf, foobar, "check udp packet"); + +close_client: + close(cfd); +close_server: + close(sfd); +close_netns: + close_netns(nstoken); + +out: + cleanup(); +} diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh deleted file mode 100755 index 9c74b88730ffd..0000000000000 --- a/tools/testing/selftests/bpf/test_lwt_seg6local.sh +++ /dev/null @@ -1,151 +0,0 @@ -#!/bin/bash -# Connects 6 network namespaces through veths. -# Each NS may have different IPv6 global scope addresses : -# NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6 -# fb00::1 fd00::1 fd00::2 fd00::3 fb00::6 -# fc42::1 fd00::4 -# -# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a -# IPv6 header with a Segment Routing Header, with segments : -# fd00::1 -> fd00::2 -> fd00::3 -> fd00::4 -# -# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions : -# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1 -# - fd00::2 : remove the TLV, change the flags, add a tag -# - fd00::3 : apply an End.T action to fd00::4, through routing table 117 -# -# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet. -# Each End.BPF action will validate the operations applied on the SRH by the -# previous BPF program in the chain, otherwise the packet is dropped. -# -# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this -# datagram can be read on NS6 when binding to fb00::6. - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -BPF_FILE="test_lwt_seg6local.bpf.o" -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" -readonly NS4="ns4-$(mktemp -u XXXXXX)" -readonly NS5="ns5-$(mktemp -u XXXXXX)" -readonly NS6="ns6-$(mktemp -u XXXXXX)" - -msg="skip all tests:" -if [ $UID != 0 ]; then - echo $msg please run this as root >&2 - exit $ksft_skip -fi - -TMP_FILE="/tmp/selftest_lwt_seg6local.txt" - -cleanup() -{ - if [ "$?" = "0" ]; then - echo "selftests: test_lwt_seg6local [PASS]"; - else - echo "selftests: test_lwt_seg6local [FAILED]"; - fi - - set +e - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null - ip netns del ${NS4} 2> /dev/null - ip netns del ${NS5} 2> /dev/null - ip netns del ${NS6} 2> /dev/null - rm -f $TMP_FILE -} - -set -e - -ip netns add ${NS1} -ip netns add ${NS2} -ip netns add ${NS3} -ip netns add ${NS4} -ip netns add ${NS5} -ip netns add ${NS6} - -trap cleanup 0 2 3 6 9 - -ip link add veth1 type veth peer name veth2 -ip link add veth3 type veth peer name veth4 -ip link add veth5 type veth peer name veth6 -ip link add veth7 type veth peer name veth8 -ip link add veth9 type veth peer name veth10 - -ip link set veth1 netns ${NS1} -ip link set veth2 netns ${NS2} -ip link set veth3 netns ${NS2} -ip link set veth4 netns ${NS3} -ip link set veth5 netns ${NS3} -ip link set veth6 netns ${NS4} -ip link set veth7 netns ${NS4} -ip link set veth8 netns ${NS5} -ip link set veth9 netns ${NS5} -ip link set veth10 netns ${NS6} - -ip netns exec ${NS1} ip link set dev veth1 up -ip netns exec ${NS2} ip link set dev veth2 up -ip netns exec ${NS2} ip link set dev veth3 up -ip netns exec ${NS3} ip link set dev veth4 up -ip netns exec ${NS3} ip link set dev veth5 up -ip netns exec ${NS4} ip link set dev veth6 up -ip netns exec ${NS4} ip link set dev veth7 up -ip netns exec ${NS5} ip link set dev veth8 up -ip netns exec ${NS5} ip link set dev veth9 up -ip netns exec ${NS6} ip link set dev veth10 up -ip netns exec ${NS6} ip link set dev lo up - -# All link scope addresses and routes required between veths -ip netns exec ${NS1} ip -6 addr add fb00::12/16 dev veth1 scope link -ip netns exec ${NS2} ip -6 addr add fb00::21/16 dev veth2 scope link -ip netns exec ${NS2} ip -6 addr add fb00::34/16 dev veth3 scope link -ip netns exec ${NS3} ip -6 addr add fb00::43/16 dev veth4 scope link -ip netns exec ${NS3} ip -6 addr add fb00::56/16 dev veth5 scope link -ip netns exec ${NS4} ip -6 addr add fb00::65/16 dev veth6 scope link -ip netns exec ${NS4} ip -6 addr add fb00::78/16 dev veth7 scope link -ip netns exec ${NS5} ip -6 addr add fb00::87/16 dev veth8 scope link -ip netns exec ${NS5} ip -6 addr add fb00::910/16 dev veth9 scope link -ip netns exec ${NS5} ip -6 route add fb00::109 table 117 dev veth9 scope link -ip netns exec ${NS6} ip -6 addr add fb00::109/16 dev veth10 scope link - -ip netns exec ${NS1} ip -6 addr add fb00::1/16 dev lo -ip netns exec ${NS1} ip -6 route add fb00::6 dev veth1 via fb00::21 - -ip netns exec ${NS2} ip -6 route add fb00::6 encap bpf in obj ${BPF_FILE} sec encap_srh dev veth2 -ip netns exec ${NS2} ip -6 route add fd00::1 dev veth3 via fb00::43 scope link - -ip netns exec ${NS3} ip -6 route add fc42::1 dev veth5 via fb00::65 -ip netns exec ${NS3} ip -6 route add fd00::1 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec add_egr_x dev veth4 - -ip netns exec ${NS4} ip -6 route add fd00::2 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec pop_egr dev veth6 -ip netns exec ${NS4} ip -6 addr add fc42::1 dev lo -ip netns exec ${NS4} ip -6 route add fd00::3 dev veth7 via fb00::87 - -ip netns exec ${NS5} ip -6 route add fd00::4 table 117 dev veth9 via fb00::109 -ip netns exec ${NS5} ip -6 route add fd00::3 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec inspect_t dev veth8 - -ip netns exec ${NS6} ip -6 addr add fb00::6/16 dev lo -ip netns exec ${NS6} ip -6 addr add fd00::4/16 dev lo - -ip netns exec ${NS1} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS2} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS3} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS4} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS5} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - -ip netns exec ${NS6} sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null -ip netns exec ${NS6} sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null -ip netns exec ${NS6} sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null - -ip netns exec ${NS6} nc -l -6 -u -d 7330 > $TMP_FILE & -ip netns exec ${NS1} bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330" -sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment -kill -TERM $! - -if [[ $(< $TMP_FILE) != "foobar" ]]; then - exit 1 -fi - -exit 0 From 63f99cd6a53f64967582d1603b2efe5ef45af2fc Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 9 Mar 2025 19:04:24 -0400 Subject: [PATCH 66/80] bpf: add kfunc for populating cpumask bits Add a helper kfunc that sets the bitmap of a bpf_cpumask from BPF memory. Signed-off-by: Emil Tsalapatis (Meta) Acked-by: Hou Tao Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250309230427.26603-2-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a483..77900cbbbd758 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -420,6 +420,38 @@ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. + * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +480,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { From 3524b150f4ff3839190d62fa0c054ec0ed167f48 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 9 Mar 2025 19:04:25 -0400 Subject: [PATCH 67/80] selftests: bpf: add bpf_cpumask_populate selftests Add selftests for the bpf_cpumask_populate helper that sets a bpf_cpumask to a bit pattern provided by a BPF program. Signed-off-by: Emil Tsalapatis (Meta) Acked-by: Hou Tao Link: https://lore.kernel.org/r/20250309230427.26603-3-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/cpumask.c | 3 + .../selftests/bpf/progs/cpumask_common.h | 1 + .../selftests/bpf/progs/cpumask_failure.c | 38 ++++++ .../selftests/bpf/progs/cpumask_success.c | 119 ++++++++++++++++++ 4 files changed, 161 insertions(+) diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c index e58a04654238c..9b09beba988b4 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpumask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c @@ -25,6 +25,9 @@ static const char * const cpumask_success_testcases[] = { "test_global_mask_nested_deep_rcu", "test_global_mask_nested_deep_array_rcu", "test_cpumask_weight", + "test_populate_reject_small_mask", + "test_populate_reject_unaligned", + "test_populate", }; static void verify_success(const char *prog_name) diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index 4ece7873ba609..86085b79f5cac 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -61,6 +61,7 @@ u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym __weak; u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym __weak; u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) __ksym __weak; void bpf_rcu_read_lock(void) __ksym __weak; void bpf_rcu_read_unlock(void) __ksym __weak; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index b40b52548ffb0..8a2fd596c8a39 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -222,3 +222,41 @@ int BPF_PROG(test_invalid_nested_array, struct task_struct *task, u64 clone_flag return 0; } + +SEC("tp_btf/task_newtask") +__failure __msg("type=scalar expected=fp") +int BPF_PROG(test_populate_invalid_destination, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *invalid = (struct bpf_cpumask *)0x123456; + u64 bits; + int ret; + + ret = bpf_cpumask_populate((struct cpumask *)invalid, &bits, sizeof(bits)); + if (!ret) + err = 2; + + return 0; +} + +SEC("tp_btf/task_newtask") +__failure __msg("leads to invalid memory access") +int BPF_PROG(test_populate_invalid_source, struct task_struct *task, u64 clone_flags) +{ + void *garbage = (void *)0x123456; + struct bpf_cpumask *local; + int ret; + + local = create_cpumask(); + if (!local) { + err = 1; + return 0; + } + + ret = bpf_cpumask_populate((struct cpumask *)local, garbage, 8); + if (!ret) + err = 2; + + bpf_cpumask_release(local); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index 80ee469b0b602..91a5357769a8b 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -770,3 +770,122 @@ int BPF_PROG(test_refcount_null_tracking, struct task_struct *task, u64 clone_fl bpf_cpumask_release(mask2); return 0; } + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate_reject_small_mask, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *local; + u8 toofewbits; + int ret; + + if (!is_test_task()) + return 0; + + local = create_cpumask(); + if (!local) + return 0; + + /* The kfunc should prevent this operation */ + ret = bpf_cpumask_populate((struct cpumask *)local, &toofewbits, sizeof(toofewbits)); + if (ret != -EACCES) + err = 2; + + bpf_cpumask_release(local); + + return 0; +} + +/* Mask is guaranteed to be large enough for bpf_cpumask_t. */ +#define CPUMASK_TEST_MASKLEN (sizeof(cpumask_t)) + +/* Add an extra word for the test_populate_reject_unaligned test. */ +u64 bits[CPUMASK_TEST_MASKLEN / 8 + 1]; +extern bool CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __kconfig __weak; + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate_reject_unaligned, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *mask; + char *src; + int ret; + + if (!is_test_task()) + return 0; + + /* Skip if unaligned accesses are fine for this arch. */ + if (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + return 0; + + mask = bpf_cpumask_create(); + if (!mask) { + err = 1; + return 0; + } + + /* Misalign the source array by a byte. */ + src = &((char *)bits)[1]; + + ret = bpf_cpumask_populate((struct cpumask *)mask, src, CPUMASK_TEST_MASKLEN); + if (ret != -EINVAL) + err = 2; + + bpf_cpumask_release(mask); + + return 0; +} + + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *mask; + bool bit; + int ret; + int i; + + if (!is_test_task()) + return 0; + + /* Set only odd bits. */ + __builtin_memset(bits, 0xaa, CPUMASK_TEST_MASKLEN); + + mask = bpf_cpumask_create(); + if (!mask) { + err = 1; + return 0; + } + + /* Pass the entire bits array, the kfunc will only copy the valid bits. */ + ret = bpf_cpumask_populate((struct cpumask *)mask, bits, CPUMASK_TEST_MASKLEN); + if (ret) { + err = 2; + goto out; + } + + /* + * Test is there to appease the verifier. We cannot directly + * access NR_CPUS, the upper bound for nr_cpus, so we infer + * it from the size of cpumask_t. + */ + if (nr_cpus < 0 || nr_cpus >= CPUMASK_TEST_MASKLEN * 8) { + err = 3; + goto out; + } + + bpf_for(i, 0, nr_cpus) { + /* Odd-numbered bits should be set, even ones unset. */ + bit = bpf_cpumask_test_cpu(i, (const struct cpumask *)mask); + if (bit == (i % 2 != 0)) + continue; + + err = 4; + break; + } + +out: + bpf_cpumask_release(mask); + + return 0; +} + +#undef CPUMASK_TEST_MASKLEN From d70870e809a9cb37c4f8ac32310ea35d827bf5c4 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 9 Mar 2025 19:04:26 -0400 Subject: [PATCH 68/80] bpf: fix missing kdoc string fields in cpumask.c Some bpf_cpumask-related kfuncs have kdoc strings that are missing return values. Add a the missing descriptions for the return values. Reported-by: Alexei Starovoitov Signed-off-by: Emil Tsalapatis (Meta) Acked-by: Hou Tao Link: https://lore.kernel.org/r/20250309230427.26603-4-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/cpumask.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index 77900cbbbd758..9876c5fe6c2a2 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,6 +431,9 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. + * + * Return: + * * The number of bits set in the mask. */ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { From 93ececb29a1923b1af6cea533481e7356ccf1e97 Mon Sep 17 00:00:00 2001 From: Emil Tsalapatis Date: Sun, 9 Mar 2025 19:04:27 -0400 Subject: [PATCH 69/80] selftests: bpf: fix duplicate selftests in cpumask_success. The BPF cpumask selftests are currently run twice in test_progs/cpumask.c, once by traversing cpumask_success_testcases, and once by invoking RUN_TESTS(cpumask_success). Remove the invocation of RUN_TESTS to properly run the selftests only once. Now that the tests are run only through cpumask_success_testscases, add to it the missing test_refcount_null_tracking testcase. Also remove the __success annotation from it, since it is now loaded and invoked by the runner. Signed-off-by: Emil Tsalapatis (Meta) Acked-by: Hou Tao Link: https://lore.kernel.org/r/20250309230427.26603-5-emil@etsalapatis.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/cpumask.c | 2 +- tools/testing/selftests/bpf/progs/cpumask_success.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c index 9b09beba988b4..6c45330a5ca31 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpumask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c @@ -25,6 +25,7 @@ static const char * const cpumask_success_testcases[] = { "test_global_mask_nested_deep_rcu", "test_global_mask_nested_deep_array_rcu", "test_cpumask_weight", + "test_refcount_null_tracking", "test_populate_reject_small_mask", "test_populate_reject_unaligned", "test_populate", @@ -81,6 +82,5 @@ void test_cpumask(void) verify_success(cpumask_success_testcases[i]); } - RUN_TESTS(cpumask_success); RUN_TESTS(cpumask_failure); } diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index 91a5357769a8b..0e04c31b91c0c 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -749,7 +749,6 @@ int BPF_PROG(test_cpumask_weight, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__success int BPF_PROG(test_refcount_null_tracking, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *mask1, *mask2; From 74f36a97e5e54e3560ded04f2ce3325c1fd4f2c2 Mon Sep 17 00:00:00 2001 From: Anton Protopopov Date: Mon, 10 Mar 2025 14:51:12 +0000 Subject: [PATCH 70/80] selftests/bpf: Fix selection of static vs. dynamic LLVM The Makefile uses the exit code of the `llvm-config --link-static --libs` command to choose between statically-linked and dynamically-linked LLVMs. The stdout and stderr of that command are redirected to /dev/null. To redirect the output the "&>" construction is used, which might not be supported by /bin/sh, which is executed by make for $(shell ...) commands. On such systems the test will fail even if static LLVM is actually supported. Replace "&>" by ">/dev/null 2>&1" to fix this. Fixes: 2a9d30fac818 ("selftests/bpf: Support dynamically linking LLVM if static is not available") Signed-off-by: Anton Protopopov Signed-off-by: Andrii Nakryiko Acked-by: Daniel Xu Link: https://lore.kernel.org/bpf/20250310145112.1261241-1-aspsk@isovalent.com --- tools/testing/selftests/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 7393050648390..ca41d47d4ba6a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -180,7 +180,7 @@ ifeq ($(feature-llvm),1) # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) # Prefer linking statically if it's available, otherwise fallback to shared - ifeq ($(shell $(LLVM_CONFIG) --link-static --libs &> /dev/null && echo static),static) + ifeq ($(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo static),static) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += -lstdc++ From 26350a2cf0f9754bc64b47398a291cf98ad1aa47 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 10 Mar 2025 13:40:17 +0100 Subject: [PATCH 71/80] mm: Fix the flipped condition in gfpflags_allow_spinning() The function gfpflags_allow_spinning() has a bug that makes it return the opposite result than intended. This could contribute to deadlocks as usage profilerates, for now it was noticed as a performance regression due to try_charge_memcg() not refilling memcg stock when it could. Fix the flipped condition. Fixes: 97769a53f117 ("mm, bpf: Introduce try_alloc_pages() for opportunistic page allocation") Reported-by: kernel test robot Acked-by: Shakeel Butt Signed-off-by: Vlastimil Babka Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250310124017.187-1-alexei.starovoitov@gmail.com Closes: https://lore.kernel.org/oe-lkp/202503101254.cfd454df-lkp@intel.com Signed-off-by: Alexei Starovoitov --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ceb226c2e25c5..c9fa6309c903b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -55,7 +55,7 @@ static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) * regular page allocator doesn't fully support this * allocation mode. */ - return !(gfp_flags & __GFP_RECLAIM); + return !!(gfp_flags & __GFP_RECLAIM); } #ifdef CONFIG_HIGHMEM From bf5af299b51d2ae8fef6ec87ac7de0f9136f1512 Mon Sep 17 00:00:00 2001 From: Chen Ni Date: Mon, 10 Mar 2025 11:20:45 +0800 Subject: [PATCH 72/80] selftests/bpf: Convert comma to semicolon Replace comma between expressions with semicolons. Using a ',' in place of a ';' can have unintended side effects. Although that is not the case here, it is seems best to use ';' unless ',' is intended. Found by inspection. No functional change intended. Compile tested only. Signed-off-by: Chen Ni Signed-off-by: Andrii Nakryiko Acked-by: Anton Protopopov Link: https://lore.kernel.org/bpf/20250310032045.651068-1-nichen@iscas.ac.cn Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/fd_array.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index a1d52e73fb162..9add890c2d372 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -83,8 +83,8 @@ static inline int bpf_prog_get_map_ids(int prog_fd, __u32 *nr_map_ids, __u32 *ma int err; memset(&info, 0, len); - info.nr_map_ids = *nr_map_ids, - info.map_ids = ptr_to_u64(map_ids), + info.nr_map_ids = *nr_map_ids; + info.map_ids = ptr_to_u64(map_ids); err = bpf_prog_get_info_by_fd(prog_fd, &info, &len); if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) From a8cd03532fc96d6693d30c228666090d66bb5638 Mon Sep 17 00:00:00 2001 From: Blaise Boscaccy Date: Mon, 10 Mar 2025 15:17:11 -0700 Subject: [PATCH 73/80] security: Propagate caller information in bpf hooks Certain bpf syscall subcommands are available for usage from both userspace and the kernel. LSM modules or eBPF gatekeeper programs may need to take a different course of action depending on whether or not a BPF syscall originated from the kernel or userspace. Additionally, some of the bpf_attr struct fields contain pointers to arbitrary memory. Currently the functionality to determine whether or not a pointer refers to kernel memory or userspace memory is exposed to the bpf verifier, but that information is missing from various LSM hooks. Here we augment the LSM hooks to provide this data, by simply passing a boolean flag indicating whether or not the call originated in the kernel, in any hook that contains a bpf_attr struct that corresponds to a subcommand that may be called from the kernel. Signed-off-by: Blaise Boscaccy Acked-by: Song Liu Acked-by: Paul Moore Link: https://lore.kernel.org/r/20250310221737.821889-2-bboscaccy@linux.microsoft.com Signed-off-by: Alexei Starovoitov --- include/linux/lsm_hook_defs.h | 6 +++--- include/linux/security.h | 12 ++++++------ kernel/bpf/syscall.c | 10 +++++----- security/security.c | 15 +++++++++------ security/selinux/hooks.c | 6 +++--- tools/testing/selftests/bpf/progs/rcu_read_lock.c | 3 ++- .../selftests/bpf/progs/test_cgroup1_hierarchy.c | 4 ++-- .../selftests/bpf/progs/test_kfunc_dynptr_param.c | 6 +++--- .../testing/selftests/bpf/progs/test_lookup_key.c | 2 +- .../selftests/bpf/progs/test_ptr_untrusted.c | 2 +- .../selftests/bpf/progs/test_task_under_cgroup.c | 2 +- .../selftests/bpf/progs/test_verify_pkcs7_sig.c | 2 +- 12 files changed, 37 insertions(+), 33 deletions(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e2f1ce37c41ef..f5aafd3ba5d4d 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -426,14 +426,14 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL -LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) +LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, const struct path *path) diff --git a/include/linux/security.h b/include/linux/security.h index 980b6c207cade..b2010034f82bc 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2249,14 +2249,14 @@ struct bpf_map; struct bpf_prog; struct bpf_token; #ifdef CONFIG_SECURITY -extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); +extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token); + struct bpf_token *token, bool kernel); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token); + struct bpf_token *token, bool kernel); extern void security_bpf_prog_free(struct bpf_prog *prog); extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path); @@ -2265,7 +2265,7 @@ extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cm extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, - unsigned int size) + unsigned int size, bool kernel) { return 0; } @@ -2281,7 +2281,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog) } static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { return 0; } @@ -2290,7 +2290,7 @@ static inline void security_bpf_map_free(struct bpf_map *map) { } static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { return 0; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57a4387062159..45dc98d526cc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1332,7 +1332,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1522,7 +1522,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -2959,7 +2959,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -5784,13 +5784,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/security/security.c b/security/security.c index 143561ebc3e89..c369481232585 100644 --- a/security/security.c +++ b/security/security.c @@ -5627,6 +5627,7 @@ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, * @cmd: command * @attr: bpf attribute * @size: size + * @kernel: whether or not call originated from kernel * * Do a initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement their own rules to @@ -5634,9 +5635,9 @@ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, * * Return: Returns 0 if permission is granted. */ -int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) +int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - return call_int_hook(bpf, cmd, attr, size); + return call_int_hook(bpf, cmd, attr, size, kernel); } /** @@ -5673,6 +5674,7 @@ int security_bpf_prog(struct bpf_prog *prog) * @map: BPF map object * @attr: BPF syscall attributes used to create BPF map * @token: BPF token used to grant user access + * @kernel: whether or not call originated from kernel * * Do a check when the kernel creates a new BPF map. This is also the * point where LSM blob is allocated for LSMs that need them. @@ -5680,9 +5682,9 @@ int security_bpf_prog(struct bpf_prog *prog) * Return: Returns 0 on success, error on failure. */ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token); + return call_int_hook(bpf_map_create, map, attr, token, kernel); } /** @@ -5690,6 +5692,7 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, * @prog: BPF program object * @attr: BPF syscall attributes used to create BPF program * @token: BPF token used to grant user access to BPF subsystem + * @kernel: whether or not call originated from kernel * * Perform an access control check when the kernel loads a BPF program and * allocates associated BPF program object. This hook is also responsible for @@ -5698,9 +5701,9 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token); + return call_int_hook(bpf_prog_load, prog, attr, token, kernel); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7b867dfec88ba..71199d86fc97f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6866,7 +6866,7 @@ static int selinux_ib_alloc_security(void *ib_sec) #ifdef CONFIG_BPF_SYSCALL static int selinux_bpf(int cmd, union bpf_attr *attr, - unsigned int size) + unsigned int size, bool kernel) { u32 sid = current_sid(); int ret; @@ -6953,7 +6953,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) } static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; @@ -6976,7 +6976,7 @@ static void selinux_bpf_map_free(struct bpf_map *map) } static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index 5cf1ae637ec7f..43637ee2cdcd2 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -242,7 +242,8 @@ int inproper_sleepable_helper(void *ctx) } SEC("?lsm.s/bpf") -int BPF_PROG(inproper_sleepable_kfunc, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(inproper_sleepable_kfunc, int cmd, union bpf_attr *attr, unsigned int size, + bool kernel) { struct bpf_key *bkey; diff --git a/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c b/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c index 44628865fe1d4..4fee0fdc76076 100644 --- a/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c +++ b/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c @@ -51,13 +51,13 @@ static int bpf_link_create_verify(int cmd) } SEC("lsm/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return bpf_link_create_verify(cmd); } SEC("lsm.s/bpf") -int BPF_PROG(lsm_s_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_s_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return bpf_link_create_verify(cmd); } diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index cd4d752bd089c..061befb004c24 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -36,7 +36,7 @@ char _license[] SEC("license") = "GPL"; SEC("?lsm.s/bpf") __failure __msg("cannot pass in dynptr at an offset=-8") -int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { unsigned long val; @@ -46,7 +46,7 @@ int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") -int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { unsigned long val = 0; @@ -55,7 +55,7 @@ int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size) } SEC("lsm.s/bpf") -int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_key *trusted_keyring; struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/test_lookup_key.c b/tools/testing/selftests/bpf/progs/test_lookup_key.c index c73776990ae30..cdbbb12f1491a 100644 --- a/tools/testing/selftests/bpf/progs/test_lookup_key.c +++ b/tools/testing/selftests/bpf/progs/test_lookup_key.c @@ -23,7 +23,7 @@ extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; extern void bpf_key_put(struct bpf_key *key) __ksym; SEC("lsm.s/bpf") -int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_key *bkey; __u32 pid; diff --git a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c index 2fdc44e766248..89b0cd5a3e06e 100644 --- a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c +++ b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c @@ -7,7 +7,7 @@ char tp_name[128]; SEC("lsm.s/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { switch (cmd) { case BPF_RAW_TRACEPOINT_OPEN: diff --git a/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c index 7e750309ce274..0b74b8bd22e87 100644 --- a/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c +++ b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c @@ -49,7 +49,7 @@ int BPF_PROG(tp_btf_run, struct task_struct *task, u64 clone_flags) } SEC("lsm.s/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct cgroup *cgrp = NULL; struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c index 12034a73ee2d2..e96d09e111155 100644 --- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -37,7 +37,7 @@ struct { char _license[] SEC("license") = "GPL"; SEC("lsm.s/bpf") -int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_dynptr data_ptr, sig_ptr; struct data *data_val; From f563314d729329b8b82ece1ff9a0d0abc5403d62 Mon Sep 17 00:00:00 2001 From: Blaise Boscaccy Date: Mon, 10 Mar 2025 15:17:12 -0700 Subject: [PATCH 74/80] selftests/bpf: Add a kernel flag test for LSM bpf hook This test exercises the kernel flag added to security_bpf by effectively blocking light-skeletons from loading while allowing normal skeletons to function as-is. Since this should work with any arbitrary BPF program, an existing program from LSKELS_EXTRA was used as a test payload. Signed-off-by: Blaise Boscaccy Acked-by: Song Liu Link: https://lore.kernel.org/r/20250310221737.821889-3-bboscaccy@linux.microsoft.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/prog_tests/kernel_flag.c | 43 +++++++++++++++++++ .../selftests/bpf/progs/test_kernel_flag.c | 28 ++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/kernel_flag.c create mode 100644 tools/testing/selftests/bpf/progs/test_kernel_flag.c diff --git a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c new file mode 100644 index 0000000000000..a133354ac9bc2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Microsoft */ +#include +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test.lskel.h" +#include "test_kernel_flag.skel.h" + +void test_kernel_flag(void) +{ + struct test_kernel_flag *lsm_skel; + struct kfunc_call_test *skel = NULL; + struct kfunc_call_test_lskel *lskel = NULL; + int ret; + + lsm_skel = test_kernel_flag__open_and_load(); + if (!ASSERT_OK_PTR(lsm_skel, "lsm_skel")) + return; + + lsm_skel->bss->monitored_tid = gettid(); + + ret = test_kernel_flag__attach(lsm_skel); + if (!ASSERT_OK(ret, "test_kernel_flag__attach")) + goto close_prog; + + /* Test with skel. This should pass the gatekeeper */ + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto close_prog; + + /* Test with lskel. This should fail due to blocking kernel-based bpf() invocations */ + lskel = kfunc_call_test_lskel__open_and_load(); + if (!ASSERT_ERR_PTR(lskel, "lskel")) + goto close_prog; + +close_prog: + if (skel) + kfunc_call_test__destroy(skel); + if (lskel) + kfunc_call_test_lskel__destroy(lskel); + + lsm_skel->bss->monitored_tid = 0; + test_kernel_flag__destroy(lsm_skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_kernel_flag.c b/tools/testing/selftests/bpf/progs/test_kernel_flag.c new file mode 100644 index 0000000000000..b45fab3be3528 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_kernel_flag.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2025 Microsoft Corporation + * + * Author: Blaise Boscaccy + */ + +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u32 monitored_tid; + +SEC("lsm.s/bpf") +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) +{ + __u32 tid; + + tid = bpf_get_current_pid_tgid() & 0xFFFFFFFF; + if (!kernel || tid != monitored_tid) + return 0; + else + return -EINVAL; +} From a74a2a3e639f056a8ec0558efa982ae60e9cb43c Mon Sep 17 00:00:00 2001 From: Sewon Nam Date: Tue, 11 Mar 2025 12:12:37 +0900 Subject: [PATCH 75/80] bpf: bpftool: Setting error code in do_loader() We are missing setting error code in do_loader() when bpf_object__open_file() fails. This means the command's exit status code will be successful, even though the operation failed. So make sure to return the correct error code. To maintain consistency with other locations where bpf_object__open_file() is called, return -1. [0] Closes: https://github.com/libbpf/bpftool/issues/156 Reported-by: Dan Carpenter Signed-off-by: Sewon Nam Signed-off-by: Andrii Nakryiko Tested-by: Quentin Monnet Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/d3b5b4b4-19bb-4619-b4dd-86c958c4a367@stanley.mountain/t/#u Link: https://lore.kernel.org/bpf/20250311031238.14865-1-swnam0729@gmail.com --- tools/bpf/bpftool/prog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index e71be67f1d865..52ffb74ae4e89 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1928,6 +1928,7 @@ static int do_loader(int argc, char **argv) obj = bpf_object__open_file(file, &open_opts); if (!obj) { + err = -1; p_err("failed to open object file"); goto err_close_obj; } From be741c7b2cafb87fcb7485860fc94cbaa16e489a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Mar 2025 14:49:16 +0100 Subject: [PATCH 76/80] bpf: preload: Add MODULE_DESCRIPTION Modpost complains when extra warnings are enabled: WARNING: modpost: missing MODULE_DESCRIPTION() in kernel/bpf/preload/bpf_preload.o Add a description from the Kconfig help text. Signed-off-by: Arnd Bergmann Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250310134920.4123633-1-arnd@kernel.org ---- Not sure if that description actually fits what the module does. If not, please add a different description instead. --- kernel/bpf/preload/bpf_preload_kern.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895a..2fdf3c978db1b 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -90,3 +90,4 @@ static void __exit fini(void) late_initcall(load); module_exit(fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); From 46d38f489ef02175dcff1e03a849c226eb0729a6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 11 Mar 2025 08:42:44 -0700 Subject: [PATCH 77/80] selftests/bpf: Fix arena_spin_lock compilation on PowerPC Venkat reported a compilation error for BPF selftests on PowerPC [0]. The crux of the error is the following message: In file included from progs/arena_spin_lock.c:7: /root/bpf-next/tools/testing/selftests/bpf/bpf_arena_spin_lock.h:122:8: error: member reference base type '__attribute__((address_space(1))) u32' (aka '__attribute__((address_space(1))) unsigned int') is not a structure or union 122 | old = atomic_read(&lock->val); This is because PowerPC overrides the qspinlock type changing the lock->val member's type from atomic_t to u32. To remedy this, import the asm-generic version in the arena spin lock header, name it __qspinlock (since it's aliased to arena_spinlock_t, the actual name hardly matters), and adjust the selftest to not depend on the type in vmlinux.h. [0]: https://lore.kernel.org/bpf/7bc80a3b-d708-4735-aa3b-6a8c21720f9d@linux.ibm.com Fixes: 0201027a026c ("selftests/bpf: Introduce arena spin lock") Reported-by: Venkat Rao Bagalkote Signed-off-by: Kumar Kartikeya Dwivedi Signed-off-by: Andrii Nakryiko Tested-by: Venkat Rao Bagalkote Link: https://lore.kernel.org/bpf/20250311154244.3775505-1-memxor@gmail.com --- .../selftests/bpf/bpf_arena_spin_lock.h | 23 ++++++++++++++++++- .../bpf/prog_tests/arena_spin_lock.c | 4 ++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h index 3aca389ce424a..fb8dc07689998 100644 --- a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h @@ -22,7 +22,28 @@ extern unsigned long CONFIG_NR_CPUS __kconfig; -#define arena_spinlock_t struct qspinlock +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors. Import a local definition below so + * that we don't depend on the vmlinux.h version. + */ + +struct __qspinlock { + union { + atomic_t val; + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; + }; +}; + +#define arena_spinlock_t struct __qspinlock /* FIXME: Using typedef causes CO-RE relocation error */ /* typedef struct qspinlock arena_spinlock_t; */ diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c index bc3616ba891c2..7565fc7690c27 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -4,8 +4,8 @@ #include #include -struct qspinlock { int val; }; -typedef struct qspinlock arena_spinlock_t; +struct __qspinlock { int val; }; +typedef struct __qspinlock arena_spinlock_t; struct arena_qnode { unsigned long next; From 00d1fd27594e74fabd5ee7726a9d72712e81f77b Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Mon, 10 Mar 2025 20:10:37 +0000 Subject: [PATCH 78/80] bpf: udp: Avoid socket skips during iteration Replace the offset-based approach for tracking progress through a bucket in the UDP table with one based on unique, monotonically increasing index numbers associated with each socket in a bucket. Signed-off-by: Jordan Rife --- include/net/sock.h | 2 ++ include/net/udp.h | 1 + net/ipv4/udp.c | 38 +++++++++++++++++++++++++------------- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8b..b11f43e8e7ec7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -228,6 +228,7 @@ struct sock_common { u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; + __s64 skc_idx; /* public: */ }; @@ -378,6 +379,7 @@ struct sock { #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash +#define sk_idx __sk_common.skc_idx __cacheline_group_begin(sock_write_rx); diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100dc..9398561addc68 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -102,6 +102,7 @@ struct udp_table { #endif unsigned int mask; unsigned int log; + atomic64_t ver; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a9bb9ce5438ea..d7e9b33469838 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -229,6 +229,11 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } +static inline __s64 udp_table_next_idx(struct udp_table *udptable, bool pos) +{ + return (pos ? 1 : -1) * atomic64_inc_return(&udptable->ver); +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * @@ -244,6 +249,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE; + bool add_tail; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); @@ -335,14 +341,16 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) + add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6; + if (add_tail) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; + sk->sk_idx = udp_table_next_idx(udptable, add_tail); spin_unlock(&hslot2->lock); } @@ -2250,6 +2258,8 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; + sk->sk_idx = udp_table_next_idx(udptable, + false); spin_unlock(&nhslot2->lock); } @@ -3390,9 +3400,9 @@ struct bpf_udp_iter_state { unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; - int offset; struct sock **batch; bool st_bucket_done; + __s64 prev_idx; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, @@ -3402,14 +3412,13 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; struct net *net = seq_file_net(seq); - int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; bool resized = false; + int resume_bucket; struct sock *sk; resume_bucket = state->bucket; - resume_offset = iter->offset; /* The current batch is done, so advance the bucket. */ if (iter->st_bucket_done) @@ -3436,18 +3445,19 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) if (hlist_empty(&hslot2->head)) continue; - iter->offset = 0; spin_lock_bh(&hslot2->lock); + /* Reset prev_idx if this is a new bucket. */ + if (!resume_bucket || state->bucket != resume_bucket) + iter->prev_idx = 0; udp_portaddr_for_each_entry(sk, &hslot2->head) { if (seq_sk_match(seq, sk)) { - /* Resume from the last iterated socket at the - * offset in the bucket before iterator was stopped. + /* Resume from the first socket that we didn't + * see last time around. */ if (state->bucket == resume_bucket && - iter->offset < resume_offset) { - ++iter->offset; + iter->prev_idx && + sk->sk_idx <= iter->prev_idx) continue; - } if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; @@ -3492,8 +3502,9 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) { - sock_put(iter->batch[iter->cur_sk++]); - ++iter->offset; + sk = iter->batch[iter->cur_sk++]; + iter->prev_idx = sk->sk_idx; + sock_put(sk); } /* After updating iter->cur_sk, check if there are more sockets @@ -3740,6 +3751,7 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); + atomic64_set(&udptable->ver, 0); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); From dcde5d53850f3bd90238a65083cfd027ae2b0b39 Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Thu, 13 Mar 2025 01:46:53 +0000 Subject: [PATCH 79/80] bpf: tcp: Avoid socket skips during iteration Replace the offset-based approach for tracking progress through a bucket in the TCP table with one based on unique, monotonically increasing index numbers associated with each socket in a bucket. Signed-off-by: Jordan Rife --- include/net/inet_hashtables.h | 2 ++ include/net/tcp.h | 3 ++- net/ipv4/inet_hashtables.c | 18 +++++++++++++++--- net/ipv4/tcp.c | 1 + net/ipv4/tcp_ipv4.c | 29 ++++++++++++++++------------- 5 files changed, 36 insertions(+), 17 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5eea47f135a42..c95d3b1da1990 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -172,6 +172,8 @@ struct inet_hashinfo { struct inet_listen_hashbucket *lhash2; bool pernet; + + atomic64_t ver; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d08473a6dc00..499acd6da35f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2202,7 +2202,8 @@ struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; - int bucket, offset, sbucket, num; + int bucket, sbucket, num; + __s64 prev_idx; loff_t last_pos; }; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e182..bc9f58172790f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -534,6 +534,12 @@ struct sock *__inet_lookup_established(const struct net *net, } EXPORT_SYMBOL_GPL(__inet_lookup_established); +static inline __s64 inet_hashinfo_next_idx(struct inet_hashinfo *hinfo, + bool pos) +{ + return (pos ? 1 : -1) * atomic64_inc_return(&hinfo->ver); +} + /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, @@ -581,6 +587,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); + sk->sk_idx = inet_hashinfo_next_idx(hinfo, false); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); @@ -678,8 +685,10 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) ret = false; } - if (ret) + if (ret) { __sk_nulls_add_node_rcu(sk, list); + sk->sk_idx = inet_hashinfo_next_idx(hashinfo, false); + } spin_unlock(lock); @@ -729,6 +738,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; + bool add_tail; int err = 0; if (sk->sk_state != TCP_LISTEN) { @@ -747,11 +757,13 @@ int __inet_hash(struct sock *sk, struct sock *osk) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) + add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6; + if (add_tail) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); + sk->sk_idx = inet_hashinfo_next_idx(hashinfo, add_tail); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 285678d8ce077..63693af0c05c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5147,6 +5147,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; + atomic64_set(&tcp_hashinfo.ver, 0); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2632844d2c356..d0ddb307e2a13 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2602,7 +2602,7 @@ static void *listening_get_first(struct seq_file *seq) struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; - st->offset = 0; + st->prev_idx = 0; for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; @@ -2637,7 +2637,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct sock *sk = cur; ++st->num; - ++st->offset; + st->prev_idx = sk->sk_idx; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { @@ -2658,7 +2658,6 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) void *rc; st->bucket = 0; - st->offset = 0; rc = listening_get_first(seq); while (rc && *pos) { @@ -2683,7 +2682,7 @@ static void *established_get_first(struct seq_file *seq) struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; - st->offset = 0; + st->prev_idx = 0; for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; @@ -2714,7 +2713,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct sock *sk = cur; ++st->num; - ++st->offset; + st->prev_idx = sk->sk_idx; sk = sk_nulls_next(sk); @@ -2763,8 +2762,8 @@ static void *tcp_seek_last_pos(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; + __s64 prev_idx = st->prev_idx; int bucket = st->bucket; - int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2773,18 +2772,21 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > hinfo->lhash2_mask) break; rc = listening_get_first(seq); - while (offset-- && rc && bucket == st->bucket) + while (rc && bucket == st->bucket && prev_idx && + ((struct sock *)rc)->sk_idx <= prev_idx) rc = listening_get_next(seq, rc); if (rc) break; st->bucket = 0; + prev_idx = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: if (st->bucket > hinfo->ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc && bucket == st->bucket) + while (rc && bucket == st->bucket && prev_idx && + ((struct sock *)rc)->sk_idx <= prev_idx) rc = established_get_next(seq, rc); } @@ -2807,7 +2809,7 @@ void *tcp_seq_start(struct seq_file *seq, loff_t *pos) st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; st->bucket = 0; - st->offset = 0; + st->prev_idx = 0; rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; out: @@ -2832,7 +2834,7 @@ void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; - st->offset = 0; + st->prev_idx = 0; rc = established_get_first(seq); } break; @@ -3124,7 +3126,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) * it has to advance to the next bucket. */ if (iter->st_bucket_done) { - st->offset = 0; + st->prev_idx = 0; st->bucket++; if (st->state == TCP_SEQ_STATE_LISTENING && st->bucket > hinfo->lhash2_mask) { @@ -3192,8 +3194,9 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * the future start() will resume at st->offset in * st->bucket. See tcp_seek_last_pos(). */ - st->offset++; - sock_gen_put(iter->batch[iter->cur_sk++]); + sk = iter->batch[iter->cur_sk++]; + st->prev_idx = sk->sk_idx; + sock_gen_put(sk); } if (iter->cur_sk < iter->end_sk) From 6a5bdf6ae12e22050dc1ca5074b2c3d55ffc755c Mon Sep 17 00:00:00 2001 From: Jordan Rife Date: Sun, 9 Mar 2025 06:23:53 +0000 Subject: [PATCH 80/80] selftests/bpf: Add tests for socket skips and repeats Add do_skip_test() and do_repeat_test() subtests to the sock_iter_batch prog_test to check for socket skips and repeats, respectively. Extend the sock_iter_batch BPF program to output the socket cookie as well, so that we can check for uniqueness. The skip test works by partially iterating through a bucket, then closing one of the sockets that have already been seen to remove it from the bucket. Before, this would have resulted in skipping the fourth socket. Now, the fourth socket is seen. The repeat test works by partially iterating through a bucket, then adding four more sockets to the head of the bucket. Before, this would have resulted in repeating several of the sockets from the first batch, but now we see sockets exactly once. Signed-off-by: Jordan Rife --- .../bpf/prog_tests/sock_iter_batch.c | 293 +++++++++++++++++- .../selftests/bpf/progs/bpf_tracing_net.h | 1 + .../selftests/bpf/progs/sock_iter_batch.c | 24 +- 3 files changed, 300 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c index d56e18b255280..cf554e8fdf791 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c @@ -6,15 +6,275 @@ #include "sock_iter_batch.skel.h" #define TEST_NS "sock_iter_batch_netns" +#define nr_soreuse 4 -static const int nr_soreuse = 4; +static const __u16 reuse_port = 10001; + +struct iter_out { + int idx; + __u64 cookie; +} __packed; + +struct sock_count { + __u64 cookie; + int count; +}; + +static int insert(__u64 cookie, struct sock_count counts[], int counts_len) +{ + int insert = -1; + int i = 0; + + for (; i < counts_len; i++) { + if (!counts[i].cookie) { + insert = i; + } else if (counts[i].cookie == cookie) { + insert = i; + break; + } + } + if (insert < 0) + return insert; + + counts[insert].cookie = cookie; + counts[insert].count++; + + return counts[insert].count; +} + +static int read_n(int iter_fd, int n, struct sock_count counts[], + int counts_len) +{ + struct iter_out out; + int nread = 1; + int i = 0; + + for (; nread > 0 && (n < 0 || i < n); i++) { + nread = read(iter_fd, &out, sizeof(out)); + if (!nread || !ASSERT_GE(nread, 1, "nread")) + break; + ASSERT_GE(insert(out.cookie, counts, counts_len), 0, "insert"); + } + + ASSERT_TRUE(n < 0 || i == n, "n < 0 || i == n"); + + return i; +} + +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + static __u32 duration; /* for CHECK macro */ + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + "getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static bool was_seen(int fd, struct sock_count counts[], int counts_len) +{ + __u64 cookie = socket_cookie(fd); + int i = 0; + + for (; cookie && i < counts_len; i++) + if (cookie == counts[i].cookie) + return true; + + return false; +} + +static int get_seen_socket(int *fds, struct sock_count counts[], int n) +{ + int i = 0; + + for (; i < n; i++) + if (was_seen(fds[i], counts, n)) + return i; + return -1; +} + +static int get_seen_count(int fd, struct sock_count counts[], int n) +{ + __u64 cookie = socket_cookie(fd); + int count = 0; + int i = 0; + + for (; cookie && !count && i < n; i++) + if (cookie == counts[i].cookie) + count = counts[i].count; + + return count; +} + +static void check_n_were_seen_once(int *fds, int fds_len, int n, + struct sock_count counts[], int counts_len) +{ + int seen_once = 0; + int seen_cnt; + int i = 0; + + for (; i < fds_len; i++) { + /* Skip any sockets that were closed or that weren't seen + * exactly once. + */ + if (fds[i] < 0) + continue; + seen_cnt = get_seen_count(fds[i], counts, counts_len); + if (seen_cnt && ASSERT_EQ(seen_cnt, 1, "seen_cnt")) + seen_once++; + } + + ASSERT_EQ(seen_once, n, "seen_once"); +} + +static void do_skip_test(int sock_type) +{ + struct sock_count counts[nr_soreuse] = {}; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int err, iter_fd = -1; + int close_idx; + int *fds; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare a bucket of sockets in the kernel hashtable */ + int local_port; + + fds = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", 0, 0, + nr_soreuse); + if (!ASSERT_OK_PTR(fds, "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[0] = ntohs(local_port); + skel->rodata->sf = AF_INET; + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Iterate through the first three sockets. */ + read_n(iter_fd, nr_soreuse - 1, counts, nr_soreuse); + + /* Make sure we saw three sockets from fds exactly once. */ + check_n_were_seen_once(fds, nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); + + /* Close a socket we've already seen to remove it from the bucket. */ + close_idx = get_seen_socket(fds, counts, nr_soreuse); + if (!ASSERT_GE(close_idx, 0, "close_idx")) + goto done; + close(fds[close_idx]); + fds[close_idx] = -1; + + /* Iterate through the rest of the sockets. */ + read_n(iter_fd, -1, counts, nr_soreuse); + + /* Make sure the last socket wasn't skipped and that there were no + * repeats. + */ + check_n_were_seen_once(fds, nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); +done: + free_fds(fds, nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} + +static void do_repeat_test(int sock_type) +{ + struct sock_count counts[nr_soreuse] = {}; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int err, i, iter_fd = -1; + int *fds[2] = {}; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare a bucket of sockets in the kernel hashtable */ + int local_port; + + fds[0] = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", + reuse_port, 0, nr_soreuse); + if (!ASSERT_OK_PTR(fds[0], "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds[0]); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[0] = ntohs(local_port); + skel->rodata->sf = AF_INET; + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Iterate through the first three sockets */ + read_n(iter_fd, nr_soreuse - 1, counts, nr_soreuse); + + /* Make sure we saw three sockets from fds exactly once. */ + check_n_were_seen_once(fds[0], nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); + + /* Add nr_soreuse more sockets to the bucket. */ + fds[1] = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", + reuse_port, 0, nr_soreuse); + if (!ASSERT_OK_PTR(fds[1], "start_reuseport_server")) + goto done; + + /* Iterate through the rest of the sockets. */ + read_n(iter_fd, -1, counts, nr_soreuse); + + /* Make sure each socket from the first set was seen exactly once. */ + check_n_were_seen_once(fds[0], nr_soreuse, nr_soreuse, counts, + nr_soreuse); +done: + for (i = 0; i < ARRAY_SIZE(fds); i++) + free_fds(fds[i], nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} static void do_test(int sock_type, bool onebyone) { int err, i, nread, to_read, total_read, iter_fd = -1; - int first_idx, second_idx, indices[nr_soreuse]; + struct iter_out outputs[nr_soreuse]; struct bpf_link *link = NULL; struct sock_iter_batch *skel; + int first_idx, second_idx; int *fds[2] = {}; skel = sock_iter_batch__open(); @@ -34,6 +294,7 @@ static void do_test(int sock_type, bool onebyone) goto done; skel->rodata->ports[i] = ntohs(local_port); } + skel->rodata->sf = AF_INET6; err = sock_iter_batch__load(skel); if (!ASSERT_OK(err, "sock_iter_batch__load")) @@ -55,38 +316,38 @@ static void do_test(int sock_type, bool onebyone) * from a bucket and leave one socket out from * that bucket on purpose. */ - to_read = (nr_soreuse - 1) * sizeof(*indices); + to_read = (nr_soreuse - 1) * sizeof(*outputs); total_read = 0; first_idx = -1; do { - nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); - if (nread <= 0 || nread % sizeof(*indices)) + nread = read(iter_fd, outputs, onebyone ? sizeof(*outputs) : to_read); + if (nread <= 0 || nread % sizeof(*outputs)) break; total_read += nread; if (first_idx == -1) - first_idx = indices[0]; - for (i = 0; i < nread / sizeof(*indices); i++) - ASSERT_EQ(indices[i], first_idx, "first_idx"); + first_idx = outputs[0].idx; + for (i = 0; i < nread / sizeof(*outputs); i++) + ASSERT_EQ(outputs[i].idx, first_idx, "first_idx"); } while (total_read < to_read); - ASSERT_EQ(nread, onebyone ? sizeof(*indices) : to_read, "nread"); + ASSERT_EQ(nread, onebyone ? sizeof(*outputs) : to_read, "nread"); ASSERT_EQ(total_read, to_read, "total_read"); free_fds(fds[first_idx], nr_soreuse); fds[first_idx] = NULL; /* Read the "whole" second bucket */ - to_read = nr_soreuse * sizeof(*indices); + to_read = nr_soreuse * sizeof(*outputs); total_read = 0; second_idx = !first_idx; do { - nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); - if (nread <= 0 || nread % sizeof(*indices)) + nread = read(iter_fd, outputs, onebyone ? sizeof(*outputs) : to_read); + if (nread <= 0 || nread % sizeof(*outputs)) break; total_read += nread; - for (i = 0; i < nread / sizeof(*indices); i++) - ASSERT_EQ(indices[i], second_idx, "second_idx"); + for (i = 0; i < nread / sizeof(*outputs); i++) + ASSERT_EQ(outputs[i].idx, second_idx, "second_idx"); } while (total_read <= to_read); ASSERT_EQ(nread, 0, "nread"); /* Both so_reuseport ports should be in different buckets, so @@ -123,10 +384,14 @@ void test_sock_iter_batch(void) if (test__start_subtest("tcp")) { do_test(SOCK_STREAM, true); do_test(SOCK_STREAM, false); + do_skip_test(SOCK_STREAM); + do_repeat_test(SOCK_STREAM); } if (test__start_subtest("udp")) { do_test(SOCK_DGRAM, true); do_test(SOCK_DGRAM, false); + do_skip_test(SOCK_DGRAM); + do_repeat_test(SOCK_DGRAM); } close_netns(nstoken); diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 59843b430f76a..82928cc5d87b7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -123,6 +123,7 @@ #define sk_refcnt __sk_common.skc_refcnt #define sk_state __sk_common.skc_state #define sk_net __sk_common.skc_net +#define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_flags __sk_common.skc_flags diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c index 96531b0d9d55b..8f483337e103c 100644 --- a/tools/testing/selftests/bpf/progs/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -17,6 +17,12 @@ static bool ipv6_addr_loopback(const struct in6_addr *a) a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0; } +static bool ipv4_addr_loopback(__be32 a) +{ + return a == bpf_ntohl(0x7f000001); +} + +volatile const unsigned int sf; volatile const __u16 ports[2]; unsigned int bucket[2]; @@ -26,16 +32,20 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) struct sock *sk = (struct sock *)ctx->sk_common; struct inet_hashinfo *hinfo; unsigned int hash; + __u64 sock_cookie; struct net *net; int idx; if (!sk) return 0; + sock_cookie = bpf_get_socket_cookie(sk); sk = bpf_core_cast(sk, struct sock); - if (sk->sk_family != AF_INET6 || + if (sk->sk_family != sf || sk->sk_state != TCP_LISTEN || - !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + sk->sk_family == AF_INET6 ? + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : + !ipv4_addr_loopback(sk->sk_rcv_saddr)) return 0; if (sk->sk_num == ports[0]) @@ -52,6 +62,7 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) hinfo = net->ipv4.tcp_death_row.hashinfo; bucket[idx] = hash & hinfo->lhash2_mask; bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie)); return 0; } @@ -63,14 +74,18 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx) { struct sock *sk = (struct sock *)ctx->udp_sk; struct udp_table *udptable; + __u64 sock_cookie; int idx; if (!sk) return 0; + sock_cookie = bpf_get_socket_cookie(sk); sk = bpf_core_cast(sk, struct sock); - if (sk->sk_family != AF_INET6 || - !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + if (sk->sk_family != sf || + sk->sk_family == AF_INET6 ? + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : + !ipv4_addr_loopback(sk->sk_rcv_saddr)) return 0; if (sk->sk_num == ports[0]) @@ -84,6 +99,7 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx) udptable = sk->sk_net.net->ipv4.udp_table; bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask; bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie)); return 0; }