diff --git a/Documentation/bpf/bpf_iterators.rst b/Documentation/bpf/bpf_iterators.rst index 07433915aa414..7f514cb6b0525 100644 --- a/Documentation/bpf/bpf_iterators.rst +++ b/Documentation/bpf/bpf_iterators.rst @@ -86,7 +86,7 @@ following steps: The following are a few examples of selftest BPF iterator programs: * `bpf_iter_tcp4.c `_ -* `bpf_iter_task_vma.c `_ +* `bpf_iter_task_vmas.c `_ * `bpf_iter_task_file.c `_ Let us look at ``bpf_iter_task_file.c``, which runs in kernel space: diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index e390c432f546e..39577f1d079a9 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -188,8 +188,10 @@ enum aarch64_insn_ldst_type { AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX, AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX, AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX, + AARCH64_INSN_LDST_LOAD_ACQ, AARCH64_INSN_LDST_LOAD_EX, AARCH64_INSN_LDST_LOAD_ACQ_EX, + AARCH64_INSN_LDST_STORE_REL, AARCH64_INSN_LDST_STORE_EX, AARCH64_INSN_LDST_STORE_REL_EX, AARCH64_INSN_LDST_SIGNED_LOAD_IMM_OFFSET, @@ -351,8 +353,10 @@ __AARCH64_INSN_FUNCS(ldr_imm, 0x3FC00000, 0x39400000) __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) __AARCH64_INSN_FUNCS(exclusive, 0x3F800000, 0x08000000) -__AARCH64_INSN_FUNCS(load_ex, 0x3F400000, 0x08400000) -__AARCH64_INSN_FUNCS(store_ex, 0x3F400000, 0x08000000) +__AARCH64_INSN_FUNCS(load_acq, 0x3FDFFC00, 0x08DFFC00) +__AARCH64_INSN_FUNCS(store_rel, 0x3FDFFC00, 0x089FFC00) +__AARCH64_INSN_FUNCS(load_ex, 0x3FC00000, 0x08400000) +__AARCH64_INSN_FUNCS(store_ex, 0x3FC00000, 0x08000000) __AARCH64_INSN_FUNCS(mops, 0x3B200C00, 0x19000400) __AARCH64_INSN_FUNCS(stp, 0x7FC00000, 0x29000000) __AARCH64_INSN_FUNCS(ldp, 0x7FC00000, 0x29400000) @@ -602,6 +606,10 @@ u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, int offset, enum aarch64_insn_variant variant, enum aarch64_insn_ldst_type type); +u32 aarch64_insn_gen_load_acq_store_rel(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, enum aarch64_insn_register base, enum aarch64_insn_register state, diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index b008a9b46a7ff..9bef696e2230b 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -540,6 +540,35 @@ u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, offset >> shift); } +u32 aarch64_insn_gen_load_acq_store_rel(enum aarch64_insn_register reg, + enum aarch64_insn_register base, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type) +{ + u32 insn; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_ACQ: + insn = aarch64_insn_get_load_acq_value(); + break; + case AARCH64_INSN_LDST_STORE_REL: + insn = aarch64_insn_get_store_rel_value(); + break; + default: + pr_err("%s: unknown load-acquire/store-release encoding %d\n", + __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, + reg); + + return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); +} + u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg, enum aarch64_insn_register base, enum aarch64_insn_register state, diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index b22ab2f97a300..a3b0e693a125c 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -119,6 +119,26 @@ aarch64_insn_gen_load_store_ex(Rt, Rn, Rs, A64_SIZE(sf), \ AARCH64_INSN_LDST_STORE_REL_EX) +/* Load-acquire & store-release */ +#define A64_LDAR(Rt, Rn, size) \ + aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_LOAD_ACQ) +#define A64_STLR(Rt, Rn, size) \ + aarch64_insn_gen_load_acq_store_rel(Rt, Rn, AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_STORE_REL) + +/* Rt = [Rn] (load acquire) */ +#define A64_LDARB(Wt, Xn) A64_LDAR(Wt, Xn, 8) +#define A64_LDARH(Wt, Xn) A64_LDAR(Wt, Xn, 16) +#define A64_LDAR32(Wt, Xn) A64_LDAR(Wt, Xn, 32) +#define A64_LDAR64(Xt, Xn) A64_LDAR(Xt, Xn, 64) + +/* [Rn] = Rt (store release) */ +#define A64_STLRB(Wt, Xn) A64_STLR(Wt, Xn, 8) +#define A64_STLRH(Wt, Xn) A64_STLR(Wt, Xn, 16) +#define A64_STLR32(Wt, Xn) A64_STLR(Wt, Xn, 32) +#define A64_STLR64(Xt, Xn) A64_STLR(Xt, Xn, 64) + /* * LSE atomics * diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7409c8acbde35..70d7c89d3ac90 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -647,6 +647,81 @@ static int emit_bpf_tail_call(struct jit_ctx *ctx) return 0; } +static int emit_atomic_ld_st(const struct bpf_insn *insn, struct jit_ctx *ctx) +{ + const s32 imm = insn->imm; + const s16 off = insn->off; + const u8 code = insn->code; + const bool arena = BPF_MODE(code) == BPF_PROBE_ATOMIC; + const u8 arena_vm_base = bpf2a64[ARENA_VM_START]; + const u8 dst = bpf2a64[insn->dst_reg]; + const u8 src = bpf2a64[insn->src_reg]; + const u8 tmp = bpf2a64[TMP_REG_1]; + u8 reg; + + switch (imm) { + case BPF_LOAD_ACQ: + reg = src; + break; + case BPF_STORE_REL: + reg = dst; + break; + default: + pr_err_once("unknown atomic load/store op code %02x\n", imm); + return -EINVAL; + } + + if (off) { + emit_a64_add_i(1, tmp, reg, tmp, off, ctx); + reg = tmp; + } + if (arena) { + emit(A64_ADD(1, tmp, reg, arena_vm_base), ctx); + reg = tmp; + } + + switch (imm) { + case BPF_LOAD_ACQ: + switch (BPF_SIZE(code)) { + case BPF_B: + emit(A64_LDARB(dst, reg), ctx); + break; + case BPF_H: + emit(A64_LDARH(dst, reg), ctx); + break; + case BPF_W: + emit(A64_LDAR32(dst, reg), ctx); + break; + case BPF_DW: + emit(A64_LDAR64(dst, reg), ctx); + break; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(code)) { + case BPF_B: + emit(A64_STLRB(src, reg), ctx); + break; + case BPF_H: + emit(A64_STLRH(src, reg), ctx); + break; + case BPF_W: + emit(A64_STLR32(src, reg), ctx); + break; + case BPF_DW: + emit(A64_STLR64(src, reg), ctx); + break; + } + break; + default: + pr_err_once("unexpected atomic load/store op code %02x\n", + imm); + return -EINVAL; + } + + return 0; +} + #ifdef CONFIG_ARM64_LSE_ATOMICS static int emit_lse_atomic(const struct bpf_insn *insn, struct jit_ctx *ctx) { @@ -1641,11 +1716,17 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, return ret; break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: - if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + if (bpf_atomic_is_load_store(insn)) + ret = emit_atomic_ld_st(insn, ctx); + else if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) ret = emit_lse_atomic(insn, ctx); else ret = emit_ll_sc_atomic(insn, ctx); @@ -2669,7 +2750,8 @@ bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) switch (insn->code) { case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: - if (!cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) + if (!bpf_atomic_is_load_store(insn) && + !cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) return false; } return true; diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 9d440a0b729eb..0776dfde2dba9 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -2919,10 +2919,16 @@ bool bpf_jit_supports_arena(void) bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { - /* - * Currently the verifier uses this function only to check which - * atomic stores to arena are supported, and they all are. - */ + if (!in_arena) + return true; + switch (insn->code) { + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (bpf_atomic_is_load_store(insn)) + return false; + } return true; } diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile index 383c87300b0d3..dddbefc0f4398 100644 --- a/arch/x86/net/Makefile +++ b/arch/x86/net/Makefile @@ -6,5 +6,5 @@ ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o else - obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o + obj-$(CONFIG_BPF_JIT) += bpf_jit_comp.o bpf_timed_may_goto.o endif diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a43fc5af973d2..d3491cc0898bf 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1242,8 +1242,8 @@ static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm) emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm); } -static int emit_atomic(u8 **pprog, u8 atomic_op, - u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +static int emit_atomic_rmw(u8 **pprog, u32 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) { u8 *prog = *pprog; @@ -1283,8 +1283,9 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, return 0; } -static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, - u32 dst_reg, u32 src_reg, u32 index_reg, int off) +static int emit_atomic_rmw_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) { u8 *prog = *pprog; @@ -1297,7 +1298,7 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, EMIT1(add_3mod(0x48, dst_reg, src_reg, index_reg)); break; default: - pr_err("bpf_jit: 1 and 2 byte atomics are not supported\n"); + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); return -EFAULT; } @@ -1331,6 +1332,49 @@ static int emit_atomic_index(u8 **pprog, u8 atomic_op, u32 size, return 0; } +static int emit_atomic_ld_st(u8 **pprog, u32 atomic_op, u32 dst_reg, + u32 src_reg, s16 off, u8 bpf_size) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + off16) */ + emit_ldx(pprog, bpf_size, dst_reg, src_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + off16, src_reg) */ + emit_stx(pprog, bpf_size, dst_reg, src_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + +static int emit_atomic_ld_st_index(u8 **pprog, u32 atomic_op, u32 size, + u32 dst_reg, u32 src_reg, u32 index_reg, + int off) +{ + switch (atomic_op) { + case BPF_LOAD_ACQ: + /* dst_reg = smp_load_acquire(src_reg + idx_reg + off16) */ + emit_ldx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + case BPF_STORE_REL: + /* smp_store_release(dst_reg + idx_reg + off16, src_reg) */ + emit_stx_index(pprog, size, dst_reg, src_reg, index_reg, off); + break; + default: + pr_err("bpf_jit: unknown atomic load/store opcode %02x\n", + atomic_op); + return -EFAULT; + } + + return 0; +} + #define DONT_CLEAR 1 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs) @@ -2113,6 +2157,13 @@ st: if (is_imm8(insn->off)) } break; + case BPF_STX | BPF_ATOMIC | BPF_B: + case BPF_STX | BPF_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: if (insn->imm == (BPF_AND | BPF_FETCH) || @@ -2148,10 +2199,10 @@ st: if (is_imm8(insn->off)) EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], add_2reg(0xC0, AUX_REG, real_src_reg)); /* Attempt to swap in new value */ - err = emit_atomic(&prog, BPF_CMPXCHG, - real_dst_reg, AUX_REG, - insn->off, - BPF_SIZE(insn->code)); + err = emit_atomic_rmw(&prog, BPF_CMPXCHG, + real_dst_reg, AUX_REG, + insn->off, + BPF_SIZE(insn->code)); if (WARN_ON(err)) return err; /* @@ -2166,17 +2217,35 @@ st: if (is_imm8(insn->off)) break; } - err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + else + err = emit_atomic_rmw(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; + case BPF_STX | BPF_PROBE_ATOMIC | BPF_B: + case BPF_STX | BPF_PROBE_ATOMIC | BPF_H: + if (!bpf_atomic_is_load_store(insn)) { + pr_err("bpf_jit: 1- and 2-byte RMW atomics are not supported\n"); + return -EFAULT; + } + fallthrough; case BPF_STX | BPF_PROBE_ATOMIC | BPF_W: case BPF_STX | BPF_PROBE_ATOMIC | BPF_DW: start_of_ldx = prog; - err = emit_atomic_index(&prog, insn->imm, BPF_SIZE(insn->code), - dst_reg, src_reg, X86_REG_R12, insn->off); + + if (bpf_atomic_is_load_store(insn)) + err = emit_atomic_ld_st_index(&prog, insn->imm, + BPF_SIZE(insn->code), dst_reg, + src_reg, X86_REG_R12, insn->off); + else + err = emit_atomic_rmw_index(&prog, insn->imm, BPF_SIZE(insn->code), + dst_reg, src_reg, X86_REG_R12, + insn->off); if (err) return err; goto populate_extable; @@ -3791,3 +3860,8 @@ u64 bpf_arch_uaddress_limit(void) { return 0; } + +bool bpf_jit_supports_timed_may_goto(void) +{ + return true; +} diff --git a/arch/x86/net/bpf_timed_may_goto.S b/arch/x86/net/bpf_timed_may_goto.S new file mode 100644 index 0000000000000..20de4a14dc4cf --- /dev/null +++ b/arch/x86/net/bpf_timed_may_goto.S @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ + +#include +#include +#include + + .code64 + .section .text, "ax" + +SYM_FUNC_START(arch_bpf_timed_may_goto) + ANNOTATE_NOENDBR + + /* Save r0-r5. */ + pushq %rax + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %r8 + + /* + * r10 passes us stack depth, load the pointer to count and timestamp as + * first argument to the call below. + */ + leaq (%rbp, %r10, 1), %rdi + + /* Emit call depth accounting for call below. */ + CALL_DEPTH_ACCOUNT + call bpf_check_timed_may_goto + + /* BPF_REG_AX=r10 will be stored into count, so move return value to it. */ + movq %rax, %r10 + + /* Restore r5-r0. */ + popq %r8 + popq %rcx + popq %rdx + popq %rsi + popq %rdi + popq %rax + + RET +SYM_FUNC_END(arch_bpf_timed_may_goto) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7fc69083e7450..9de7adb682948 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -111,6 +111,7 @@ struct bpf_prog_list { struct bpf_prog *prog; struct bpf_cgroup_link *link; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; + u32 flags; }; int cgroup_bpf_inherit(struct cgroup *cgrp); diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 15164787ce7f8..7d55553de3fce 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -991,6 +991,21 @@ static inline bool bpf_pseudo_func(const struct bpf_insn *insn) return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; } +/* Given a BPF_ATOMIC instruction @atomic_insn, return true if it is an + * atomic load or store, and false if it is a read-modify-write instruction. + */ +static inline bool +bpf_atomic_is_load_store(const struct bpf_insn *atomic_insn) +{ + switch (atomic_insn->imm) { + case BPF_LOAD_ACQ: + case BPF_STORE_REL: + return true; + default: + return false; + } +} + struct bpf_prog_ops { int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -1531,6 +1546,7 @@ struct bpf_prog_aux { bool jits_use_priv_stack; bool priv_stack_requested; bool changes_pkt_data; + bool might_sleep; u64 prog_array_member_cnt; /* counts how many times as member of prog_array */ struct mutex ext_mutex; /* mutex for is_extended and prog_array_member_cnt */ struct bpf_arena *arena; @@ -1986,6 +2002,7 @@ struct bpf_array { */ enum { BPF_MAX_LOOPS = 8 * 1024 * 1024, + BPF_MAX_TIMED_LOOPS = 0xffff, }; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ @@ -2354,7 +2371,7 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **page_array); #ifdef CONFIG_MEMCG void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index bbd013c38ff9f..d6cfc4ee6820c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -591,6 +591,8 @@ struct bpf_insn_aux_data { * accepts callback function as a parameter. */ bool calls_callback; + /* registers alive before this instruction. */ + u16 live_regs_before; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ @@ -667,6 +669,7 @@ struct bpf_subprog_info { /* true if bpf_fastcall stack region is used by functions that can't be inlined */ bool keep_fastcall_stack: 1; bool changes_pkt_data: 1; + bool might_sleep: 1; enum priv_stack_mode priv_stack_mode; u8 arg_cnt; @@ -747,7 +750,11 @@ struct bpf_verifier_env { struct { int *insn_state; int *insn_stack; + /* vector of instruction indexes sorted in post-order */ + int *insn_postorder; int cur_stack; + /* current position in the insn_postorder vector */ + int cur_postorder; } cfg; struct backtrack_state bt; struct bpf_insn_hist_entry *insn_hist; diff --git a/include/linux/filter.h b/include/linux/filter.h index 3ed6eb9e7c733..590476743f7a3 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -364,6 +364,8 @@ static inline bool insn_is_cast_user(const struct bpf_insn *insn) * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) + * BPF_LOAD_ACQ dst_reg = smp_load_acquire(src_reg + off16) + * BPF_STORE_REL smp_store_release(dst_reg + off16, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ @@ -669,6 +671,11 @@ struct bpf_prog_stats { struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); +struct bpf_timed_may_goto { + u64 count; + u64 timestamp; +}; + struct sk_filter { refcount_t refcnt; struct rcu_head rcu; @@ -1130,8 +1137,11 @@ bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); +bool bpf_jit_supports_timed_may_goto(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); +u64 arch_bpf_timed_may_goto(void); +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *); bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6bb1a5a7a4ae3..c9fa6309c903b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -39,6 +39,25 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags) +{ + /* + * !__GFP_DIRECT_RECLAIM -> direct claim is not allowed. + * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd. + * All GFP_* flags including GFP_NOWAIT use one or both flags. + * try_alloc_pages() is the only API that doesn't specify either flag. + * + * This is stronger than GFP_NOWAIT or GFP_ATOMIC because + * those are guaranteed to never block on a sleeping lock. + * Here we are enforcing that the allocation doesn't ever spin + * on any locks (i.e. only trylocks). There is no high level + * GFP_$FOO flag for this use in try_alloc_pages() as the + * regular page allocator doesn't fully support this + * allocation mode. + */ + return !!(gfp_flags & __GFP_RECLAIM); +} + #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else @@ -335,6 +354,9 @@ static inline struct page *alloc_page_vma_noprof(gfp_t gfp, } #define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) +struct page *try_alloc_pages_noprof(int nid, unsigned int order); +#define try_alloc_pages(...) alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__)) + extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); #define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) @@ -357,6 +379,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); +extern void free_pages_nolock(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) diff --git a/include/linux/local_lock.h b/include/linux/local_lock.h index 091dc0b6bdfb9..1a0bc35839e36 100644 --- a/include/linux/local_lock.h +++ b/include/linux/local_lock.h @@ -51,6 +51,76 @@ #define local_unlock_irqrestore(lock, flags) \ __local_unlock_irqrestore(lock, flags) +/** + * localtry_lock_init - Runtime initialize a lock instance + */ +#define localtry_lock_init(lock) __localtry_lock_init(lock) + +/** + * localtry_lock - Acquire a per CPU local lock + * @lock: The lock variable + */ +#define localtry_lock(lock) __localtry_lock(lock) + +/** + * localtry_lock_irq - Acquire a per CPU local lock and disable interrupts + * @lock: The lock variable + */ +#define localtry_lock_irq(lock) __localtry_lock_irq(lock) + +/** + * localtry_lock_irqsave - Acquire a per CPU local lock, save and disable + * interrupts + * @lock: The lock variable + * @flags: Storage for interrupt flags + */ +#define localtry_lock_irqsave(lock, flags) \ + __localtry_lock_irqsave(lock, flags) + +/** + * localtry_trylock - Try to acquire a per CPU local lock. + * @lock: The lock variable + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constrains it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT. + */ +#define localtry_trylock(lock) __localtry_trylock(lock) + +/** + * localtry_trylock_irqsave - Try to acquire a per CPU local lock, save and disable + * interrupts if acquired + * @lock: The lock variable + * @flags: Storage for interrupt flags + * + * The function can be used in any context such as NMI or HARDIRQ. Due to + * locking constrains it will _always_ fail to acquire the lock in NMI or + * HARDIRQ context on PREEMPT_RT. + */ +#define localtry_trylock_irqsave(lock, flags) \ + __localtry_trylock_irqsave(lock, flags) + +/** + * local_unlock - Release a per CPU local lock + * @lock: The lock variable + */ +#define localtry_unlock(lock) __localtry_unlock(lock) + +/** + * local_unlock_irq - Release a per CPU local lock and enable interrupts + * @lock: The lock variable + */ +#define localtry_unlock_irq(lock) __localtry_unlock_irq(lock) + +/** + * localtry_unlock_irqrestore - Release a per CPU local lock and restore + * interrupt flags + * @lock: The lock variable + * @flags: Interrupt flags to restore + */ +#define localtry_unlock_irqrestore(lock, flags) \ + __localtry_unlock_irqrestore(lock, flags) + DEFINE_GUARD(local_lock, local_lock_t __percpu*, local_lock(_T), local_unlock(_T)) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 8dd71fbbb6d2b..67bd13d142fac 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -15,6 +15,11 @@ typedef struct { #endif } local_lock_t; +typedef struct { + local_lock_t llock; + unsigned int acquired; +} localtry_lock_t; + #ifdef CONFIG_DEBUG_LOCK_ALLOC # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ @@ -31,6 +36,13 @@ static inline void local_lock_acquire(local_lock_t *l) l->owner = current; } +static inline void local_trylock_acquire(local_lock_t *l) +{ + lock_map_acquire_try(&l->dep_map); + DEBUG_LOCKS_WARN_ON(l->owner); + l->owner = current; +} + static inline void local_lock_release(local_lock_t *l) { DEBUG_LOCKS_WARN_ON(l->owner != current); @@ -45,11 +57,13 @@ static inline void local_lock_debug_init(local_lock_t *l) #else /* CONFIG_DEBUG_LOCK_ALLOC */ # define LOCAL_LOCK_DEBUG_INIT(lockname) static inline void local_lock_acquire(local_lock_t *l) { } +static inline void local_trylock_acquire(local_lock_t *l) { } static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } +#define INIT_LOCALTRY_LOCK(lockname) { .llock = { LOCAL_LOCK_DEBUG_INIT(lockname.llock) }} #define __local_lock_init(lock) \ do { \ @@ -118,6 +132,104 @@ do { \ #define __local_unlock_nested_bh(lock) \ local_lock_release(this_cpu_ptr(lock)) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) \ +do { \ + __local_lock_init(&(lock)->llock); \ + WRITE_ONCE((lock)->acquired, 0); \ +} while (0) + +#define __localtry_lock(lock) \ + do { \ + localtry_lock_t *lt; \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + local_irq_disable(); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_lock_irqsave(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + local_lock_acquire(<->llock); \ + WRITE_ONCE(lt->acquired, 1); \ + } while (0) + +#define __localtry_trylock(lock) \ + ({ \ + localtry_lock_t *lt; \ + bool _ret; \ + \ + preempt_disable(); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + preempt_enable(); \ + } \ + _ret; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + localtry_lock_t *lt; \ + bool _ret; \ + \ + local_irq_save(flags); \ + lt = this_cpu_ptr(lock); \ + if (!READ_ONCE(lt->acquired)) { \ + WRITE_ONCE(lt->acquired, 1); \ + local_trylock_acquire(<->llock); \ + _ret = true; \ + } else { \ + _ret = false; \ + local_irq_restore(flags); \ + } \ + _ret; \ + }) + +#define __localtry_unlock(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + preempt_enable(); \ + } while (0) + +#define __localtry_unlock_irq(lock) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_enable(); \ + } while (0) + +#define __localtry_unlock_irqrestore(lock, flags) \ + do { \ + localtry_lock_t *lt; \ + lt = this_cpu_ptr(lock); \ + WRITE_ONCE(lt->acquired, 0); \ + local_lock_release(<->llock); \ + local_irq_restore(flags); \ + } while (0) + #else /* !CONFIG_PREEMPT_RT */ /* @@ -125,8 +237,10 @@ do { \ * critical section while staying preemptible. */ typedef spinlock_t local_lock_t; +typedef spinlock_t localtry_lock_t; #define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname)) +#define INIT_LOCALTRY_LOCK(lockname) INIT_LOCAL_LOCK(lockname) #define __local_lock_init(l) \ do { \ @@ -169,4 +283,36 @@ do { \ spin_unlock(this_cpu_ptr((lock))); \ } while (0) +/* localtry_lock_t variants */ + +#define __localtry_lock_init(lock) __local_lock_init(lock) +#define __localtry_lock(lock) __local_lock(lock) +#define __localtry_lock_irq(lock) __local_lock(lock) +#define __localtry_lock_irqsave(lock, flags) __local_lock_irqsave(lock, flags) +#define __localtry_unlock(lock) __local_unlock(lock) +#define __localtry_unlock_irq(lock) __local_unlock(lock) +#define __localtry_unlock_irqrestore(lock, flags) __local_unlock_irqrestore(lock, flags) + +#define __localtry_trylock(lock) \ + ({ \ + int __locked; \ + \ + if (in_nmi() | in_hardirq()) { \ + __locked = 0; \ + } else { \ + migrate_disable(); \ + __locked = spin_trylock(this_cpu_ptr((lock))); \ + if (!__locked) \ + migrate_enable(); \ + } \ + __locked; \ + }) + +#define __localtry_trylock_irqsave(lock, flags) \ + ({ \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __localtry_trylock(lock); \ + }) + #endif /* CONFIG_PREEMPT_RT */ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index e2f1ce37c41ef..f5aafd3ba5d4d 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -426,14 +426,14 @@ LSM_HOOK(void, LSM_RET_VOID, audit_rule_free, void *lsmrule) #endif /* CONFIG_AUDIT */ #ifdef CONFIG_BPF_SYSCALL -LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size) +LSM_HOOK(int, 0, bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) LSM_HOOK(int, 0, bpf_map, struct bpf_map *map, fmode_t fmode) LSM_HOOK(int, 0, bpf_prog, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_map_create, struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) LSM_HOOK(void, LSM_RET_VOID, bpf_map_free, struct bpf_map *map) LSM_HOOK(int, 0, bpf_prog_load, struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) LSM_HOOK(void, LSM_RET_VOID, bpf_prog_free, struct bpf_prog *prog) LSM_HOOK(int, 0, bpf_token_create, struct bpf_token *token, union bpf_attr *attr, const struct path *path) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f94963..483aa90242cdf 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -99,6 +99,10 @@ struct page { /* Or, free page */ struct list_head buddy_list; struct list_head pcp_list; + struct { + struct llist_node pcp_llist; + unsigned int order; + }; }; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 9540b41894da6..e169395539304 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -972,6 +972,9 @@ struct zone { /* Primarily protects free_area */ spinlock_t lock; + /* Pages to be freed when next trylock succeeds */ + struct llist_head trylock_free_pages; + /* Write-intensive fields used by compaction and vmstats. */ CACHELINE_PADDING(_pad2_); diff --git a/include/linux/security.h b/include/linux/security.h index 980b6c207cade..b2010034f82bc 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2249,14 +2249,14 @@ struct bpf_map; struct bpf_prog; struct bpf_token; #ifdef CONFIG_SECURITY -extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size); +extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel); extern int security_bpf_map(struct bpf_map *map, fmode_t fmode); extern int security_bpf_prog(struct bpf_prog *prog); extern int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token); + struct bpf_token *token, bool kernel); extern void security_bpf_map_free(struct bpf_map *map); extern int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token); + struct bpf_token *token, bool kernel); extern void security_bpf_prog_free(struct bpf_prog *prog); extern int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr, const struct path *path); @@ -2265,7 +2265,7 @@ extern int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cm extern int security_bpf_token_capable(const struct bpf_token *token, int cap); #else static inline int security_bpf(int cmd, union bpf_attr *attr, - unsigned int size) + unsigned int size, bool kernel) { return 0; } @@ -2281,7 +2281,7 @@ static inline int security_bpf_prog(struct bpf_prog *prog) } static inline int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { return 0; } @@ -2290,7 +2290,7 @@ static inline void security_bpf_map_free(struct bpf_map *map) { } static inline int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { return 0; } diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 5eea47f135a42..c95d3b1da1990 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -172,6 +172,8 @@ struct inet_hashinfo { struct inet_listen_hashbucket *lhash2; bool pernet; + + atomic64_t ver; } ____cacheline_aligned_in_smp; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) diff --git a/include/net/sock.h b/include/net/sock.h index 8036b3b79cd8b..b11f43e8e7ec7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -228,6 +228,7 @@ struct sock_common { u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; + __s64 skc_idx; /* public: */ }; @@ -378,6 +379,7 @@ struct sock { #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash +#define sk_idx __sk_common.skc_idx __cacheline_group_begin(sock_write_rx); diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d08473a6dc00..499acd6da35f8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2202,7 +2202,8 @@ struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; - int bucket, offset, sbucket, num; + int bucket, sbucket, num; + __s64 prev_idx; loff_t last_pos; }; diff --git a/include/net/udp.h b/include/net/udp.h index 6e89520e100dc..9398561addc68 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -102,6 +102,7 @@ struct udp_table { #endif unsigned int mask; unsigned int log; + atomic64_t ver; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fff6cdb8d11a2..bb37897c03939 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -51,6 +51,9 @@ #define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ #define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_LOAD_ACQ 0x100 /* load-acquire */ +#define BPF_STORE_REL 0x110 /* store-release */ + enum bpf_cond_pseudo_jmp { BPF_MAY_GOTO = 0, }; @@ -1207,6 +1210,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PREORDER (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c index 647b709d7d77f..0d56cea716022 100644 --- a/kernel/bpf/arena.c +++ b/kernel/bpf/arena.c @@ -287,7 +287,7 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf) return VM_FAULT_SIGSEGV; /* Account into memcg of the process that created bpf_arena */ - ret = bpf_map_alloc_pages(map, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, 1, &page); + ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page); if (ret) { range_tree_set(&arena->rt, vmf->pgoff, 1); return VM_FAULT_SIGSEGV; @@ -465,8 +465,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt if (ret) goto out_free_pages; - ret = bpf_map_alloc_pages(&arena->map, GFP_KERNEL | __GFP_ZERO, - node_id, page_cnt, pages); + ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages); if (ret) goto out; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 46e5db65dbc8d..84f58f3d028a3 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) /* count number of elements in the list. * it's slow but the list cannot be long */ -static u32 prog_list_length(struct hlist_head *head) +static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) { struct bpf_prog_list *pl; u32 cnt = 0; @@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head) hlist_for_each_entry(pl, head, node) { if (!prog_list_prog(pl)) continue; + if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) + (*preorder_cnt)++; cnt++; } return cnt; @@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, if (flags & BPF_F_ALLOW_MULTI) return true; - cnt = prog_list_length(&p->bpf.progs[atype]); + cnt = prog_list_length(&p->bpf.progs[atype], NULL); WARN_ON_ONCE(cnt > 1); if (cnt == 1) return !!(flags & BPF_F_ALLOW_OVERRIDE); @@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp, struct bpf_prog_array *progs; struct bpf_prog_list *pl; struct cgroup *p = cgrp; - int cnt = 0; + int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; /* count number of effective programs by walking parents */ do { if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) - cnt += prog_list_length(&p->bpf.progs[atype]); + cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); p = cgroup_parent(p); } while (p); @@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp, /* populate the array with effective progs */ cnt = 0; p = cgrp; + fstart = preorder_cnt; + bstart = preorder_cnt - 1; do { if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) continue; + init_bstart = bstart; hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { if (!prog_list_prog(pl)) continue; - item = &progs->items[cnt]; + if (pl->flags & BPF_F_PREORDER) { + item = &progs->items[bstart]; + bstart--; + } else { + item = &progs->items[fstart]; + fstart++; + } item->prog = prog_list_prog(pl); bpf_cgroup_storages_assign(item->cgroup_storage, pl->storage); cnt++; } + + /* reverse pre-ordering progs at this cgroup level */ + for (i = bstart + 1, j = init_bstart; i < j; i++, j--) + swap(progs->items[i], progs->items[j]); + } while ((p = cgroup_parent(p))); *array = progs; @@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, */ return -EPERM; - if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) + if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) return -E2BIG; pl = find_attach_entry(progs, prog, link, replace_prog, @@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp, pl->prog = prog; pl->link = link; + pl->flags = flags; bpf_cgroup_storages_assign(pl->storage, storage); cgrp->bpf.flags[atype] = saved_flags; @@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, lockdep_is_held(&cgroup_mutex)); total_cnt += bpf_prog_array_length(effective); } else { - total_cnt += prog_list_length(&cgrp->bpf.progs[atype]); + total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); } } @@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, u32 id; progs = &cgrp->bpf.progs[atype]; - cnt = min_t(int, prog_list_length(progs), total_cnt); + cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); i = 0; hlist_for_each_entry(pl, progs, node) { prog = prog_list_prog(pl); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a0200fbbace99..62cb9557ad3be 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1663,14 +1663,17 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ + /* Atomic operations. */ \ + INSN_3(STX, ATOMIC, B), \ + INSN_3(STX, ATOMIC, H), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, ATOMIC, W), \ - INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -2152,24 +2155,33 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ + else \ + goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ - else \ + else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ + else \ + goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: + STX_ATOMIC_H: + STX_ATOMIC_B: switch (IMM) { + /* Atomic read-modify-write instructions support only W and DW + * size modifiers. + */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) @@ -2181,20 +2193,63 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); + else + goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); - else + else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); + else + goto default_label; + break; + /* Atomic load and store instructions support all size + * modifiers. + */ + case BPF_LOAD_ACQ: + switch (BPF_SIZE(insn->code)) { +#define LOAD_ACQUIRE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + DST = (SIZE)smp_load_acquire( \ + (SIZE *)(unsigned long)(SRC + insn->off)); \ + break; + LOAD_ACQUIRE(B, u8) + LOAD_ACQUIRE(H, u16) + LOAD_ACQUIRE(W, u32) +#ifdef CONFIG_64BIT + LOAD_ACQUIRE(DW, u64) +#endif +#undef LOAD_ACQUIRE + default: + goto default_label; + } + break; + case BPF_STORE_REL: + switch (BPF_SIZE(insn->code)) { +#define STORE_RELEASE(SIZEOP, SIZE) \ + case BPF_##SIZEOP: \ + smp_store_release( \ + (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ + break; + STORE_RELEASE(B, u8) + STORE_RELEASE(H, u16) + STORE_RELEASE(W, u32) +#ifdef CONFIG_64BIT + STORE_RELEASE(DW, u64) +#endif +#undef STORE_RELEASE + default: + goto default_label; + } break; default: @@ -3069,6 +3124,32 @@ void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, { } +bool __weak bpf_jit_supports_timed_may_goto(void) +{ + return false; +} + +u64 __weak arch_bpf_timed_may_goto(void) +{ + return 0; +} + +u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) +{ + u64 time = ktime_get_mono_fast_ns(); + + /* Populate the timestamp for this stack frame, and refresh count. */ + if (!p->timestamp) { + p->timestamp = time; + return BPF_MAX_TIMED_LOOPS; + } + /* Check if we've exhausted our time slice, and zero count. */ + if (time - p->timestamp >= (NSEC_PER_SEC / 4)) + return 0; + /* Refresh the count for the stack frame. */ + return BPF_MAX_TIMED_LOOPS; +} + /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c index cfa1c18e3a483..9876c5fe6c2a2 100644 --- a/kernel/bpf/cpumask.c +++ b/kernel/bpf/cpumask.c @@ -45,6 +45,10 @@ __bpf_kfunc_start_defs(); * * bpf_cpumask_create() allocates memory using the BPF memory allocator, and * will not block. It may return NULL if no memory is available. + * + * Return: + * * A pointer to a new struct bpf_cpumask instance on success. + * * NULL if the BPF memory allocator is out of memory. */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) { @@ -71,6 +75,10 @@ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_create(void) * Acquires a reference to a BPF cpumask. The cpumask returned by this function * must either be embedded in a map as a kptr, or freed with * bpf_cpumask_release(). + * + * Return: + * * The struct bpf_cpumask pointer passed to the function. + * */ __bpf_kfunc struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) { @@ -106,6 +114,9 @@ CFI_NOSEAL(bpf_cpumask_release_dtor); * * Find the index of the first nonzero bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first nonzero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) { @@ -119,6 +130,9 @@ __bpf_kfunc u32 bpf_cpumask_first(const struct cpumask *cpumask) * * Find the index of the first unset bit of the cpumask. A struct bpf_cpumask * pointer may be safely passed to this function. + * + * Return: + * * The index of the first zero bit in the struct cpumask. */ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) { @@ -133,6 +147,9 @@ __bpf_kfunc u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) * * Find the index of the first nonzero bit of the AND of two cpumasks. * struct bpf_cpumask pointers may be safely passed to @src1 and @src2. + * + * Return: + * * The index of the first bit that is nonzero in both cpumask instances. */ __bpf_kfunc u32 bpf_cpumask_first_and(const struct cpumask *src1, const struct cpumask *src2) @@ -414,12 +431,47 @@ __bpf_kfunc u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, * @cpumask: The cpumask being queried. * * Count the number of set bits in the given cpumask. + * + * Return: + * * The number of bits set in the mask. */ __bpf_kfunc u32 bpf_cpumask_weight(const struct cpumask *cpumask) { return cpumask_weight(cpumask); } +/** + * bpf_cpumask_populate() - Populate the CPU mask from the contents of + * a BPF memory region. + * + * @cpumask: The cpumask being populated. + * @src: The BPF memory holding the bit pattern. + * @src__sz: Length of the BPF memory region in bytes. + * + * Return: + * * 0 if the struct cpumask * instance was populated successfully. + * * -EACCES if the memory region is too small to populate the cpumask. + * * -EINVAL if the memory region is not aligned to the size of a long + * and the architecture does not support efficient unaligned accesses. + */ +__bpf_kfunc int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) +{ + unsigned long source = (unsigned long)src; + + /* The memory region must be large enough to populate the entire CPU mask. */ + if (src__sz < bitmap_size(nr_cpu_ids)) + return -EACCES; + + /* If avoiding unaligned accesses, the input region must be aligned to the nearest long. */ + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + !IS_ALIGNED(source, sizeof(long))) + return -EINVAL; + + bitmap_copy(cpumask_bits(cpumask), src, nr_cpu_ids); + + return 0; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(cpumask_kfunc_btf_ids) @@ -448,6 +500,7 @@ BTF_ID_FLAGS(func, bpf_cpumask_copy, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_any_and_distribute, KF_RCU) BTF_ID_FLAGS(func, bpf_cpumask_weight, KF_RCU) +BTF_ID_FLAGS(func, bpf_cpumask_populate, KF_RCU) BTF_KFUNCS_END(cpumask_kfunc_btf_ids) static const struct btf_kfunc_id_set cpumask_kfunc_set = { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 309c4aa1b026a..974d172d6735a 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -267,6 +267,18 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? "64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ) { + verbose(cbs->private_data, "(%02x) r%d = load_acquire((%s *)(r%d %+d))\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_STORE_REL) { + verbose(cbs->private_data, "(%02x) store_release((%s *)(r%d %+d), r%d)\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 183298fc11ba4..5449756ba102e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1759,8 +1759,8 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; -BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, - u32, offset, u64, flags) +static int __bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr_kern *src, + u32 offset, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1793,6 +1793,12 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern } } +BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, + u32, offset, u64, flags) +{ + return __bpf_dynptr_read(dst, len, src, offset, flags); +} + static const struct bpf_func_proto bpf_dynptr_read_proto = { .func = bpf_dynptr_read, .gpl_only = false, @@ -1804,8 +1810,8 @@ static const struct bpf_func_proto bpf_dynptr_read_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, - u32, len, u64, flags) +static int __bpf_dynptr_write(const struct bpf_dynptr_kern *dst, u32 offset, void *src, + u32 len, u64 flags) { enum bpf_dynptr_type type; int err; @@ -1843,6 +1849,12 @@ BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, v } } +BPF_CALL_5(bpf_dynptr_write, const struct bpf_dynptr_kern *, dst, u32, offset, void *, src, + u32, len, u64, flags) +{ + return __bpf_dynptr_write(dst, offset, src, len, flags); +} + static const struct bpf_func_proto bpf_dynptr_write_proto = { .func = bpf_dynptr_write, .gpl_only = false, @@ -2758,6 +2770,61 @@ __bpf_kfunc int bpf_dynptr_clone(const struct bpf_dynptr *p, return 0; } +/** + * bpf_dynptr_copy() - Copy data from one dynptr to another. + * @dst_ptr: Destination dynptr - where data should be copied to + * @dst_off: Offset into the destination dynptr + * @src_ptr: Source dynptr - where data should be copied from + * @src_off: Offset into the source dynptr + * @size: Length of the data to copy from source to destination + * + * Copies data from source dynptr to destination dynptr. + * Returns 0 on success; negative error, otherwise. + */ +__bpf_kfunc int bpf_dynptr_copy(struct bpf_dynptr *dst_ptr, u32 dst_off, + struct bpf_dynptr *src_ptr, u32 src_off, u32 size) +{ + struct bpf_dynptr_kern *dst = (struct bpf_dynptr_kern *)dst_ptr; + struct bpf_dynptr_kern *src = (struct bpf_dynptr_kern *)src_ptr; + void *src_slice, *dst_slice; + char buf[256]; + u32 off; + + src_slice = bpf_dynptr_slice(src_ptr, src_off, NULL, size); + dst_slice = bpf_dynptr_slice_rdwr(dst_ptr, dst_off, NULL, size); + + if (src_slice && dst_slice) { + memmove(dst_slice, src_slice, size); + return 0; + } + + if (src_slice) + return __bpf_dynptr_write(dst, dst_off, src_slice, size, 0); + + if (dst_slice) + return __bpf_dynptr_read(dst_slice, size, src, src_off, 0); + + if (bpf_dynptr_check_off_len(dst, dst_off, size) || + bpf_dynptr_check_off_len(src, src_off, size)) + return -E2BIG; + + off = 0; + while (off < size) { + u32 chunk_sz = min_t(u32, sizeof(buf), size - off); + int err; + + err = __bpf_dynptr_read(buf, chunk_sz, src, src_off + off, 0); + if (err) + return err; + err = __bpf_dynptr_write(dst, dst_off + off, buf, chunk_sz, 0); + if (err) + return err; + + off += chunk_sz; + } + return 0; +} + __bpf_kfunc void *bpf_cast_to_kern_ctx(void *obj) { return obj; @@ -3206,6 +3273,7 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null) BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly) BTF_ID_FLAGS(func, bpf_dynptr_size) BTF_ID_FLAGS(func, bpf_dynptr_clone) +BTF_ID_FLAGS(func, bpf_dynptr_copy) #ifdef CONFIG_NET BTF_ID_FLAGS(func, bpf_modify_return_test_tp) #endif diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 0c63bc2cd895a..2fdf3c978db1b 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -90,3 +90,4 @@ static void __exit fini(void) late_initcall(load); module_exit(fini); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dbd89c13dd328..45dc98d526cc3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -569,7 +569,24 @@ static void bpf_map_release_memcg(struct bpf_map *map) } #endif -int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, +static bool can_alloc_pages(void) +{ + return preempt_count() == 0 && !irqs_disabled() && + !IS_ENABLED(CONFIG_PREEMPT_RT); +} + +static struct page *__bpf_alloc_page(int nid) +{ + if (!can_alloc_pages()) + return try_alloc_pages(nid, 0); + + return alloc_pages_node(nid, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT + | __GFP_NOWARN, + 0); +} + +int bpf_map_alloc_pages(const struct bpf_map *map, int nid, unsigned long nr_pages, struct page **pages) { unsigned long i, j; @@ -582,14 +599,14 @@ int bpf_map_alloc_pages(const struct bpf_map *map, gfp_t gfp, int nid, old_memcg = set_active_memcg(memcg); #endif for (i = 0; i < nr_pages; i++) { - pg = alloc_pages_node(nid, gfp | __GFP_ACCOUNT, 0); + pg = __bpf_alloc_page(nid); if (pg) { pages[i] = pg; continue; } for (j = 0; j < i; j++) - __free_page(pages[j]); + free_pages_nolock(pages[j], 0); ret = -ENOMEM; break; } @@ -1315,7 +1332,7 @@ static bool bpf_net_capable(void) #define BPF_MAP_CREATE_LAST_FIELD map_token_fd /* called via syscall */ -static int map_create(union bpf_attr *attr) +static int map_create(union bpf_attr *attr, bool kernel) { const struct bpf_map_ops *ops; struct bpf_token *token = NULL; @@ -1505,7 +1522,7 @@ static int map_create(union bpf_attr *attr) attr->btf_vmlinux_value_type_id; } - err = security_bpf_map_create(map, attr, token); + err = security_bpf_map_create(map, attr, token, kernel); if (err) goto free_map_sec; @@ -1593,11 +1610,8 @@ struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, bool uref) struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map) { - spin_lock_bh(&map_idr_lock); - map = __bpf_map_inc_not_zero(map, false); - spin_unlock_bh(&map_idr_lock); - - return map; + lockdep_assert(rcu_read_lock_held()); + return __bpf_map_inc_not_zero(map, false); } EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); @@ -2945,7 +2959,7 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size) if (err < 0) goto free_prog; - err = security_bpf_prog_load(prog, attr, token); + err = security_bpf_prog_load(prog, attr, token, uattr.is_kernel); if (err) goto free_prog_sec; @@ -4170,7 +4184,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, #define BPF_F_ATTACH_MASK_BASE \ (BPF_F_ALLOW_OVERRIDE | \ BPF_F_ALLOW_MULTI | \ - BPF_F_REPLACE) + BPF_F_REPLACE | \ + BPF_F_PREORDER) #define BPF_F_ATTACH_MASK_MPROG \ (BPF_F_REPLACE | \ @@ -5769,13 +5784,13 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size) if (copy_from_bpfptr(&attr, uattr, size) != 0) return -EFAULT; - err = security_bpf(cmd, &attr, size); + err = security_bpf(cmd, &attr, size, uattr.is_kernel); if (err < 0) return err; switch (cmd) { case BPF_MAP_CREATE: - err = map_create(&attr); + err = map_create(&attr, uattr.is_kernel); break; case BPF_MAP_LOOKUP_ELEM: err = map_lookup_elem(&attr); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dcd0da4e62fca..3303a3605ee80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -579,6 +579,13 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn) insn->imm == BPF_CMPXCHG; } +static bool is_atomic_load_insn(const struct bpf_insn *insn) +{ + return BPF_CLASS(insn->code) == BPF_STX && + BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_LOAD_ACQ; +} + static int __get_spi(s32 off) { return (-off - 1) / BPF_REG_SIZE; @@ -3353,6 +3360,15 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env) return 0; } +static int jmp_offset(struct bpf_insn *insn) +{ + u8 code = insn->code; + + if (code == (BPF_JMP32 | BPF_JA)) + return insn->imm; + return insn->off; +} + static int check_subprogs(struct bpf_verifier_env *env) { int i, subprog_start, subprog_end, off, cur_subprog = 0; @@ -3379,10 +3395,7 @@ static int check_subprogs(struct bpf_verifier_env *env) goto next; if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) goto next; - if (code == (BPF_JMP32 | BPF_JA)) - off = i + insn[i].imm + 1; - else - off = i + insn[i].off + 1; + off = i + jmp_offset(&insn[i]) + 1; if (off < subprog_start || off >= subprog_end) { verbose(env, "jump out of range from insn %d to %d\n", i, off); return -EINVAL; @@ -3567,7 +3580,7 @@ static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, } if (class == BPF_STX) { - /* BPF_STX (including atomic variants) has multiple source + /* BPF_STX (including atomic variants) has one or more source * operands, one of which is a ptr. Check whether the caller is * asking about it. */ @@ -3912,6 +3925,17 @@ static const char *disasm_kfunc_name(void *data, const struct bpf_insn *insn) return btf_name_by_offset(desc_btf, func->name_off); } +static void verbose_insn(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + const struct bpf_insn_cbs cbs = { + .cb_call = disasm_kfunc_name, + .cb_print = verbose, + .private_data = env, + }; + + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +} + static inline void bt_init(struct backtrack_state *bt, u32 frame) { bt->frame = frame; @@ -4112,11 +4136,6 @@ static bool calls_callback(struct bpf_verifier_env *env, int insn_idx); static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, struct bpf_insn_hist_entry *hist, struct backtrack_state *bt) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; struct bpf_insn *insn = env->prog->insnsi + idx; u8 class = BPF_CLASS(insn->code); u8 opcode = BPF_OP(insn->code); @@ -4134,7 +4153,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN, bt_stack_mask(bt)); verbose(env, "stack=%s before ", env->tmp_str_buf); verbose(env, "%d: ", idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); } /* If there is a history record that some registers gained range at this insn, @@ -4181,7 +4200,7 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx, * dreg still needs precision before this insn */ } - } else if (class == BPF_LDX) { + } else if (class == BPF_LDX || is_atomic_load_insn(insn)) { if (!bt_is_reg_set(bt, dreg)) return 0; bt_clear_reg(bt, dreg); @@ -6195,6 +6214,26 @@ static bool is_arena_reg(struct bpf_verifier_env *env, int regno) return reg->type == PTR_TO_ARENA; } +/* Return false if @regno contains a pointer whose type isn't supported for + * atomic instruction @insn. + */ +static bool atomic_ptr_type_ok(struct bpf_verifier_env *env, int regno, + struct bpf_insn *insn) +{ + if (is_ctx_reg(env, regno)) + return false; + if (is_pkt_reg(env, regno)) + return false; + if (is_flow_key_reg(env, regno)) + return false; + if (is_sk_reg(env, regno)) + return false; + if (is_arena_reg(env, regno)) + return bpf_jit_supports_insn(insn, true); + + return true; +} + static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = { #ifdef CONFIG_NET [PTR_TO_SOCKET] = &btf_sock_ids[BTF_SOCK_TYPE_SOCK], @@ -7596,27 +7635,72 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int save_aux_ptr_type(struct bpf_verifier_env *env, enum bpf_reg_type type, bool allow_trust_mismatch); -static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_load_mem(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once, bool is_ldsx, + bool allow_trust_mismatch, const char *ctx) { - int load_reg; + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type src_reg_type; int err; - switch (insn->imm) { - case BPF_ADD: - case BPF_ADD | BPF_FETCH: - case BPF_AND: - case BPF_AND | BPF_FETCH: - case BPF_OR: - case BPF_OR | BPF_FETCH: - case BPF_XOR: - case BPF_XOR | BPF_FETCH: - case BPF_XCHG: - case BPF_CMPXCHG: - break; - default: - verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); - return -EINVAL; - } + /* check src operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check dst operand */ + err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* Check if (src_reg + off) is readable. The state of dst_reg will be + * updated by this call. + */ + err = check_mem_access(env, env->insn_idx, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, insn->dst_reg, + strict_alignment_once, is_ldsx); + err = err ?: save_aux_ptr_type(env, src_reg_type, + allow_trust_mismatch); + err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], ctx); + + return err; +} + +static int check_store_reg(struct bpf_verifier_env *env, struct bpf_insn *insn, + bool strict_alignment_once) +{ + struct bpf_reg_state *regs = cur_regs(env); + enum bpf_reg_type dst_reg_type; + int err; + + /* check src1 operand */ + err = check_reg_arg(env, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(env, insn->dst_reg, SRC_OP); + if (err) + return err; + + dst_reg_type = regs[insn->dst_reg].type; + + /* Check if (dst_reg + off) is writeable. */ + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg, + strict_alignment_once, false); + err = err ?: save_aux_ptr_type(env, dst_reg_type, false); + + return err; +} + +static int check_atomic_rmw(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + int load_reg; + int err; if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { verbose(env, "invalid atomic operand size\n"); @@ -7652,11 +7736,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } - if (is_ctx_reg(env, insn->dst_reg) || - is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg) || - is_sk_reg(env, insn->dst_reg) || - (is_arena_reg(env, insn->dst_reg) && !bpf_jit_supports_insn(insn, true))) { + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str(env, reg_state(env, insn->dst_reg)->type)); @@ -7683,12 +7763,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i /* Check whether we can read the memory, with second call for fetch * case to simulate the register fill. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true, false); if (!err && load_reg >= 0) - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, load_reg, - true, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, load_reg, true, false); if (err) return err; @@ -7698,13 +7778,74 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; } /* Check whether we can write into the same memory. */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + err = check_mem_access(env, env->insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true, false); if (err) return err; return 0; } +static int check_atomic_load(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + if (!atomic_ptr_type_ok(env, insn->src_reg, insn)) { + verbose(env, "BPF_ATOMIC loads from R%d %s is not allowed\n", + insn->src_reg, + reg_type_str(env, reg_state(env, insn->src_reg)->type)); + return -EACCES; + } + + return check_load_mem(env, insn, true, false, false, "atomic_load"); +} + +static int check_atomic_store(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + if (!atomic_ptr_type_ok(env, insn->dst_reg, insn)) { + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", + insn->dst_reg, + reg_type_str(env, reg_state(env, insn->dst_reg)->type)); + return -EACCES; + } + + return check_store_reg(env, insn, true); +} + +static int check_atomic(struct bpf_verifier_env *env, struct bpf_insn *insn) +{ + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: + return check_atomic_rmw(env, insn); + case BPF_LOAD_ACQ: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit load-acquires are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_load(env, insn); + case BPF_STORE_REL: + if (BPF_SIZE(insn->code) == BPF_DW && BITS_PER_LONG != 64) { + verbose(env, + "64-bit store-releases are only supported on 64-bit arches\n"); + return -EOPNOTSUPP; + } + return check_atomic_store(env, insn); + default: + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", + insn->imm); + return -EINVAL; + } +} + /* When register 'regno' is used to read the stack (either directly or through * a helper function) make sure that it's within stack boundary and, depending * on the access type and privileges, that all elements of the stack are @@ -10317,23 +10458,18 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, if (subprog_is_global(env, subprog)) { const char *sub_name = subprog_name(env, subprog); - /* Only global subprogs cannot be called with a lock held. */ if (env->cur_state->active_locks) { verbose(env, "global function calls are not allowed while holding a lock,\n" "use static function instead\n"); return -EINVAL; } - /* Only global subprogs cannot be called with preemption disabled. */ - if (env->cur_state->active_preempt_locks) { - verbose(env, "global function calls are not allowed with preemption disabled,\n" - "use static function instead\n"); - return -EINVAL; - } - - if (env->cur_state->active_irq_id) { - verbose(env, "global function calls are not allowed with IRQs disabled,\n" - "use static function instead\n"); + if (env->subprog_info[subprog].might_sleep && + (env->cur_state->active_rcu_lock || env->cur_state->active_preempt_locks || + env->cur_state->active_irq_id || !in_sleepable(env))) { + verbose(env, "global functions that may sleep are not allowed in non-sleepable context,\n" + "i.e., in a RCU/IRQ/preempt-disabled section, or in\n" + "a non-sleepable BPF program context\n"); return -EINVAL; } @@ -16703,6 +16839,14 @@ static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) subprog->changes_pkt_data = true; } +static void mark_subprog_might_sleep(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->might_sleep = true; +} + /* 't' is an index of a call-site. * 'w' is a callee entry point. * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. @@ -16716,6 +16860,7 @@ static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) caller = find_containing_subprog(env, t); callee = find_containing_subprog(env, w); caller->changes_pkt_data |= callee->changes_pkt_data; + caller->might_sleep |= callee->might_sleep; } /* non-recursive DFS pseudo code @@ -16874,27 +17019,6 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, /* Bitmask with 1s for all caller saved registers */ #define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) -/* Return a bitmask specifying which caller saved registers are - * clobbered by a call to a helper *as if* this helper follows - * bpf_fastcall contract: - * - includes R0 if function is non-void; - * - includes R1-R5 if corresponding parameter has is described - * in the function prototype. - */ -static u32 helper_fastcall_clobber_mask(const struct bpf_func_proto *fn) -{ - u32 mask; - int i; - - mask = 0; - if (fn->ret_type != RET_VOID) - mask |= BIT(BPF_REG_0); - for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) - if (fn->arg_type[i] != ARG_DONTCARE) - mask |= BIT(BPF_REG_1 + i); - return mask; -} - /* True if do_misc_fixups() replaces calls to helper number 'imm', * replacement patch is presumed to follow bpf_fastcall contract * (see mark_fastcall_pattern_for_call() below). @@ -16911,24 +17035,54 @@ static bool verifier_inlines_helper_call(struct bpf_verifier_env *env, s32 imm) } } -/* Same as helper_fastcall_clobber_mask() but for kfuncs, see comment above */ -static u32 kfunc_fastcall_clobber_mask(struct bpf_kfunc_call_arg_meta *meta) +struct call_summary { + u8 num_params; + bool is_void; + bool fastcall; +}; + +/* If @call is a kfunc or helper call, fills @cs and returns true, + * otherwise returns false. + */ +static bool get_call_summary(struct bpf_verifier_env *env, struct bpf_insn *call, + struct call_summary *cs) { - u32 vlen, i, mask; + struct bpf_kfunc_call_arg_meta meta; + const struct bpf_func_proto *fn; + int i; - vlen = btf_type_vlen(meta->func_proto); - mask = 0; - if (!btf_type_is_void(btf_type_by_id(meta->btf, meta->func_proto->type))) - mask |= BIT(BPF_REG_0); - for (i = 0; i < vlen; ++i) - mask |= BIT(BPF_REG_1 + i); - return mask; -} + if (bpf_helper_call(call)) { -/* Same as verifier_inlines_helper_call() but for kfuncs, see comment above */ -static bool is_fastcall_kfunc_call(struct bpf_kfunc_call_arg_meta *meta) -{ - return meta->kfunc_flags & KF_FASTCALL; + if (get_helper_proto(env, call->imm, &fn) < 0) + /* error would be reported later */ + return false; + cs->fastcall = fn->allow_fastcall && + (verifier_inlines_helper_call(env, call->imm) || + bpf_jit_inlines_helper_call(call->imm)); + cs->is_void = fn->ret_type == RET_VOID; + cs->num_params = 0; + for (i = 0; i < ARRAY_SIZE(fn->arg_type); ++i) { + if (fn->arg_type[i] == ARG_DONTCARE) + break; + cs->num_params++; + } + return true; + } + + if (bpf_pseudo_kfunc_call(call)) { + int err; + + err = fetch_kfunc_meta(env, call, &meta, NULL); + if (err < 0) + /* error would be reported later */ + return false; + cs->num_params = btf_type_vlen(meta.func_proto); + cs->fastcall = meta.kfunc_flags & KF_FASTCALL; + cs->is_void = btf_type_is_void(btf_type_by_id(meta.btf, meta.func_proto->type)); + return true; + } + + return false; } /* LLVM define a bpf_fastcall function attribute. @@ -17011,39 +17165,23 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, { struct bpf_insn *insns = env->prog->insnsi, *stx, *ldx; struct bpf_insn *call = &env->prog->insnsi[insn_idx]; - const struct bpf_func_proto *fn; - u32 clobbered_regs_mask = ALL_CALLER_SAVED_REGS; + u32 clobbered_regs_mask; + struct call_summary cs; u32 expected_regs_mask; - bool can_be_inlined = false; s16 off; int i; - if (bpf_helper_call(call)) { - if (get_helper_proto(env, call->imm, &fn) < 0) - /* error would be reported later */ - return; - clobbered_regs_mask = helper_fastcall_clobber_mask(fn); - can_be_inlined = fn->allow_fastcall && - (verifier_inlines_helper_call(env, call->imm) || - bpf_jit_inlines_helper_call(call->imm)); - } - - if (bpf_pseudo_kfunc_call(call)) { - struct bpf_kfunc_call_arg_meta meta; - int err; - - err = fetch_kfunc_meta(env, call, &meta, NULL); - if (err < 0) - /* error would be reported later */ - return; - - clobbered_regs_mask = kfunc_fastcall_clobber_mask(&meta); - can_be_inlined = is_fastcall_kfunc_call(&meta); - } - - if (clobbered_regs_mask == ALL_CALLER_SAVED_REGS) + if (!get_call_summary(env, call, &cs)) return; + /* A bitmask specifying which caller saved registers are clobbered + * by a call to a helper/kfunc *as if* this helper/kfunc follows + * bpf_fastcall contract: + * - includes R0 if function is non-void; + * - includes R1-R5 if corresponding parameter has is described + * in the function prototype. + */ + clobbered_regs_mask = GENMASK(cs.num_params, cs.is_void ? 1 : 0); /* e.g. if helper call clobbers r{0,1}, expect r{2,3,4,5} in the pattern */ expected_regs_mask = ~clobbered_regs_mask & ALL_CALLER_SAVED_REGS; @@ -17101,7 +17239,7 @@ static void mark_fastcall_pattern_for_call(struct bpf_verifier_env *env, * don't set 'fastcall_spills_num' for call B so that remove_fastcall_spills_fills() * does not remove spill/fill pair {4,6}. */ - if (can_be_inlined) + if (cs.fastcall) env->insn_aux_data[insn_idx].fastcall_spills_num = i - 1; else subprog->keep_fastcall_stack = 1; @@ -17183,9 +17321,20 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } - if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) - mark_subprog_changes_pkt_data(env, t); - if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { + if (bpf_helper_call(insn)) { + const struct bpf_func_proto *fp; + + ret = get_helper_proto(env, insn->imm, &fp); + /* If called in a non-sleepable context program will be + * rejected anyway, so we should end up with precise + * sleepable marks on subprogs, except for dead code + * elimination. + */ + if (ret == 0 && fp->might_sleep) + mark_subprog_might_sleep(env, t); + if (bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); + } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; ret = fetch_kfunc_meta(env, insn, &meta, NULL); @@ -17204,6 +17353,13 @@ static int visit_insn(int t, struct bpf_verifier_env *env) */ mark_force_checkpoint(env, t); } + /* Same as helpers, if called in a non-sleepable context + * program will be rejected anyway, so we should end up + * with precise sleepable marks on subprogs, except for + * dead code elimination. + */ + if (ret == 0 && is_kfunc_sleepable(&meta)) + mark_subprog_might_sleep(env, t); } return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); @@ -17246,9 +17402,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) static int check_cfg(struct bpf_verifier_env *env) { int insn_cnt = env->prog->len; - int *insn_stack, *insn_state; + int *insn_stack, *insn_state, *insn_postorder; int ex_insn_beg, i, ret = 0; - bool ex_done = false; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -17260,6 +17415,17 @@ static int check_cfg(struct bpf_verifier_env *env) return -ENOMEM; } + insn_postorder = env->cfg.insn_postorder = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_postorder) { + kvfree(insn_state); + kvfree(insn_stack); + return -ENOMEM; + } + + ex_insn_beg = env->exception_callback_subprog + ? env->subprog_info[env->exception_callback_subprog].start + : 0; + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; @@ -17273,6 +17439,7 @@ static int check_cfg(struct bpf_verifier_env *env) case DONE_EXPLORING: insn_state[t] = EXPLORED; env->cfg.cur_stack--; + insn_postorder[env->cfg.cur_postorder++] = t; break; case KEEP_EXPLORING: break; @@ -17291,13 +17458,10 @@ static int check_cfg(struct bpf_verifier_env *env) goto err_free; } - if (env->exception_callback_subprog && !ex_done) { - ex_insn_beg = env->subprog_info[env->exception_callback_subprog].start; - + if (ex_insn_beg && insn_state[ex_insn_beg] != EXPLORED) { insn_state[ex_insn_beg] = DISCOVERED; insn_stack[0] = ex_insn_beg; env->cfg.cur_stack = 1; - ex_done = true; goto walk_cfg; } @@ -17320,6 +17484,7 @@ static int check_cfg(struct bpf_verifier_env *env) } ret = 0; /* cfg looks good */ env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; + env->prog->aux->might_sleep = env->subprog_info[0].might_sleep; err_free: kvfree(insn_state); @@ -18335,15 +18500,17 @@ static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *c * the current state will reach 'bpf_exit' instruction safely */ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_state *old, - struct bpf_func_state *cur, enum exact_level exact) + struct bpf_func_state *cur, u32 insn_idx, enum exact_level exact) { - int i; + u16 live_regs = env->insn_aux_data[insn_idx].live_regs_before; + u16 i; if (old->callback_depth > cur->callback_depth) return false; for (i = 0; i < MAX_BPF_REG; i++) - if (!regsafe(env, &old->regs[i], &cur->regs[i], + if (((1 << i) & live_regs) && + !regsafe(env, &old->regs[i], &cur->regs[i], &env->idmap_scratch, exact)) return false; @@ -18364,6 +18531,7 @@ static bool states_equal(struct bpf_verifier_env *env, struct bpf_verifier_state *cur, enum exact_level exact) { + u32 insn_idx; int i; if (old->curframe != cur->curframe) @@ -18387,9 +18555,12 @@ static bool states_equal(struct bpf_verifier_env *env, * and all frame states need to be equivalent */ for (i = 0; i <= old->curframe; i++) { + insn_idx = i == old->curframe + ? env->insn_idx + : old->frame[i + 1]->callsite; if (old->frame[i]->callsite != cur->frame[i]->callsite) return false; - if (!func_states_equal(env, old->frame[i], cur->frame[i], exact)) + if (!func_states_equal(env, old->frame[i], cur->frame[i], insn_idx, exact)) return false; } return true; @@ -19121,19 +19292,13 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level & BPF_LOG_LEVEL) { - const struct bpf_insn_cbs cbs = { - .cb_call = disasm_kfunc_name, - .cb_print = verbose, - .private_data = env, - }; - if (verifier_state_scratched(env)) print_insn_state(env, state, state->curframe); verbose_linfo(env, env->insn_idx, "; "); env->prev_log_pos = env->log.end_pos; verbose(env, "%d: ", env->insn_idx); - print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); + verbose_insn(env, insn); env->prev_insn_print_pos = env->log.end_pos - env->prev_log_pos; env->prev_log_pos = env->log.end_pos; } @@ -19155,37 +19320,18 @@ static int do_check(struct bpf_verifier_env *env) return err; } else if (class == BPF_LDX) { - enum bpf_reg_type src_reg_type; + bool is_ldsx = BPF_MODE(insn->code) == BPF_MEMSX; - /* check for reserved fields is already done */ - - /* check src operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - - err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); - if (err) - return err; - - src_reg_type = regs[insn->src_reg].type; - - /* check that memory (src_reg + off) is readable, - * the state of dst_reg will be updated by this func + /* Check for reserved fields is already done in + * resolve_pseudo_ldimm64(). */ - err = check_mem_access(env, env->insn_idx, insn->src_reg, - insn->off, BPF_SIZE(insn->code), - BPF_READ, insn->dst_reg, false, - BPF_MODE(insn->code) == BPF_MEMSX); - err = err ?: save_aux_ptr_type(env, src_reg_type, true); - err = err ?: reg_bounds_sanity_check(env, ®s[insn->dst_reg], "ldx"); + err = check_load_mem(env, insn, false, is_ldsx, true, + "ldx"); if (err) return err; } else if (class == BPF_STX) { - enum bpf_reg_type dst_reg_type; - if (BPF_MODE(insn->code) == BPF_ATOMIC) { - err = check_atomic(env, env->insn_idx, insn); + err = check_atomic(env, insn); if (err) return err; env->insn_idx++; @@ -19197,25 +19343,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - /* check src1 operand */ - err = check_reg_arg(env, insn->src_reg, SRC_OP); - if (err) - return err; - /* check src2 operand */ - err = check_reg_arg(env, insn->dst_reg, SRC_OP); - if (err) - return err; - - dst_reg_type = regs[insn->dst_reg].type; - - /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, env->insn_idx, insn->dst_reg, - insn->off, BPF_SIZE(insn->code), - BPF_WRITE, insn->src_reg, false, false); - if (err) - return err; - - err = save_aux_ptr_type(env, dst_reg_type, false); + err = check_store_reg(env, insn, false); if (err) return err; } else if (class == BPF_ST) { @@ -20537,7 +20665,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) insn->code == (BPF_ST | BPF_MEM | BPF_W) || insn->code == (BPF_ST | BPF_MEM | BPF_DW)) { type = BPF_WRITE; - } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || + } else if ((insn->code == (BPF_STX | BPF_ATOMIC | BPF_B) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_H) || + insn->code == (BPF_STX | BPF_ATOMIC | BPF_W) || insn->code == (BPF_STX | BPF_ATOMIC | BPF_DW)) && env->insn_aux_data[i + delta].ptr_type == PTR_TO_ARENA) { insn->code = BPF_STX | BPF_PROBE_ATOMIC | BPF_SIZE(insn->code); @@ -20845,6 +20975,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; + func[i]->aux->might_sleep = env->subprog_info[i].might_sleep; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -21503,7 +21634,50 @@ static int do_misc_fixups(struct bpf_verifier_env *env) goto next_insn; } - if (is_may_goto_insn(insn)) { + if (is_may_goto_insn(insn) && bpf_jit_supports_timed_may_goto()) { + int stack_off_cnt = -stack_depth - 16; + + /* + * Two 8 byte slots, depth-16 stores the count, and + * depth-8 stores the start timestamp of the loop. + * + * The starting value of count is BPF_MAX_TIMED_LOOPS + * (0xffff). Every iteration loads it and subs it by 1, + * until the value becomes 0 in AX (thus, 1 in stack), + * after which we call arch_bpf_timed_may_goto, which + * either sets AX to 0xffff to keep looping, or to 0 + * upon timeout. AX is then stored into the stack. In + * the next iteration, we either see 0 and break out, or + * continue iterating until the next time value is 0 + * after subtraction, rinse and repeat. + */ + stack_depth_extra = 16; + insn_buf[0] = BPF_LDX_MEM(BPF_DW, BPF_REG_AX, BPF_REG_10, stack_off_cnt); + if (insn->off >= 0) + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off + 5); + else + insn_buf[1] = BPF_JMP_IMM(BPF_JEQ, BPF_REG_AX, 0, insn->off - 1); + insn_buf[2] = BPF_ALU64_IMM(BPF_SUB, BPF_REG_AX, 1); + insn_buf[3] = BPF_JMP_IMM(BPF_JNE, BPF_REG_AX, 0, 2); + /* + * AX is used as an argument to pass in stack_off_cnt + * (to add to r10/fp), and also as the return value of + * the call to arch_bpf_timed_may_goto. + */ + insn_buf[4] = BPF_MOV64_IMM(BPF_REG_AX, stack_off_cnt); + insn_buf[5] = BPF_EMIT_CALL(arch_bpf_timed_may_goto); + insn_buf[6] = BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_AX, stack_off_cnt); + cnt = 7; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + goto next_insn; + } else if (is_may_goto_insn(insn)) { int stack_off = -stack_depth - 8; stack_depth_extra = 8; @@ -22044,23 +22218,33 @@ static int do_misc_fixups(struct bpf_verifier_env *env) env->prog->aux->stack_depth = subprogs[0].stack_depth; for (i = 0; i < env->subprog_cnt; i++) { + int delta = bpf_jit_supports_timed_may_goto() ? 2 : 1; int subprog_start = subprogs[i].start; int stack_slots = subprogs[i].stack_extra / 8; + int slots = delta, cnt = 0; if (!stack_slots) continue; - if (stack_slots > 1) { + /* We need two slots in case timed may_goto is supported. */ + if (stack_slots > slots) { verbose(env, "verifier bug: stack_slots supports may_goto only\n"); return -EFAULT; } - /* Add ST insn to subprog prologue to init extra stack */ - insn_buf[0] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, - -subprogs[i].stack_depth, BPF_MAX_LOOPS); + stack_depth = subprogs[i].stack_depth; + if (bpf_jit_supports_timed_may_goto()) { + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_TIMED_LOOPS); + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth + 8, 0); + } else { + /* Add ST insn to subprog prologue to init extra stack */ + insn_buf[cnt++] = BPF_ST_MEM(BPF_DW, BPF_REG_FP, -stack_depth, + BPF_MAX_LOOPS); + } /* Copy first actual insn to preserve it */ - insn_buf[1] = env->prog->insnsi[subprog_start]; + insn_buf[cnt++] = env->prog->insnsi[subprog_start]; - new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, 2); + new_prog = bpf_patch_insn_data(env, subprog_start, insn_buf, cnt); if (!new_prog) return -ENOMEM; env->prog = prog = new_prog; @@ -22070,7 +22254,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) * to insn after BPF_ST that inits may_goto count. * Adjustment will succeed because bpf_patch_insn_data() didn't fail. */ - WARN_ON(adjust_jmp_off(env->prog, subprog_start, 1)); + WARN_ON(adjust_jmp_off(env->prog, subprog_start, delta)); } /* Since poke tab is now finalized, publish aux to tracker. */ @@ -22723,6 +22907,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; bool tgt_changes_pkt_data; + bool tgt_might_sleep; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22765,6 +22950,15 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension program changes packet data, while original does not\n"); return -EINVAL; } + + tgt_might_sleep = aux->func + ? aux->func[subprog]->aux->might_sleep + : aux->might_sleep; + if (prog->aux->might_sleep && !tgt_might_sleep) { + bpf_log(log, + "Extension program may sleep, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -23199,6 +23393,302 @@ static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, return 0; } +static bool can_fallthrough(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return true; + + if (opcode == BPF_EXIT || opcode == BPF_JA) + return false; + + return true; +} + +static bool can_jump(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + u8 opcode = BPF_OP(insn->code); + + if (class != BPF_JMP && class != BPF_JMP32) + return false; + + switch (opcode) { + case BPF_JA: + case BPF_JEQ: + case BPF_JNE: + case BPF_JLT: + case BPF_JLE: + case BPF_JGT: + case BPF_JGE: + case BPF_JSGT: + case BPF_JSGE: + case BPF_JSLT: + case BPF_JSLE: + case BPF_JCOND: + return true; + } + + return false; +} + +static int insn_successors(struct bpf_prog *prog, u32 idx, u32 succ[2]) +{ + struct bpf_insn *insn = &prog->insnsi[idx]; + int i = 0, insn_sz; + u32 dst; + + insn_sz = bpf_is_ldimm64(insn) ? 2 : 1; + if (can_fallthrough(insn) && idx + 1 < prog->len) + succ[i++] = idx + insn_sz; + + if (can_jump(insn)) { + dst = idx + jmp_offset(insn) + 1; + if (i == 0 || succ[0] != dst) + succ[i++] = dst; + } + + return i; +} + +/* Each field is a register bitmask */ +struct insn_live_regs { + u16 use; /* registers read by instruction */ + u16 def; /* registers written by instruction */ + u16 in; /* registers that may be alive before instruction */ + u16 out; /* registers that may be alive after instruction */ +}; + +/* Bitmask with 1s for all caller saved registers */ +#define ALL_CALLER_SAVED_REGS ((1u << CALLER_SAVED_REGS) - 1) + +/* Compute info->{use,def} fields for the instruction */ +static void compute_insn_live_regs(struct bpf_verifier_env *env, + struct bpf_insn *insn, + struct insn_live_regs *info) +{ + struct call_summary cs; + u8 class = BPF_CLASS(insn->code); + u8 code = BPF_OP(insn->code); + u8 mode = BPF_MODE(insn->code); + u16 src = BIT(insn->src_reg); + u16 dst = BIT(insn->dst_reg); + u16 r0 = BIT(0); + u16 def = 0; + u16 use = 0xffff; + + switch (class) { + case BPF_LD: + switch (mode) { + case BPF_IMM: + if (BPF_SIZE(insn->code) == BPF_DW) { + def = dst; + use = 0; + } + break; + case BPF_LD | BPF_ABS: + case BPF_LD | BPF_IND: + /* stick with defaults */ + break; + } + break; + case BPF_LDX: + switch (mode) { + case BPF_MEM: + case BPF_MEMSX: + def = dst; + use = src; + break; + } + break; + case BPF_ST: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst; + break; + } + break; + case BPF_STX: + switch (mode) { + case BPF_MEM: + def = 0; + use = dst | src; + break; + case BPF_ATOMIC: + switch (insn->imm) { + case BPF_CMPXCHG: + use = r0 | dst | src; + def = r0; + break; + case BPF_LOAD_ACQ: + def = dst; + use = src; + break; + case BPF_STORE_REL: + def = 0; + use = dst | src; + break; + default: + use = dst | src; + if (insn->imm & BPF_FETCH) + def = src; + else + def = 0; + } + break; + } + break; + case BPF_ALU: + case BPF_ALU64: + switch (code) { + case BPF_END: + use = dst; + def = dst; + break; + case BPF_MOV: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = 0; + else + use = src; + break; + default: + def = dst; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + case BPF_JMP: + case BPF_JMP32: + switch (code) { + case BPF_JA: + case BPF_JCOND: + def = 0; + use = 0; + break; + case BPF_EXIT: + def = 0; + use = r0; + break; + case BPF_CALL: + def = ALL_CALLER_SAVED_REGS; + use = def & ~BIT(BPF_REG_0); + if (get_call_summary(env, insn, &cs)) + use = GENMASK(cs.num_params, 1); + break; + default: + def = 0; + if (BPF_SRC(insn->code) == BPF_K) + use = dst; + else + use = dst | src; + } + break; + } + + info->def = def; + info->use = use; +} + +/* Compute may-live registers after each instruction in the program. + * The register is live after the instruction I if it is read by some + * instruction S following I during program execution and is not + * overwritten between I and S. + * + * Store result in env->insn_aux_data[i].live_regs. + */ +static int compute_live_registers(struct bpf_verifier_env *env) +{ + struct bpf_insn_aux_data *insn_aux = env->insn_aux_data; + struct bpf_insn *insns = env->prog->insnsi; + struct insn_live_regs *state; + int insn_cnt = env->prog->len; + int err = 0, i, j; + bool changed; + + /* Use the following algorithm: + * - define the following: + * - I.use : a set of all registers read by instruction I; + * - I.def : a set of all registers written by instruction I; + * - I.in : a set of all registers that may be alive before I execution; + * - I.out : a set of all registers that may be alive after I execution; + * - insn_successors(I): a set of instructions S that might immediately + * follow I for some program execution; + * - associate separate empty sets 'I.in' and 'I.out' with each instruction; + * - visit each instruction in a postorder and update + * state[i].in, state[i].out as follows: + * + * state[i].out = U [state[s].in for S in insn_successors(i)] + * state[i].in = (state[i].out / state[i].def) U state[i].use + * + * (where U stands for set union, / stands for set difference) + * - repeat the computation while {in,out} fields changes for + * any instruction. + */ + state = kvcalloc(insn_cnt, sizeof(*state), GFP_KERNEL); + if (!state) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < insn_cnt; ++i) + compute_insn_live_regs(env, &insns[i], &state[i]); + + changed = true; + while (changed) { + changed = false; + for (i = 0; i < env->cfg.cur_postorder; ++i) { + int insn_idx = env->cfg.insn_postorder[i]; + struct insn_live_regs *live = &state[insn_idx]; + int succ_num; + u32 succ[2]; + u16 new_out = 0; + u16 new_in = 0; + + succ_num = insn_successors(env->prog, insn_idx, succ); + for (int s = 0; s < succ_num; ++s) + new_out |= state[succ[s]].in; + new_in = (new_out & ~live->def) | live->use; + if (new_out != live->out || new_in != live->in) { + live->in = new_in; + live->out = new_out; + changed = true; + } + } + } + + for (i = 0; i < insn_cnt; ++i) + insn_aux[i].live_regs_before = state[i].in; + + if (env->log.level & BPF_LOG_LEVEL2) { + verbose(env, "Live regs before insn:\n"); + for (i = 0; i < insn_cnt; ++i) { + verbose(env, "%3d: ", i); + for (j = BPF_REG_0; j < BPF_REG_10; ++j) + if (insn_aux[i].live_regs_before & BIT(j)) + verbose(env, "%d", j); + else + verbose(env, "."); + verbose(env, " "); + verbose_insn(env, &insns[i]); + if (bpf_is_ldimm64(&insns[i])) + i++; + } + } + +out: + kvfree(state); + kvfree(env->cfg.insn_postorder); + env->cfg.insn_postorder = NULL; + env->cfg.cur_postorder = 0; + return err; +} + int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size) { u64 start_time = ktime_get_ns(); @@ -23320,6 +23810,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret) goto skip_full_check; + ret = compute_live_registers(env); + if (ret < 0) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; @@ -23458,6 +23952,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 vfree(env->insn_aux_data); kvfree(env->insn_hist); err_free_env: + kvfree(env->cfg.insn_postorder); kvfree(env); return ret; } diff --git a/lib/stackdepot.c b/lib/stackdepot.c index 245d5b4166999..73d7b50924ef6 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -591,7 +591,8 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, depot_stack_handle_t handle = 0; struct page *page = NULL; void *prealloc = NULL; - bool can_alloc = depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC; + bool allow_spin = gfpflags_allow_spinning(alloc_flags); + bool can_alloc = (depot_flags & STACK_DEPOT_FLAG_CAN_ALLOC) && allow_spin; unsigned long flags; u32 hash; @@ -630,7 +631,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, prealloc = page_address(page); } - if (in_nmi()) { + if (in_nmi() || !allow_spin) { /* We can never allocate in NMI context. */ WARN_ON_ONCE(can_alloc); /* Best effort; bail if we fail to take the lock. */ @@ -671,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries, exit: if (prealloc) { /* Stack depot didn't use this memory, free it. */ - free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); + if (!allow_spin) + free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER); + else + free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER); } if (found) handle = found->handle.handle; diff --git a/mm/internal.h b/mm/internal.h index 109ef30fee11f..10a8b4b3b86ec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1187,6 +1187,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_NOFRAGMENT 0x0 #endif #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ +#define ALLOC_TRYLOCK 0x400 /* Only use spin_trylock in allocation path */ #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ /* Flags that allow allocations below the min watermark. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4de6acb9b8ecb..385100b730128 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1739,7 +1739,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } struct memcg_stock_pcp { - local_lock_t stock_lock; + localtry_lock_t stock_lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1754,7 +1754,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE 0 }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { - .stock_lock = INIT_LOCAL_LOCK(stock_lock), + .stock_lock = INIT_LOCALTRY_LOCK(stock_lock), }; static DEFINE_MUTEX(percpu_charge_mutex); @@ -1766,6 +1766,7 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * consume_stock: Try to consume stocked charge on this cpu. * @memcg: memcg to consume from. * @nr_pages: how many pages to charge. + * @gfp_mask: allocation mask. * * The charges will only happen if @memcg matches the current cpu's memcg * stock, and at least @nr_pages are available in that stock. Failure to @@ -1773,7 +1774,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, * * returns true if successful, false otherwise. */ -static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, + gfp_t gfp_mask) { struct memcg_stock_pcp *stock; unsigned int stock_pages; @@ -1783,7 +1785,11 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + if (!gfpflags_allow_spinning(gfp_mask)) + return ret; + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); + } stock = this_cpu_ptr(&memcg_stock); stock_pages = READ_ONCE(stock->nr_pages); @@ -1792,7 +1798,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -1831,14 +1837,14 @@ static void drain_local_stock(struct work_struct *dummy) * drain_stock races is that we always operate on local CPU stock * here with IRQ disabled */ - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); old = drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -1868,9 +1874,20 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long flags; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. + */ + if (mem_cgroup_is_root(memcg)) + return; + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); + return; + } __refill_stock(memcg, nr_pages); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); } /* @@ -2213,9 +2230,13 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long pflags; retry: - if (consume_stock(memcg, nr_pages)) + if (consume_stock(memcg, nr_pages, gfp_mask)) return 0; + if (!gfpflags_allow_spinning(gfp_mask)) + /* Avoid the refill and flush of the older stock */ + batch = nr_pages; + if (!do_memsw_account() || page_counter_try_charge(&memcg->memsw, batch, &counter)) { if (page_counter_try_charge(&memcg->memory, batch, &counter)) @@ -2699,7 +2720,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); /* @@ -2752,7 +2773,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); } @@ -2762,7 +2783,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool ret = false; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { @@ -2770,7 +2791,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); return ret; } @@ -2862,7 +2883,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, unsigned long flags; unsigned int nr_pages = 0; - local_lock_irqsave(&memcg_stock.stock_lock, flags); + localtry_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ @@ -2880,7 +2901,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags); obj_cgroup_put(old); if (nr_pages) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 579789600a3c7..a0904315d4da2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -88,6 +88,9 @@ typedef int __bitwise fpi_t; */ #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) +/* Free the page without taking locks. Rely on trylock only. */ +#define FPI_TRYLOCK ((__force fpi_t)BIT(2)) + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8) @@ -1249,13 +1252,44 @@ static void split_large_buddy(struct zone *zone, struct page *page, } while (1); } +static void add_page_to_zone_llist(struct zone *zone, struct page *page, + unsigned int order) +{ + /* Remember the order */ + page->order = order; + /* Add the page to the free list */ + llist_add(&page->pcp_llist, &zone->trylock_free_pages); +} + static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, unsigned int order, fpi_t fpi_flags) { + struct llist_head *llhead; unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + add_page_to_zone_llist(zone, page, order); + return; + } + spin_lock_irqsave(&zone->lock, flags); + } + + /* The lock succeeded. Process deferred pages. */ + llhead = &zone->trylock_free_pages; + if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) { + struct llist_node *llnode; + struct page *p, *tmp; + + llnode = llist_del_all(llhead); + llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) { + unsigned int p_order = p->order; + + split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags); + __count_vm_events(PGFREE, 1 << p_order); + } + } split_large_buddy(zone, page, pfn, order, fpi_flags); spin_unlock_irqrestore(&zone->lock, flags); @@ -2307,7 +2341,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long flags; int i; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return 0; + spin_lock_irqsave(&zone->lock, flags); + } for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -2595,7 +2633,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, static void free_frozen_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, - unsigned int order) + unsigned int order, fpi_t fpi_flags) { int high, batch; int pindex; @@ -2630,6 +2668,14 @@ static void free_frozen_page_commit(struct zone *zone, } if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) pcp->free_count += (1 << order); + + if (unlikely(fpi_flags & FPI_TRYLOCK)) { + /* + * Do not attempt to take a zone lock. Let pcp->count get + * over high mark temporarily. + */ + return; + } high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2644,7 +2690,8 @@ static void free_frozen_page_commit(struct zone *zone, /* * Free a pcp page */ -void free_frozen_pages(struct page *page, unsigned int order) +static void __free_frozen_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2653,7 +2700,7 @@ void free_frozen_pages(struct page *page, unsigned int order) int migratetype; if (!pcp_allowed_order(order)) { - __free_pages_ok(page, order, FPI_NONE); + __free_pages_ok(page, order, fpi_flags); return; } @@ -2671,23 +2718,33 @@ void free_frozen_pages(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); return; } migratetype = MIGRATE_MOVABLE; } + if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT) + && (in_nmi() || in_hardirq()))) { + add_page_to_zone_llist(zone, page, order); + return; + } pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_frozen_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags); pcp_spin_unlock(pcp); } else { - free_one_page(zone, page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, fpi_flags); } pcp_trylock_finish(UP_flags); } +void free_frozen_pages(struct page *page, unsigned int order) +{ + __free_frozen_pages(page, order, FPI_NONE); +} + /* * Free a batch of folios */ @@ -2776,7 +2833,7 @@ void free_unref_folios(struct folio_batch *folios) trace_mm_page_free_batched(&folio->page); free_frozen_page_commit(zone, pcp, &folio->page, migratetype, - order); + order, FPI_NONE); } if (pcp) { @@ -2907,7 +2964,11 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, do { page = NULL; - spin_lock_irqsave(&zone->lock, flags); + if (!spin_trylock_irqsave(&zone->lock, flags)) { + if (unlikely(alloc_flags & ALLOC_TRYLOCK)) + return NULL; + spin_lock_irqsave(&zone->lock, flags); + } if (alloc_flags & ALLOC_HIGHATOMIC) page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (!page) { @@ -4511,7 +4572,12 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); - if (should_fail_alloc_page(gfp_mask, order)) + /* + * Don't invoke should_fail logic, since it may call + * get_random_u32() and printk() which need to spin_lock. + */ + if (!(*alloc_flags & ALLOC_TRYLOCK) && + should_fail_alloc_page(gfp_mask, order)) return false; *alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, *alloc_flags); @@ -4809,9 +4875,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) EXPORT_SYMBOL(get_zeroed_page_noprof); /** - * __free_pages - Free pages allocated with alloc_pages(). + * ___free_pages - Free pages allocated with alloc_pages(). * @page: The page pointer returned from alloc_pages(). * @order: The order of the allocation. + * @fpi_flags: Free Page Internal flags. * * This function can free multi-page allocations that are not compound * pages. It does not check that the @order passed in matches that of @@ -4828,22 +4895,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof); * Context: May be called in interrupt context or while holding a normal * spinlock, but not in NMI context or while holding a raw spinlock. */ -void __free_pages(struct page *page, unsigned int order) +static void ___free_pages(struct page *page, unsigned int order, + fpi_t fpi_flags) { /* get PageHead before we drop reference */ int head = PageHead(page); struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_frozen_pages(page, order); + __free_frozen_pages(page, order, fpi_flags); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_frozen_pages(page + (1 << order), order); + __free_frozen_pages(page + (1 << order), order, + fpi_flags); } } +void __free_pages(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_NONE); +} EXPORT_SYMBOL(__free_pages); +/* + * Can be called while holding raw_spin_lock or from IRQ and NMI for any + * page type (not only those that came from try_alloc_pages) + */ +void free_pages_nolock(struct page *page, unsigned int order) +{ + ___free_pages(page, order, FPI_TRYLOCK); +} + void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { @@ -7071,3 +7153,94 @@ static bool __free_unaccepted(struct page *page) } #endif /* CONFIG_UNACCEPTED_MEMORY */ + +/** + * try_alloc_pages - opportunistic reentrant allocation from any context + * @nid: node to allocate from + * @order: allocation order size + * + * Allocates pages of a given order from the given node. This is safe to + * call from any context (from atomic, NMI, and also reentrant + * allocator -> tracepoint -> try_alloc_pages_noprof). + * Allocation is best effort and to be expected to fail easily so nobody should + * rely on the success. Failures are not reported via warn_alloc(). + * See always fail conditions below. + * + * Return: allocated page or NULL on failure. + */ +struct page *try_alloc_pages_noprof(int nid, unsigned int order) +{ + /* + * Do not specify __GFP_DIRECT_RECLAIM, since direct claim is not allowed. + * Do not specify __GFP_KSWAPD_RECLAIM either, since wake up of kswapd + * is not safe in arbitrary context. + * + * These two are the conditions for gfpflags_allow_spinning() being true. + * + * Specify __GFP_NOWARN since failing try_alloc_pages() is not a reason + * to warn. Also warn would trigger printk() which is unsafe from + * various contexts. We cannot use printk_deferred_enter() to mitigate, + * since the running context is unknown. + * + * Specify __GFP_ZERO to make sure that call to kmsan_alloc_page() below + * is safe in any context. Also zeroing the page is mandatory for + * BPF use cases. + * + * Though __GFP_NOMEMALLOC is not checked in the code path below, + * specify it here to highlight that try_alloc_pages() + * doesn't want to deplete reserves. + */ + gfp_t alloc_gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_NOMEMALLOC + | __GFP_ACCOUNT; + unsigned int alloc_flags = ALLOC_TRYLOCK; + struct alloc_context ac = { }; + struct page *page; + + /* + * In PREEMPT_RT spin_trylock() will call raw_spin_lock() which is + * unsafe in NMI. If spin_trylock() is called from hard IRQ the current + * task may be waiting for one rt_spin_lock, but rt_spin_trylock() will + * mark the task as the owner of another rt_spin_lock which will + * confuse PI logic, so return immediately if called form hard IRQ or + * NMI. + * + * Note, irqs_disabled() case is ok. This function can be called + * from raw_spin_lock_irqsave region. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) + return NULL; + if (!pcp_allowed_order(order)) + return NULL; + +#ifdef CONFIG_UNACCEPTED_MEMORY + /* Bailout, since try_to_accept_memory_one() needs to take a lock */ + if (has_unaccepted_memory()) + return NULL; +#endif + /* Bailout, since _deferred_grow_zone() needs to take a lock */ + if (deferred_pages_enabled()) + return NULL; + + if (nid == NUMA_NO_NODE) + nid = numa_node_id(); + + prepare_alloc_pages(alloc_gfp, order, nid, NULL, &ac, + &alloc_gfp, &alloc_flags); + + /* + * Best effort allocation from percpu free list. + * If it's empty attempt to spin_trylock zone->lock. + */ + page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); + + /* Unlike regular alloc_pages() there is no __alloc_pages_slowpath(). */ + + if (memcg_kmem_online() && page && + unlikely(__memcg_kmem_charge_page(page, alloc_gfp, order) != 0)) { + free_pages_nolock(page, order); + page = NULL; + } + trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); + return page; +} diff --git a/mm/page_owner.c b/mm/page_owner.c index 2d6360eaccbb6..90e31d0e3ed78 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -294,7 +294,13 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner = get_page_owner(page_ext); alloc_handle = page_owner->handle; - handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + /* + * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false + * to prevent issues in stack_depot_save(). + * This is similar to try_alloc_pages() gfp flags, but only used + * to signal stack_depot to avoid spin_locks. + */ + handle = save_stack(__GFP_NOWARN); __update_page_owner_free_handle(page_ext, handle, order, current->pid, current->tgid, free_ts_nsec); page_ext_put(page_ext); diff --git a/net/core/filter.c b/net/core/filter.c index 827108c6dad9f..dcc53ac5c5458 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9637,7 +9637,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ @@ -9646,7 +9646,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e182..bc9f58172790f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -534,6 +534,12 @@ struct sock *__inet_lookup_established(const struct net *net, } EXPORT_SYMBOL_GPL(__inet_lookup_established); +static inline __s64 inet_hashinfo_next_idx(struct inet_hashinfo *hinfo, + bool pos) +{ + return (pos ? 1 : -1) * atomic64_inc_return(&hinfo->ver); +} + /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, @@ -581,6 +587,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); + sk->sk_idx = inet_hashinfo_next_idx(hinfo, false); if (tw) { sk_nulls_del_node_init_rcu((struct sock *)tw); __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); @@ -678,8 +685,10 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) ret = false; } - if (ret) + if (ret) { __sk_nulls_add_node_rcu(sk, list); + sk->sk_idx = inet_hashinfo_next_idx(hashinfo, false); + } spin_unlock(lock); @@ -729,6 +738,7 @@ int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; + bool add_tail; int err = 0; if (sk->sk_state != TCP_LISTEN) { @@ -747,11 +757,13 @@ int __inet_hash(struct sock *sk, struct sock *osk) goto unlock; } sock_set_flag(sk, SOCK_RCU_FREE); - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) + add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6; + if (add_tail) __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); else __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); + sk->sk_idx = inet_hashinfo_next_idx(hashinfo, add_tail); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 285678d8ce077..63693af0c05c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5147,6 +5147,7 @@ void __init tcp_init(void) cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; + atomic64_set(&tcp_hashinfo.ver, 0); tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2632844d2c356..d0ddb307e2a13 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2602,7 +2602,7 @@ static void *listening_get_first(struct seq_file *seq) struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; - st->offset = 0; + st->prev_idx = 0; for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; @@ -2637,7 +2637,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct sock *sk = cur; ++st->num; - ++st->offset; + st->prev_idx = sk->sk_idx; sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { @@ -2658,7 +2658,6 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) void *rc; st->bucket = 0; - st->offset = 0; rc = listening_get_first(seq); while (rc && *pos) { @@ -2683,7 +2682,7 @@ static void *established_get_first(struct seq_file *seq) struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; - st->offset = 0; + st->prev_idx = 0; for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; @@ -2714,7 +2713,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) struct sock *sk = cur; ++st->num; - ++st->offset; + st->prev_idx = sk->sk_idx; sk = sk_nulls_next(sk); @@ -2763,8 +2762,8 @@ static void *tcp_seek_last_pos(struct seq_file *seq) { struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; + __s64 prev_idx = st->prev_idx; int bucket = st->bucket; - int offset = st->offset; int orig_num = st->num; void *rc = NULL; @@ -2773,18 +2772,21 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > hinfo->lhash2_mask) break; rc = listening_get_first(seq); - while (offset-- && rc && bucket == st->bucket) + while (rc && bucket == st->bucket && prev_idx && + ((struct sock *)rc)->sk_idx <= prev_idx) rc = listening_get_next(seq, rc); if (rc) break; st->bucket = 0; + prev_idx = 0; st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: if (st->bucket > hinfo->ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc && bucket == st->bucket) + while (rc && bucket == st->bucket && prev_idx && + ((struct sock *)rc)->sk_idx <= prev_idx) rc = established_get_next(seq, rc); } @@ -2807,7 +2809,7 @@ void *tcp_seq_start(struct seq_file *seq, loff_t *pos) st->state = TCP_SEQ_STATE_LISTENING; st->num = 0; st->bucket = 0; - st->offset = 0; + st->prev_idx = 0; rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; out: @@ -2832,7 +2834,7 @@ void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!rc) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; - st->offset = 0; + st->prev_idx = 0; rc = established_get_first(seq); } break; @@ -3124,7 +3126,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) * it has to advance to the next bucket. */ if (iter->st_bucket_done) { - st->offset = 0; + st->prev_idx = 0; st->bucket++; if (st->state == TCP_SEQ_STATE_LISTENING && st->bucket > hinfo->lhash2_mask) { @@ -3192,8 +3194,9 @@ static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * the future start() will resume at st->offset in * st->bucket. See tcp_seek_last_pos(). */ - st->offset++; - sock_gen_put(iter->batch[iter->cur_sk++]); + sk = iter->batch[iter->cur_sk++]; + st->prev_idx = sk->sk_idx; + sock_gen_put(sk); } if (iter->cur_sk < iter->end_sk) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a9bb9ce5438ea..d7e9b33469838 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -229,6 +229,11 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } +static inline __s64 udp_table_next_idx(struct udp_table *udptable, bool pos) +{ + return (pos ? 1 : -1) * atomic64_inc_return(&udptable->ver); +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * @@ -244,6 +249,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE; + bool add_tail; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); @@ -335,14 +341,16 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); - if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) + add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && + sk->sk_family == AF_INET6; + if (add_tail) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; + sk->sk_idx = udp_table_next_idx(udptable, add_tail); spin_unlock(&hslot2->lock); } @@ -2250,6 +2258,8 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; + sk->sk_idx = udp_table_next_idx(udptable, + false); spin_unlock(&nhslot2->lock); } @@ -3390,9 +3400,9 @@ struct bpf_udp_iter_state { unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; - int offset; struct sock **batch; bool st_bucket_done; + __s64 prev_idx; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, @@ -3402,14 +3412,13 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; struct net *net = seq_file_net(seq); - int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; bool resized = false; + int resume_bucket; struct sock *sk; resume_bucket = state->bucket; - resume_offset = iter->offset; /* The current batch is done, so advance the bucket. */ if (iter->st_bucket_done) @@ -3436,18 +3445,19 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq) if (hlist_empty(&hslot2->head)) continue; - iter->offset = 0; spin_lock_bh(&hslot2->lock); + /* Reset prev_idx if this is a new bucket. */ + if (!resume_bucket || state->bucket != resume_bucket) + iter->prev_idx = 0; udp_portaddr_for_each_entry(sk, &hslot2->head) { if (seq_sk_match(seq, sk)) { - /* Resume from the last iterated socket at the - * offset in the bucket before iterator was stopped. + /* Resume from the first socket that we didn't + * see last time around. */ if (state->bucket == resume_bucket && - iter->offset < resume_offset) { - ++iter->offset; + iter->prev_idx && + sk->sk_idx <= iter->prev_idx) continue; - } if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; @@ -3492,8 +3502,9 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) { - sock_put(iter->batch[iter->cur_sk++]); - ++iter->offset; + sk = iter->batch[iter->cur_sk++]; + iter->prev_idx = sk->sk_idx; + sock_put(sk); } /* After updating iter->cur_sk, check if there are more sockets @@ -3740,6 +3751,7 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); + atomic64_set(&udptable->ver, 0); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); diff --git a/security/security.c b/security/security.c index 143561ebc3e89..c369481232585 100644 --- a/security/security.c +++ b/security/security.c @@ -5627,6 +5627,7 @@ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, * @cmd: command * @attr: bpf attribute * @size: size + * @kernel: whether or not call originated from kernel * * Do a initial check for all bpf syscalls after the attribute is copied into * the kernel. The actual security module can implement their own rules to @@ -5634,9 +5635,9 @@ int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, * * Return: Returns 0 if permission is granted. */ -int security_bpf(int cmd, union bpf_attr *attr, unsigned int size) +int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { - return call_int_hook(bpf, cmd, attr, size); + return call_int_hook(bpf, cmd, attr, size, kernel); } /** @@ -5673,6 +5674,7 @@ int security_bpf_prog(struct bpf_prog *prog) * @map: BPF map object * @attr: BPF syscall attributes used to create BPF map * @token: BPF token used to grant user access + * @kernel: whether or not call originated from kernel * * Do a check when the kernel creates a new BPF map. This is also the * point where LSM blob is allocated for LSMs that need them. @@ -5680,9 +5682,9 @@ int security_bpf_prog(struct bpf_prog *prog) * Return: Returns 0 on success, error on failure. */ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_map_create, map, attr, token); + return call_int_hook(bpf_map_create, map, attr, token, kernel); } /** @@ -5690,6 +5692,7 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, * @prog: BPF program object * @attr: BPF syscall attributes used to create BPF program * @token: BPF token used to grant user access to BPF subsystem + * @kernel: whether or not call originated from kernel * * Perform an access control check when the kernel loads a BPF program and * allocates associated BPF program object. This hook is also responsible for @@ -5698,9 +5701,9 @@ int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, * Return: Returns 0 on success, error on failure. */ int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { - return call_int_hook(bpf_prog_load, prog, attr, token); + return call_int_hook(bpf_prog_load, prog, attr, token, kernel); } /** diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 7b867dfec88ba..71199d86fc97f 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -6866,7 +6866,7 @@ static int selinux_ib_alloc_security(void *ib_sec) #ifdef CONFIG_BPF_SYSCALL static int selinux_bpf(int cmd, union bpf_attr *attr, - unsigned int size) + unsigned int size, bool kernel) { u32 sid = current_sid(); int ret; @@ -6953,7 +6953,7 @@ static int selinux_bpf_prog(struct bpf_prog *prog) } static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; @@ -6976,7 +6976,7 @@ static void selinux_bpf_map_free(struct bpf_map *map) } static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr, - struct bpf_token *token) + struct bpf_token *token, bool kernel) { struct bpf_security_struct *bpfsec; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index e71be67f1d865..52ffb74ae4e89 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -1928,6 +1928,7 @@ static int do_loader(int argc, char **argv) obj = bpf_object__open_file(file, &open_opts); if (!obj) { + err = -1; p_err("failed to open object file"); goto err_close_obj; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index fff6cdb8d11a2..bb37897c03939 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -51,6 +51,9 @@ #define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ #define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ +#define BPF_LOAD_ACQ 0x100 /* load-acquire */ +#define BPF_STORE_REL 0x110 /* store-release */ + enum bpf_cond_pseudo_jmp { BPF_MAY_GOTO = 0, }; @@ -1207,6 +1210,7 @@ enum bpf_perf_event_type { #define BPF_F_BEFORE (1U << 3) #define BPF_F_AFTER (1U << 4) #define BPF_F_ID (1U << 5) +#define BPF_F_PREORDER (1U << 6) #define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 899e98225f3b6..8e32286854ef3 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -670,11 +670,18 @@ struct elf_state { struct usdt_manager; +enum bpf_object_state { + OBJ_OPEN, + OBJ_PREPARED, + OBJ_LOADED, +}; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; __u32 kern_version; + enum bpf_object_state state; struct bpf_program *programs; size_t nr_programs; struct bpf_map *maps; @@ -686,7 +693,6 @@ struct bpf_object { int nr_extern; int kconfig_map_idx; - bool loaded; bool has_subcalls; bool has_rodata; @@ -1511,7 +1517,7 @@ static struct bpf_object *bpf_object__new(const char *path, obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); - obj->loaded = false; + obj->state = OBJ_OPEN; return obj; } @@ -4845,6 +4851,11 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) return 0; } +static bool map_is_created(const struct bpf_map *map) +{ + return map->obj->state >= OBJ_PREPARED || map->reused; +} + bool bpf_map__autocreate(const struct bpf_map *map) { return map->autocreate; @@ -4852,7 +4863,7 @@ bool bpf_map__autocreate(const struct bpf_map *map) int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) { - if (map->obj->loaded) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->autocreate = autocreate; @@ -4946,7 +4957,7 @@ struct bpf_map *bpf_map__inner_map(struct bpf_map *map) int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { - if (map->obj->loaded) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.max_entries = max_entries; @@ -5191,11 +5202,6 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); -static bool map_is_created(const struct bpf_map *map) -{ - return map->obj->loaded || map->reused; -} - static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { LIBBPF_OPTS(bpf_map_create_opts, create_attr); @@ -7895,13 +7901,6 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) size_t i; int err; - for (i = 0; i < obj->nr_programs; i++) { - prog = &obj->programs[i]; - err = bpf_object__sanitize_prog(obj, prog); - if (err) - return err; - } - for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) @@ -7927,6 +7926,21 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) return 0; } +static int bpf_object_prepare_progs(struct bpf_object *obj) +{ + struct bpf_program *prog; + size_t i; + int err; + + for (i = 0; i < obj->nr_programs; i++) { + prog = &obj->programs[i]; + err = bpf_object__sanitize_prog(obj, prog); + if (err) + return err; + } + return 0; +} + static const struct bpf_sec_def *find_sec_def(const char *sec_name); static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object_open_opts *opts) @@ -8543,14 +8557,77 @@ static int bpf_object_prepare_struct_ops(struct bpf_object *obj) return 0; } +static void bpf_object_unpin(struct bpf_object *obj) +{ + int i; + + /* unpin any maps that were auto-pinned during load */ + for (i = 0; i < obj->nr_maps; i++) + if (obj->maps[i].pinned && !obj->maps[i].reused) + bpf_map__unpin(&obj->maps[i], NULL); +} + +static void bpf_object_post_load_cleanup(struct bpf_object *obj) +{ + int i; + + /* clean up fd_array */ + zfree(&obj->fd_array); + + /* clean up module BTFs */ + for (i = 0; i < obj->btf_module_cnt; i++) { + close(obj->btf_modules[i].fd); + btf__free(obj->btf_modules[i].btf); + free(obj->btf_modules[i].name); + } + obj->btf_module_cnt = 0; + zfree(&obj->btf_modules); + + /* clean up vmlinux BTF */ + btf__free(obj->btf_vmlinux); + obj->btf_vmlinux = NULL; +} + +static int bpf_object_prepare(struct bpf_object *obj, const char *target_btf_path) +{ + int err; + + if (obj->state >= OBJ_PREPARED) { + pr_warn("object '%s': prepare loading can't be attempted twice\n", obj->name); + return -EINVAL; + } + + err = bpf_object_prepare_token(obj); + err = err ? : bpf_object__probe_loading(obj); + err = err ? : bpf_object__load_vmlinux_btf(obj, false); + err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); + err = err ? : bpf_object__sanitize_maps(obj); + err = err ? : bpf_object__init_kern_struct_ops_maps(obj); + err = err ? : bpf_object_adjust_struct_ops_autoload(obj); + err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object_prepare_progs(obj); + + if (err) { + bpf_object_unpin(obj); + bpf_object_unload(obj); + obj->state = OBJ_LOADED; + return err; + } + + obj->state = OBJ_PREPARED; + return 0; +} + static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) { - int err, i; + int err; if (!obj) return libbpf_err(-EINVAL); - if (obj->loaded) { + if (obj->state >= OBJ_LOADED) { pr_warn("object '%s': load can't be attempted twice\n", obj->name); return libbpf_err(-EINVAL); } @@ -8565,17 +8642,12 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch return libbpf_err(-LIBBPF_ERRNO__ENDIAN); } - err = bpf_object_prepare_token(obj); - err = err ? : bpf_object__probe_loading(obj); - err = err ? : bpf_object__load_vmlinux_btf(obj, false); - err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); - err = err ? : bpf_object__sanitize_maps(obj); - err = err ? : bpf_object__init_kern_struct_ops_maps(obj); - err = err ? : bpf_object_adjust_struct_ops_autoload(obj); - err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); - err = err ? : bpf_object__sanitize_and_load_btf(obj); - err = err ? : bpf_object__create_maps(obj); - err = err ? : bpf_object__load_progs(obj, extra_log_level); + if (obj->state < OBJ_PREPARED) { + err = bpf_object_prepare(obj, target_btf_path); + if (err) + return libbpf_err(err); + } + err = bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); err = err ? : bpf_object_prepare_struct_ops(obj); @@ -8587,36 +8659,22 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } - /* clean up fd_array */ - zfree(&obj->fd_array); + bpf_object_post_load_cleanup(obj); + obj->state = OBJ_LOADED; /* doesn't matter if successfully or not */ - /* clean up module BTFs */ - for (i = 0; i < obj->btf_module_cnt; i++) { - close(obj->btf_modules[i].fd); - btf__free(obj->btf_modules[i].btf); - free(obj->btf_modules[i].name); + if (err) { + bpf_object_unpin(obj); + bpf_object_unload(obj); + pr_warn("failed to load object '%s'\n", obj->path); + return libbpf_err(err); } - free(obj->btf_modules); - - /* clean up vmlinux BTF */ - btf__free(obj->btf_vmlinux); - obj->btf_vmlinux = NULL; - - obj->loaded = true; /* doesn't matter if successfully or not */ - - if (err) - goto out; return 0; -out: - /* unpin any maps that were auto-pinned during load */ - for (i = 0; i < obj->nr_maps; i++) - if (obj->maps[i].pinned && !obj->maps[i].reused) - bpf_map__unpin(&obj->maps[i], NULL); +} - bpf_object_unload(obj); - pr_warn("failed to load object '%s'\n", obj->path); - return libbpf_err(err); +int bpf_object__prepare(struct bpf_object *obj) +{ + return libbpf_err(bpf_object_prepare(obj, NULL)); } int bpf_object__load(struct bpf_object *obj) @@ -8866,7 +8924,7 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) if (!obj) return libbpf_err(-ENOENT); - if (!obj->loaded) { + if (obj->state < OBJ_PREPARED) { pr_warn("object not yet loaded; load it first\n"); return libbpf_err(-ENOENT); } @@ -8945,7 +9003,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) if (!obj) return libbpf_err(-ENOENT); - if (!obj->loaded) { + if (obj->state < OBJ_LOADED) { pr_warn("object not yet loaded; load it first\n"); return libbpf_err(-ENOENT); } @@ -9064,6 +9122,13 @@ void bpf_object__close(struct bpf_object *obj) if (IS_ERR_OR_NULL(obj)) return; + /* + * if user called bpf_object__prepare() without ever getting to + * bpf_object__load(), we need to clean up stuff that is normally + * cleaned up at the end of loading step + */ + bpf_object_post_load_cleanup(obj); + usdt_manager_free(obj->usdt_man); obj->usdt_man = NULL; @@ -9132,7 +9197,7 @@ int bpf_object__btf_fd(const struct bpf_object *obj) int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) { - if (obj->loaded) + if (obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); obj->kern_version = kern_version; @@ -9229,7 +9294,7 @@ bool bpf_program__autoload(const struct bpf_program *prog) int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); prog->autoload = autoload; @@ -9261,7 +9326,7 @@ int bpf_program__set_insns(struct bpf_program *prog, { struct bpf_insn *insns; - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); @@ -9304,7 +9369,7 @@ static int last_custom_sec_def_handler_id; int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); /* if type is not changed, do nothing */ @@ -9335,7 +9400,7 @@ enum bpf_attach_type bpf_program__expected_attach_type(const struct bpf_program int bpf_program__set_expected_attach_type(struct bpf_program *prog, enum bpf_attach_type type) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->expected_attach_type = type; @@ -9349,7 +9414,7 @@ __u32 bpf_program__flags(const struct bpf_program *prog) int bpf_program__set_flags(struct bpf_program *prog, __u32 flags) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->prog_flags = flags; @@ -9363,7 +9428,7 @@ __u32 bpf_program__log_level(const struct bpf_program *prog) int bpf_program__set_log_level(struct bpf_program *prog, __u32 log_level) { - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->log_level = log_level; @@ -9382,7 +9447,7 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log return libbpf_err(-EINVAL); if (prog->log_size > UINT_MAX) return libbpf_err(-EINVAL); - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EBUSY); prog->log_buf = log_buf; @@ -10299,7 +10364,7 @@ static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) int bpf_map__set_value_size(struct bpf_map *map, __u32 size) { - if (map->obj->loaded || map->reused) + if (map_is_created(map)) return libbpf_err(-EBUSY); if (map->mmaped) { @@ -10345,7 +10410,7 @@ int bpf_map__set_initial_value(struct bpf_map *map, { size_t actual_sz; - if (map->obj->loaded || map->reused) + if (map_is_created(map)) return libbpf_err(-EBUSY); if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) @@ -13666,7 +13731,7 @@ int bpf_program__set_attach_target(struct bpf_program *prog, if (!prog || attach_prog_fd < 0) return libbpf_err(-EINVAL); - if (prog->obj->loaded) + if (prog->obj->state >= OBJ_LOADED) return libbpf_err(-EINVAL); if (attach_prog_fd && !attach_func_name) { diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 3020ee45303a0..e0605403f9773 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -241,6 +241,19 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); +/** + * @brief **bpf_object__prepare()** prepares BPF object for loading: + * performs ELF processing, relocations, prepares final state of BPF program + * instructions (accessible with bpf_program__insns()), creates and + * (potentially) pins maps. Leaves BPF object in the state ready for program + * loading. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** API + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ +int bpf_object__prepare(struct bpf_object *obj); + /** * @brief **bpf_object__load()** loads BPF object into kernel. * @param obj Pointer to a valid BPF object instance returned by diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index b5a838de6f47c..d8b71f22f197c 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -436,6 +436,7 @@ LIBBPF_1.6.0 { bpf_linker__add_buf; bpf_linker__add_fd; bpf_linker__new_fd; + bpf_object__prepare; btf__add_decl_attr; btf__add_type_attr; } LIBBPF_1.5.0; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index abfb450c26bb3..ca41d47d4ba6a 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -95,17 +95,14 @@ TEST_GEN_PROGS += test_progs-cpuv4 TEST_INST_SUBDIRS += cpuv4 endif -TEST_GEN_FILES = test_lwt_ip_encap.bpf.o test_tc_edt.bpf.o +TEST_GEN_FILES = test_tc_edt.bpf.o TEST_FILES = xsk_prereqs.sh $(wildcard progs/btf_dump_test_case_*.c) # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ - test_tunnel.sh \ - test_lwt_seg6local.sh \ test_lirc_mode2.sh \ test_xdp_vlan_mode_generic.sh \ test_xdp_vlan_mode_native.sh \ - test_lwt_ip_encap.sh \ test_tc_tunnel.sh \ test_tc_edt.sh \ test_xdping.sh \ @@ -183,7 +180,7 @@ ifeq ($(feature-llvm),1) # both llvm-config and lib.mk add -D_GNU_SOURCE, which ends up as conflict LLVM_CFLAGS += $(filter-out -D_GNU_SOURCE,$(shell $(LLVM_CONFIG) --cflags)) # Prefer linking statically if it's available, otherwise fallback to shared - ifeq ($(shell $(LLVM_CONFIG) --link-static --libs &> /dev/null && echo static),static) + ifeq ($(shell $(LLVM_CONFIG) --link-static --libs >/dev/null 2>&1 && echo static),static) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += $(shell $(LLVM_CONFIG) --link-static --system-libs $(LLVM_CONFIG_LIB_COMPONENTS)) LLVM_LDLIBS += -lstdc++ diff --git a/tools/testing/selftests/bpf/bpf_arena_spin_lock.h b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h new file mode 100644 index 0000000000000..fb8dc07689998 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_arena_spin_lock.h @@ -0,0 +1,533 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ARENA_SPIN_LOCK_H +#define BPF_ARENA_SPIN_LOCK_H + +#include +#include +#include "bpf_atomic.h" + +#define arch_mcs_spin_lock_contended_label(l, label) smp_cond_load_acquire_label(l, VAL, label) +#define arch_mcs_spin_unlock_contended(l) smp_store_release((l), 1) + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + +#define EBUSY 16 +#define EOPNOTSUPP 95 +#define ETIMEDOUT 110 + +#ifndef __arena +#define __arena __attribute__((address_space(1))) +#endif + +extern unsigned long CONFIG_NR_CPUS __kconfig; + +/* + * Typically, we'd just rely on the definition in vmlinux.h for qspinlock, but + * PowerPC overrides the definition to define lock->val as u32 instead of + * atomic_t, leading to compilation errors. Import a local definition below so + * that we don't depend on the vmlinux.h version. + */ + +struct __qspinlock { + union { + atomic_t val; + struct { + u8 locked; + u8 pending; + }; + struct { + u16 locked_pending; + u16 tail; + }; + }; +}; + +#define arena_spinlock_t struct __qspinlock +/* FIXME: Using typedef causes CO-RE relocation error */ +/* typedef struct qspinlock arena_spinlock_t; */ + +struct arena_mcs_spinlock { + struct arena_mcs_spinlock __arena *next; + int locked; + int count; +}; + +struct arena_qnode { + struct arena_mcs_spinlock mcs; +}; + +#define _Q_MAX_NODES 4 +#define _Q_PENDING_LOOPS 1 + +/* + * Bitfields in the atomic value: + * + * 0- 7: locked byte + * 8: pending + * 9-15: not used + * 16-17: tail index + * 18-31: tail cpu (+1) + */ +#define _Q_MAX_CPUS 1024 + +#define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1)\ + << _Q_ ## type ## _OFFSET) +#define _Q_LOCKED_OFFSET 0 +#define _Q_LOCKED_BITS 8 +#define _Q_LOCKED_MASK _Q_SET_MASK(LOCKED) + +#define _Q_PENDING_OFFSET (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS) +#define _Q_PENDING_BITS 8 +#define _Q_PENDING_MASK _Q_SET_MASK(PENDING) + +#define _Q_TAIL_IDX_OFFSET (_Q_PENDING_OFFSET + _Q_PENDING_BITS) +#define _Q_TAIL_IDX_BITS 2 +#define _Q_TAIL_IDX_MASK _Q_SET_MASK(TAIL_IDX) + +#define _Q_TAIL_CPU_OFFSET (_Q_TAIL_IDX_OFFSET + _Q_TAIL_IDX_BITS) +#define _Q_TAIL_CPU_BITS (32 - _Q_TAIL_CPU_OFFSET) +#define _Q_TAIL_CPU_MASK _Q_SET_MASK(TAIL_CPU) + +#define _Q_TAIL_OFFSET _Q_TAIL_IDX_OFFSET +#define _Q_TAIL_MASK (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK) + +#define _Q_LOCKED_VAL (1U << _Q_LOCKED_OFFSET) +#define _Q_PENDING_VAL (1U << _Q_PENDING_OFFSET) + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +struct arena_qnode __arena qnodes[_Q_MAX_CPUS][_Q_MAX_NODES]; + +static inline u32 encode_tail(int cpu, int idx) +{ + u32 tail; + + tail = (cpu + 1) << _Q_TAIL_CPU_OFFSET; + tail |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */ + + return tail; +} + +static inline struct arena_mcs_spinlock __arena *decode_tail(u32 tail) +{ + u32 cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; + u32 idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; + + return &qnodes[cpu][idx].mcs; +} + +static inline +struct arena_mcs_spinlock __arena *grab_mcs_node(struct arena_mcs_spinlock __arena *base, int idx) +{ + return &((struct arena_qnode __arena *)base + idx)->mcs; +} + +#define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK) + +/** + * xchg_tail - Put in the new queue tail code word & retrieve previous one + * @lock : Pointer to queued spinlock structure + * @tail : The new queue tail code word + * Return: The previous queue tail code word + * + * xchg(lock, tail) + * + * p,*,* -> n,*,* ; prev = xchg(lock, node) + */ +static __always_inline u32 xchg_tail(arena_spinlock_t __arena *lock, u32 tail) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = (old & _Q_LOCKED_PENDING_MASK) | tail; + /* + * We can use relaxed semantics since the caller ensures that + * the MCS node is properly initialized before updating the + * tail. + */ + /* These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_relaxed(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * clear_pending - clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,* -> *,0,* + */ +static __always_inline void clear_pending(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->pending, 0); +} + +/** + * clear_pending_set_locked - take ownership and clear the pending bit. + * @lock: Pointer to queued spinlock structure + * + * *,1,0 -> *,0,1 + * + * Lock stealing is not allowed if this function is used. + */ +static __always_inline void clear_pending_set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL); +} + +/** + * set_locked - Set the lock bit and own the lock + * @lock: Pointer to queued spinlock structure + * + * *,*,0 -> *,0,1 + */ +static __always_inline void set_locked(arena_spinlock_t __arena *lock) +{ + WRITE_ONCE(lock->locked, _Q_LOCKED_VAL); +} + +static __always_inline +u32 arena_fetch_set_pending_acquire(arena_spinlock_t __arena *lock) +{ + u32 old, new; + + old = atomic_read(&lock->val); + do { + new = old | _Q_PENDING_VAL; + /* + * These loops are not expected to stall, but we still need to + * prove to the verifier they will terminate eventually. + */ + cond_break_label(out); + } while (!atomic_try_cmpxchg_acquire(&lock->val, &old, new)); + + return old; +out: + bpf_printk("RUNTIME ERROR: %s unexpected cond_break exit!!!", __func__); + return old; +} + +/** + * arena_spin_trylock - try to acquire the queued spinlock + * @lock : Pointer to queued spinlock structure + * Return: 1 if lock acquired, 0 if failed + */ +static __always_inline int arena_spin_trylock(arena_spinlock_t __arena *lock) +{ + int val = atomic_read(&lock->val); + + if (unlikely(val)) + return 0; + + return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); +} + +__noinline +int arena_spin_lock_slowpath(arena_spinlock_t __arena __arg_arena *lock, u32 val) +{ + struct arena_mcs_spinlock __arena *prev, *next, *node0, *node; + int ret = -ETIMEDOUT; + u32 old, tail; + int idx; + + /* + * Wait for in-progress pending->locked hand-overs with a bounded + * number of spins so that we guarantee forward progress. + * + * 0,1,0 -> 0,0,1 + */ + if (val == _Q_PENDING_VAL) { + int cnt = _Q_PENDING_LOOPS; + val = atomic_cond_read_relaxed_label(&lock->val, + (VAL != _Q_PENDING_VAL) || !cnt--, + release_err); + } + + /* + * If we observe any contention; queue. + */ + if (val & ~_Q_LOCKED_MASK) + goto queue; + + /* + * trylock || pending + * + * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock + */ + val = arena_fetch_set_pending_acquire(lock); + + /* + * If we observe contention, there is a concurrent locker. + * + * Undo and queue; our setting of PENDING might have made the + * n,0,0 -> 0,0,0 transition fail and it will now be waiting + * on @next to become !NULL. + */ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + + goto queue; + } + + /* + * We're pending, wait for the owner to go away. + * + * 0,1,1 -> *,1,0 + * + * this wait loop must be a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because not all + * clear_pending_set_locked() implementations imply full + * barriers. + */ + if (val & _Q_LOCKED_MASK) + smp_cond_load_acquire_label(&lock->locked, !VAL, release_err); + + /* + * take ownership and clear the pending bit. + * + * 0,1,0 -> 0,0,1 + */ + clear_pending_set_locked(lock); + return 0; + + /* + * End of pending bit optimistic spinning and beginning of MCS + * queuing. + */ +queue: + node0 = &(qnodes[bpf_get_smp_processor_id()])[0].mcs; + idx = node0->count++; + tail = encode_tail(bpf_get_smp_processor_id(), idx); + + /* + * 4 nodes are allocated based on the assumption that there will not be + * nested NMIs taking spinlocks. That may not be true in some + * architectures even though the chance of needing more than 4 nodes + * will still be extremely unlikely. When that happens, we simply return + * an error. Original qspinlock has a trylock fallback in this case. + */ + if (unlikely(idx >= _Q_MAX_NODES)) { + ret = -EBUSY; + goto release_node_err; + } + + node = grab_mcs_node(node0, idx); + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + + node->locked = 0; + node->next = NULL; + + /* + * We touched a (possibly) cold cacheline in the per-cpu queue node; + * attempt the trylock once more in the hope someone let go while we + * weren't watching. + */ + if (arena_spin_trylock(lock)) + goto release; + + /* + * Ensure that the initialisation of @node is complete before we + * publish the updated tail via xchg_tail() and potentially link + * @node into the waitqueue via WRITE_ONCE(prev->next, node) below. + */ + smp_wmb(); + + /* + * Publish the updated tail. + * We have already touched the queueing cacheline; don't bother with + * pending stuff. + * + * p,*,* -> n,*,* + */ + old = xchg_tail(lock, tail); + next = NULL; + + /* + * if there was a previous node; link it and wait until reaching the + * head of the waitqueue. + */ + if (old & _Q_TAIL_MASK) { + prev = decode_tail(old); + + /* Link @node into the waitqueue. */ + WRITE_ONCE(prev->next, node); + + arch_mcs_spin_lock_contended_label(&node->locked, release_node_err); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We cannot prefetch here + * due to lack of equivalent instruction in BPF ISA. + */ + next = READ_ONCE(node->next); + } + + /* + * we're at the head of the waitqueue, wait for the owner & pending to + * go away. + * + * *,x,y -> *,0,0 + * + * this wait loop must use a load-acquire such that we match the + * store-release that clears the locked bit and create lock + * sequentiality; this is because the set_locked() function below + * does not imply a full barrier. + */ + val = atomic_cond_read_acquire_label(&lock->val, !(VAL & _Q_LOCKED_PENDING_MASK), + release_node_err); + + /* + * claim the lock: + * + * n,0,0 -> 0,0,1 : lock, uncontended + * *,*,0 -> *,*,1 : lock, contended + * + * If the queue head is the only one in the queue (lock value == tail) + * and nobody is pending, clear the tail code and grab the lock. + * Otherwise, we only need to grab the lock. + */ + + /* + * In the PV case we might already have _Q_LOCKED_VAL set, because + * of lock stealing; therefore we must also allow: + * + * n,0,1 -> 0,0,1 + * + * Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the + * above wait condition, therefore any concurrent setting of + * PENDING will make the uncontended transition fail. + */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL)) + goto release; /* No contention */ + } + + /* + * Either somebody is queued behind us or _Q_PENDING_VAL got set + * which will then detect the remaining tail and queue behind us + * ensuring we'll see a @next. + */ + set_locked(lock); + + /* + * contended path; wait for next if not observed yet, release. + */ + if (!next) + next = smp_cond_load_relaxed_label(&node->next, (VAL), release_node_err); + + arch_mcs_spin_unlock_contended(&next->locked); + +release:; + /* + * release the node + * + * Doing a normal dec vs this_cpu_dec is fine. An upper context always + * decrements count it incremented before returning, thus we're fine. + * For contexts interrupting us, they either observe our dec or not. + * Just ensure the compiler doesn't reorder this statement, as a + * this_cpu_dec implicitly implied that. + */ + barrier(); + node0->count--; + return 0; +release_node_err: + barrier(); + node0->count--; + goto release_err; +release_err: + return ret; +} + +/** + * arena_spin_lock - acquire a queued spinlock + * @lock: Pointer to queued spinlock structure + * + * On error, returned value will be negative. + * On success, zero is returned. + * + * The return value _must_ be tested against zero for success, + * instead of checking it against negative, for passing the + * BPF verifier. + * + * The user should do: + * if (arena_spin_lock(...) != 0) // failure + * or + * if (arena_spin_lock(...) == 0) // success + * or + * if (arena_spin_lock(...)) // failure + * or + * if (!arena_spin_lock(...)) // success + * instead of: + * if (arena_spin_lock(...) < 0) // failure + * + * The return value can still be inspected later. + */ +static __always_inline int arena_spin_lock(arena_spinlock_t __arena *lock) +{ + int val = 0; + + if (CONFIG_NR_CPUS > 1024) + return -EOPNOTSUPP; + + bpf_preempt_disable(); + if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) + return 0; + + val = arena_spin_lock_slowpath(lock, val); + /* FIXME: bpf_assert_range(-MAX_ERRNO, 0) once we have it working for all cases. */ + if (val) + bpf_preempt_enable(); + return val; +} + +/** + * arena_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static __always_inline void arena_spin_unlock(arena_spinlock_t __arena *lock) +{ + /* + * unlock() needs release semantics: + */ + smp_store_release(&lock->locked, 0); + bpf_preempt_enable(); +} + +#define arena_spin_lock_irqsave(lock, flags) \ + ({ \ + int __ret; \ + bpf_local_irq_save(&(flags)); \ + __ret = arena_spin_lock((lock)); \ + if (__ret) \ + bpf_local_irq_restore(&(flags)); \ + (__ret); \ + }) + +#define arena_spin_unlock_irqrestore(lock, flags) \ + ({ \ + arena_spin_unlock((lock)); \ + bpf_local_irq_restore(&(flags)); \ + }) + +#endif + +#endif /* BPF_ARENA_SPIN_LOCK_H */ diff --git a/tools/testing/selftests/bpf/bpf_atomic.h b/tools/testing/selftests/bpf/bpf_atomic.h new file mode 100644 index 0000000000000..a9674e5443222 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_atomic.h @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#ifndef BPF_ATOMIC_H +#define BPF_ATOMIC_H + +#include +#include +#include "bpf_experimental.h" + +extern bool CONFIG_X86_64 __kconfig __weak; + +/* + * __unqual_typeof(x) - Declare an unqualified scalar type, leaving + * non-scalar types unchanged, + * + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' + * is not type-compatible with 'signed char', and we define a separate case. + * + * This is copied verbatim from kernel's include/linux/compiler_types.h, but + * with default expression (for pointers) changed from (x) to (typeof(x)0). + * + * This is because LLVM has a bug where for lvalue (x), it does not get rid of + * an extra address_space qualifier, but does in case of rvalue (typeof(x)0). + * Hence, for pointers, we need to create an rvalue expression to get the + * desired type. See https://github.com/llvm/llvm-project/issues/53400. + */ +#define __scalar_type_to_expr_cases(type) \ + unsigned type : (unsigned type)0, signed type : (signed type)0 + +#define __unqual_typeof(x) \ + typeof(_Generic((x), \ + char: (char)0, \ + __scalar_type_to_expr_cases(char), \ + __scalar_type_to_expr_cases(short), \ + __scalar_type_to_expr_cases(int), \ + __scalar_type_to_expr_cases(long), \ + __scalar_type_to_expr_cases(long long), \ + default: (typeof(x))0)) + +/* No-op for BPF */ +#define cpu_relax() ({}) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) + +#define WRITE_ONCE(x, val) ((*(volatile typeof(x) *)&(x)) = (val)) + +#define cmpxchg(p, old, new) __sync_val_compare_and_swap((p), old, new) + +#define try_cmpxchg(p, pold, new) \ + ({ \ + __unqual_typeof(*(pold)) __o = *(pold); \ + __unqual_typeof(*(p)) __r = cmpxchg(p, __o, new); \ + if (__r != __o) \ + *(pold) = __r; \ + __r == __o; \ + }) + +#define try_cmpxchg_relaxed(p, pold, new) try_cmpxchg(p, pold, new) + +#define try_cmpxchg_acquire(p, pold, new) try_cmpxchg(p, pold, new) + +#define smp_mb() \ + ({ \ + unsigned long __val; \ + __sync_fetch_and_add(&__val, 0); \ + }) + +#define smp_rmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +#define smp_wmb() \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + else \ + barrier(); \ + }) + +/* Control dependency provides LOAD->STORE, provide LOAD->LOAD */ +#define smp_acquire__after_ctrl_dep() ({ smp_rmb(); }) + +#define smp_load_acquire(p) \ + ({ \ + __unqual_typeof(*(p)) __v = READ_ONCE(*(p)); \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + __v; \ + }) + +#define smp_store_release(p, val) \ + ({ \ + if (!CONFIG_X86_64) \ + smp_mb(); \ + barrier(); \ + WRITE_ONCE(*(p), val); \ + }) + +#define smp_cond_load_relaxed_label(p, cond_expr, label) \ + ({ \ + typeof(p) __ptr = (p); \ + __unqual_typeof(*(p)) VAL; \ + for (;;) { \ + VAL = (__unqual_typeof(*(p)))READ_ONCE(*__ptr); \ + if (cond_expr) \ + break; \ + cond_break_label(label); \ + cpu_relax(); \ + } \ + (typeof(*(p)))VAL; \ + }) + +#define smp_cond_load_acquire_label(p, cond_expr, label) \ + ({ \ + __unqual_typeof(*p) __val = \ + smp_cond_load_relaxed_label(p, cond_expr, label); \ + smp_acquire__after_ctrl_dep(); \ + (typeof(*(p)))__val; \ + }) + +#define atomic_read(p) READ_ONCE((p)->counter) + +#define atomic_cond_read_relaxed_label(p, cond_expr, label) \ + smp_cond_load_relaxed_label(&(p)->counter, cond_expr, label) + +#define atomic_cond_read_acquire_label(p, cond_expr, label) \ + smp_cond_load_acquire_label(&(p)->counter, cond_expr, label) + +#define atomic_try_cmpxchg_relaxed(p, pold, new) \ + try_cmpxchg_relaxed(&(p)->counter, pold, new) + +#define atomic_try_cmpxchg_acquire(p, pold, new) \ + try_cmpxchg_acquire(&(p)->counter, pold, new) + +#endif /* BPF_ATOMIC_H */ diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h index cd8ecd39c3f3c..6535c8ae3c469 100644 --- a/tools/testing/selftests/bpf/bpf_experimental.h +++ b/tools/testing/selftests/bpf/bpf_experimental.h @@ -368,12 +368,12 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("may_goto %l[l_break]" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #else @@ -392,7 +392,7 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ .byte 0; \ @@ -400,7 +400,7 @@ l_true: \ .short 0" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #else @@ -418,7 +418,7 @@ l_true: \ ret; \ }) -#define cond_break \ +#define __cond_break(expr) \ ({ __label__ l_break, l_continue; \ asm volatile goto("1:.byte 0xe5; \ .byte 0; \ @@ -426,12 +426,15 @@ l_true: \ .short 0" \ :::: l_break); \ goto l_continue; \ - l_break: break; \ + l_break: expr; \ l_continue:; \ }) #endif #endif +#define cond_break __cond_break(break) +#define cond_break_label(label) __cond_break(goto label) + #ifndef bpf_nop_mov #define bpf_nop_mov(var) \ asm volatile("%[reg]=%[reg]"::[reg]"r"((short)var)) diff --git a/tools/testing/selftests/bpf/cap_helpers.c b/tools/testing/selftests/bpf/cap_helpers.c index d5ac507401d7c..98f840c3a38f7 100644 --- a/tools/testing/selftests/bpf/cap_helpers.c +++ b/tools/testing/selftests/bpf/cap_helpers.c @@ -19,7 +19,7 @@ int cap_enable_effective(__u64 caps, __u64 *old_caps) err = capget(&hdr, data); if (err) - return err; + return -errno; if (old_caps) *old_caps = (__u64)(data[1].effective) << 32 | data[0].effective; @@ -32,7 +32,7 @@ int cap_enable_effective(__u64 caps, __u64 *old_caps) data[1].effective |= cap1; err = capset(&hdr, data); if (err) - return err; + return -errno; return 0; } @@ -49,7 +49,7 @@ int cap_disable_effective(__u64 caps, __u64 *old_caps) err = capget(&hdr, data); if (err) - return err; + return -errno; if (old_caps) *old_caps = (__u64)(data[1].effective) << 32 | data[0].effective; @@ -61,7 +61,7 @@ int cap_disable_effective(__u64 caps, __u64 *old_caps) data[1].effective &= ~cap1; err = capset(&hdr, data); if (err) - return err; + return -errno; return 0; } diff --git a/tools/testing/selftests/bpf/cap_helpers.h b/tools/testing/selftests/bpf/cap_helpers.h index 6d163530cb0fd..8dcb28557f762 100644 --- a/tools/testing/selftests/bpf/cap_helpers.h +++ b/tools/testing/selftests/bpf/cap_helpers.h @@ -4,6 +4,7 @@ #include #include +#include #ifndef CAP_PERFMON #define CAP_PERFMON 38 diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c index 737a952dcf80b..97fb7caf95f45 100644 --- a/tools/testing/selftests/bpf/network_helpers.c +++ b/tools/testing/selftests/bpf/network_helpers.c @@ -750,6 +750,36 @@ struct tmonitor_ctx { int pcap_fd; }; +static int __base_pr(const char *format, va_list args) +{ + return vfprintf(stdout, format, args); +} + +static tm_print_fn_t __tm_pr = __base_pr; + +tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) +{ + tm_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__tm_pr, fn, __ATOMIC_RELAXED); + + return old_print_fn; +} + +void tm_print(const char *format, ...) +{ + tm_print_fn_t print_fn; + va_list args; + + print_fn = __atomic_load_n(&__tm_pr, __ATOMIC_RELAXED); + if (!print_fn) + return; + + va_start(args, format); + print_fn(format, args); + va_end(args); +} + /* Is this packet captured with a Ethernet protocol type? */ static bool is_ethernet(const u_char *packet) { @@ -767,7 +797,7 @@ static bool is_ethernet(const u_char *packet) case 770: /* ARPHRD_FRAD */ case 778: /* ARPHDR_IPGRE */ case 803: /* ARPHRD_IEEE80211_RADIOTAP */ - printf("Packet captured: arphdr_type=%d\n", arphdr_type); + tm_print("Packet captured: arphdr_type=%d\n", arphdr_type); return false; } return true; @@ -817,19 +847,19 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, dst_port = ntohs(tcp->dest); transport_str = "TCP"; } else if (proto == IPPROTO_ICMP) { - printf("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", - ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, - packet[0], packet[1]); + tm_print("%-7s %-3s IPv4 %s > %s: ICMP, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); return; } else if (proto == IPPROTO_ICMPV6) { - printf("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", - ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, - packet[0], packet[1]); + tm_print("%-7s %-3s IPv6 %s > %s: ICMPv6, length %d, type %d, code %d\n", + ifname, pkt_type_str(pkt_type), src_addr, dst_addr, len, + packet[0], packet[1]); return; } else { - printf("%-7s %-3s %s %s > %s: protocol %d\n", - ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", - src_addr, dst_addr, proto); + tm_print("%-7s %-3s %s %s > %s: protocol %d\n", + ifname, pkt_type_str(pkt_type), ipv6 ? "IPv6" : "IPv4", + src_addr, dst_addr, proto); return; } @@ -843,13 +873,13 @@ static void show_transport(const u_char *packet, u16 len, u32 ifindex, tcp->ack ? ", ACK" : ""); if (ipv6) - printf("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", - ifname, pkt_type_str(pkt_type), src_addr, src_port, - dst_addr, dst_port, transport_str, len, flags); + tm_print("%-7s %-3s IPv6 %s.%d > %s.%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len, flags); else - printf("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", - ifname, pkt_type_str(pkt_type), src_addr, src_port, - dst_addr, dst_port, transport_str, len, flags); + tm_print("%-7s %-3s IPv4 %s:%d > %s:%d: %s, length %d%s\n", + ifname, pkt_type_str(pkt_type), src_addr, src_port, + dst_addr, dst_port, transport_str, len, flags); } static void show_ipv6_packet(const u_char *packet, u32 ifindex, u8 pkt_type) @@ -964,8 +994,8 @@ static void *traffic_monitor_thread(void *arg) ifname = _ifname; } - printf("%-7s %-3s Unknown network protocol type 0x%x\n", - ifname, pkt_type_str(ptype), proto); + tm_print("%-7s %-3s Unknown network protocol type 0x%x\n", + ifname, pkt_type_str(ptype), proto); } } @@ -1165,8 +1195,9 @@ void traffic_monitor_stop(struct tmonitor_ctx *ctx) write(ctx->wake_fd, &w, sizeof(w)); pthread_join(ctx->thread, NULL); - printf("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); + tm_print("Packet file: %s\n", strrchr(ctx->pkt_fname, '/') + 1); traffic_monitor_release(ctx); } + #endif /* TRAFFIC_MONITOR */ diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h index 9f6e05d886c54..a4e20c12c57ef 100644 --- a/tools/testing/selftests/bpf/network_helpers.h +++ b/tools/testing/selftests/bpf/network_helpers.h @@ -17,6 +17,7 @@ typedef __u16 __sum16; #include #include #include +#include #define MAGIC_VAL 0x1234 #define NUM_ITER 100000 @@ -249,10 +250,13 @@ static inline __sum16 build_udp_v6_csum(const struct ipv6hdr *ip6h, struct tmonitor_ctx; +typedef int (*tm_print_fn_t)(const char *format, va_list args); + #ifdef TRAFFIC_MONITOR struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name); void traffic_monitor_stop(struct tmonitor_ctx *ctx); +tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn); #else static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, const char *test_name, const char *subtest_name) @@ -263,6 +267,11 @@ static inline struct tmonitor_ctx *traffic_monitor_start(const char *netns, cons static inline void traffic_monitor_stop(struct tmonitor_ctx *ctx) { } + +static inline tm_print_fn_t traffic_monitor_set_print(tm_print_fn_t fn) +{ + return NULL; +} #endif #endif diff --git a/tools/testing/selftests/bpf/prog_tests/align.c b/tools/testing/selftests/bpf/prog_tests/align.c index 4ebd0da898f5c..1d53a8561ee2f 100644 --- a/tools/testing/selftests/bpf/prog_tests/align.c +++ b/tools/testing/selftests/bpf/prog_tests/align.c @@ -610,9 +610,11 @@ static int do_test_single(struct bpf_align_test *test) .log_size = sizeof(bpf_vlog), .log_level = 2, ); + const char *main_pass_start = "0: R1=ctx() R10=fp0"; const char *line_ptr; int cur_line = -1; int prog_len, i; + char *start; int fd_prog; int ret; @@ -632,7 +634,13 @@ static int do_test_single(struct bpf_align_test *test) ret = 0; /* We make a local copy so that we can strtok() it */ strncpy(bpf_vlog_copy, bpf_vlog, sizeof(bpf_vlog_copy)); - line_ptr = strtok(bpf_vlog_copy, "\n"); + start = strstr(bpf_vlog_copy, main_pass_start); + if (!start) { + ret = 1; + printf("Can't find initial line '%s'\n", main_pass_start); + goto out; + } + line_ptr = strtok(start, "\n"); for (i = 0; i < MAX_MATCHES; i++) { struct bpf_reg_match m = test->matches[i]; const char *p; @@ -682,6 +690,7 @@ static int do_test_single(struct bpf_align_test *test) break; } } +out: if (fd_prog >= 0) close(fd_prog); } diff --git a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c index 26e7c06c6cb4d..d98577a6babc0 100644 --- a/tools/testing/selftests/bpf/prog_tests/arena_atomics.c +++ b/tools/testing/selftests/bpf/prog_tests/arena_atomics.c @@ -162,6 +162,66 @@ static void test_uaf(struct arena_atomics *skel) ASSERT_EQ(skel->arena->uaf_recovery_fails, 0, "uaf_recovery_fails"); } +static void test_load_acquire(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + if (skel->data->skip_lacq_srel_tests) { + printf("%s:SKIP: ENABLE_ATOMICS_TESTS not defined, Clang doesn't support addr_space_cast, and/or JIT doesn't support load-acquire\n", + __func__); + test__skip(); + return; + } + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.load_acquire); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->load_acquire8_result, 0x12, + "load_acquire8_result"); + ASSERT_EQ(skel->arena->load_acquire16_result, 0x1234, + "load_acquire16_result"); + ASSERT_EQ(skel->arena->load_acquire32_result, 0x12345678, + "load_acquire32_result"); + ASSERT_EQ(skel->arena->load_acquire64_result, 0x1234567890abcdef, + "load_acquire64_result"); +} + +static void test_store_release(struct arena_atomics *skel) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + int err, prog_fd; + + if (skel->data->skip_lacq_srel_tests) { + printf("%s:SKIP: ENABLE_ATOMICS_TESTS not defined, Clang doesn't support addr_space_cast, and/or JIT doesn't support store-release\n", + __func__); + test__skip(); + return; + } + + /* No need to attach it, just run it directly */ + prog_fd = bpf_program__fd(skel->progs.store_release); + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + return; + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + return; + + ASSERT_EQ(skel->arena->store_release8_result, 0x12, + "store_release8_result"); + ASSERT_EQ(skel->arena->store_release16_result, 0x1234, + "store_release16_result"); + ASSERT_EQ(skel->arena->store_release32_result, 0x12345678, + "store_release32_result"); + ASSERT_EQ(skel->arena->store_release64_result, 0x1234567890abcdef, + "store_release64_result"); +} + void test_arena_atomics(void) { struct arena_atomics *skel; @@ -171,7 +231,7 @@ void test_arena_atomics(void) if (!ASSERT_OK_PTR(skel, "arena atomics skeleton open")) return; - if (skel->data->skip_tests) { + if (skel->data->skip_all_tests) { printf("%s:SKIP:no ENABLE_ATOMICS_TESTS or no addr_space_cast support in clang", __func__); test__skip(); @@ -198,6 +258,10 @@ void test_arena_atomics(void) test_xchg(skel); if (test__start_subtest("uaf")) test_uaf(skel); + if (test__start_subtest("load_acquire")) + test_load_acquire(skel); + if (test__start_subtest("store_release")) + test_store_release(skel); cleanup: arena_atomics__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c new file mode 100644 index 0000000000000..7565fc7690c27 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/arena_spin_lock.c @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include + +struct __qspinlock { int val; }; +typedef struct __qspinlock arena_spinlock_t; + +struct arena_qnode { + unsigned long next; + int count; + int locked; +}; + +#include "arena_spin_lock.skel.h" + +static long cpu; +static int repeat; + +pthread_barrier_t barrier; + +static void *spin_lock_thread(void *arg) +{ + int err, prog_fd = *(u32 *)arg; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = repeat, + ); + cpu_set_t cpuset; + + CPU_ZERO(&cpuset); + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset), "cpu affinity"); + + err = pthread_barrier_wait(&barrier); + if (err != PTHREAD_BARRIER_SERIAL_THREAD && err != 0) + ASSERT_FALSE(true, "pthread_barrier"); + + err = bpf_prog_test_run_opts(prog_fd, &topts); + ASSERT_OK(err, "test_run err"); + ASSERT_EQ((int)topts.retval, 0, "test_run retval"); + + pthread_exit(arg); +} + +static void test_arena_spin_lock_size(int size) +{ + LIBBPF_OPTS(bpf_test_run_opts, topts); + struct arena_spin_lock *skel; + pthread_t thread_id[16]; + int prog_fd, i, err; + void *ret; + + if (get_nprocs() < 2) { + test__skip(); + return; + } + + skel = arena_spin_lock__open_and_load(); + if (!ASSERT_OK_PTR(skel, "arena_spin_lock__open_and_load")) + return; + if (skel->data->test_skip == 2) { + test__skip(); + goto end; + } + skel->bss->cs_count = size; + skel->bss->limit = repeat * 16; + + ASSERT_OK(pthread_barrier_init(&barrier, NULL, 16), "barrier init"); + + prog_fd = bpf_program__fd(skel->progs.prog); + for (i = 0; i < 16; i++) { + err = pthread_create(&thread_id[i], NULL, &spin_lock_thread, &prog_fd); + if (!ASSERT_OK(err, "pthread_create")) + goto end_barrier; + } + + for (i = 0; i < 16; i++) { + if (!ASSERT_OK(pthread_join(thread_id[i], &ret), "pthread_join")) + goto end_barrier; + if (!ASSERT_EQ(ret, &prog_fd, "ret == prog_fd")) + goto end_barrier; + } + + ASSERT_EQ(skel->bss->counter, repeat * 16, "check counter value"); + +end_barrier: + pthread_barrier_destroy(&barrier); +end: + arena_spin_lock__destroy(skel); + return; +} + +void test_arena_spin_lock(void) +{ + repeat = 1000; + if (test__start_subtest("arena_spin_lock_1")) + test_arena_spin_lock_size(1); + cpu = 0; + if (test__start_subtest("arena_spin_lock_1000")) + test_arena_spin_lock_size(1000); + cpu = 0; + repeat = 100; + if (test__start_subtest("arena_spin_lock_50000")) + test_arena_spin_lock_size(50000); +} diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c index a4a1f93878d40..dbd13f8e42a7a 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c @@ -72,11 +72,14 @@ static void test_bpf_nf_ct(int mode) if (!ASSERT_OK(system(cmd), cmd)) goto end; - srv_port = (mode == TEST_XDP) ? 5005 : 5006; - srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", srv_port, TIMEOUT_MS); + srv_fd = start_server(AF_INET, SOCK_STREAM, "127.0.0.1", 0, TIMEOUT_MS); if (!ASSERT_GE(srv_fd, 0, "start_server")) goto end; + srv_port = get_socket_local_port(srv_fd); + if (!ASSERT_GE(srv_port, 0, "get_sock_local_port")) + goto end; + client_fd = connect_to_server(srv_fd); if (!ASSERT_GE(client_fd, 0, "connect_to_server")) goto end; @@ -91,7 +94,7 @@ static void test_bpf_nf_ct(int mode) skel->bss->saddr = peer_addr.sin_addr.s_addr; skel->bss->sport = peer_addr.sin_port; skel->bss->daddr = peer_addr.sin_addr.s_addr; - skel->bss->dport = htons(srv_port); + skel->bss->dport = srv_port; if (mode == TEST_XDP) prog_fd = bpf_program__fd(skel->progs.nf_xdp_ct_test); diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c new file mode 100644 index 0000000000000..d4d583872fa26 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_preorder.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include "cgroup_helpers.h" +#include "cgroup_preorder.skel.h" + +static int run_getsockopt_test(int cg_parent, int cg_child, int sock_fd, bool all_preorder) +{ + LIBBPF_OPTS(bpf_prog_attach_opts, opts); + enum bpf_attach_type prog_c_atype, prog_c2_atype, prog_p_atype, prog_p2_atype; + int prog_c_fd, prog_c2_fd, prog_p_fd, prog_p2_fd; + struct cgroup_preorder *skel = NULL; + struct bpf_program *prog; + __u8 *result, buf; + socklen_t optlen; + int err = 0; + + skel = cgroup_preorder__open_and_load(); + if (!ASSERT_OK_PTR(skel, "cgroup_preorder__open_and_load")) + return 0; + + buf = 0x00; + err = setsockopt(sock_fd, SOL_IP, IP_TOS, &buf, 1); + if (!ASSERT_OK(err, "setsockopt")) + goto close_skel; + + opts.flags = BPF_F_ALLOW_MULTI; + if (all_preorder) + opts.flags |= BPF_F_PREORDER; + prog = skel->progs.child; + prog_c_fd = bpf_program__fd(prog); + prog_c_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_c_fd, cg_child, prog_c_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-child")) + goto close_skel; + + opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER; + prog = skel->progs.child_2; + prog_c2_fd = bpf_program__fd(prog); + prog_c2_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_c2_fd, cg_child, prog_c2_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-child_2")) + goto detach_child; + + optlen = 1; + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt")) + goto detach_child_2; + + result = skel->bss->result; + if (all_preorder) + ASSERT_TRUE(result[0] == 1 && result[1] == 2, "child only"); + else + ASSERT_TRUE(result[0] == 2 && result[1] == 1, "child only"); + + skel->bss->idx = 0; + memset(result, 0, 4); + + opts.flags = BPF_F_ALLOW_MULTI; + if (all_preorder) + opts.flags |= BPF_F_PREORDER; + prog = skel->progs.parent; + prog_p_fd = bpf_program__fd(prog); + prog_p_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_p_fd, cg_parent, prog_p_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent")) + goto detach_child_2; + + opts.flags = BPF_F_ALLOW_MULTI | BPF_F_PREORDER; + prog = skel->progs.parent_2; + prog_p2_fd = bpf_program__fd(prog); + prog_p2_atype = bpf_program__expected_attach_type(prog); + err = bpf_prog_attach_opts(prog_p2_fd, cg_parent, prog_p2_atype, &opts); + if (!ASSERT_OK(err, "bpf_prog_attach_opts-parent_2")) + goto detach_parent; + + err = getsockopt(sock_fd, SOL_IP, IP_TOS, &buf, &optlen); + if (!ASSERT_OK(err, "getsockopt")) + goto detach_parent_2; + + if (all_preorder) + ASSERT_TRUE(result[0] == 3 && result[1] == 4 && result[2] == 1 && result[3] == 2, + "parent and child"); + else + ASSERT_TRUE(result[0] == 4 && result[1] == 2 && result[2] == 1 && result[3] == 3, + "parent and child"); + +detach_parent_2: + ASSERT_OK(bpf_prog_detach2(prog_p2_fd, cg_parent, prog_p2_atype), + "bpf_prog_detach2-parent_2"); +detach_parent: + ASSERT_OK(bpf_prog_detach2(prog_p_fd, cg_parent, prog_p_atype), + "bpf_prog_detach2-parent"); +detach_child_2: + ASSERT_OK(bpf_prog_detach2(prog_c2_fd, cg_child, prog_c2_atype), + "bpf_prog_detach2-child_2"); +detach_child: + ASSERT_OK(bpf_prog_detach2(prog_c_fd, cg_child, prog_c_atype), + "bpf_prog_detach2-child"); +close_skel: + cgroup_preorder__destroy(skel); + return err; +} + +void test_cgroup_preorder(void) +{ + int cg_parent = -1, cg_child = -1, sock_fd = -1; + + cg_parent = test__join_cgroup("/parent"); + if (!ASSERT_GE(cg_parent, 0, "join_cgroup /parent")) + goto out; + + cg_child = test__join_cgroup("/parent/child"); + if (!ASSERT_GE(cg_child, 0, "join_cgroup /parent/child")) + goto out; + + sock_fd = socket(AF_INET, SOCK_STREAM, 0); + if (!ASSERT_GE(sock_fd, 0, "socket")) + goto out; + + ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, false), "getsockopt_test_1"); + ASSERT_OK(run_getsockopt_test(cg_parent, cg_child, sock_fd, true), "getsockopt_test_2"); + +out: + close(sock_fd); + close(cg_child); + close(cg_parent); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c index 64abba72ac107..37c1cc52ed98e 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_v1v2.c @@ -10,12 +10,18 @@ static int run_test(int cgroup_fd, int server_fd, bool classid) { struct connect4_dropper *skel; - int fd, err = 0; + int fd, err = 0, port; skel = connect4_dropper__open_and_load(); if (!ASSERT_OK_PTR(skel, "skel_open")) return -1; + port = get_socket_local_port(server_fd); + if (!ASSERT_GE(port, 0, "get_socket_local_port")) + return -1; + + skel->bss->port = ntohs(port); + skel->links.connect_v4_dropper = bpf_program__attach_cgroup(skel->progs.connect_v4_dropper, cgroup_fd); @@ -48,10 +54,9 @@ void test_cgroup_v1v2(void) { struct network_helper_opts opts = {}; int server_fd, client_fd, cgroup_fd; - static const int port = 60120; /* Step 1: Check base connectivity works without any BPF. */ - server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) return; client_fd = connect_to_fd_opts(server_fd, &opts); @@ -66,7 +71,7 @@ void test_cgroup_v1v2(void) cgroup_fd = test__join_cgroup("/connect_dropper"); if (!ASSERT_GE(cgroup_fd, 0, "cgroup_fd")) return; - server_fd = start_server(AF_INET, SOCK_STREAM, NULL, port, 0); + server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0); if (!ASSERT_GE(server_fd, 0, "server_fd")) { close(cgroup_fd); return; diff --git a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c b/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c deleted file mode 100644 index 7526de3790814..0000000000000 --- a/tools/testing/selftests/bpf/prog_tests/changes_pkt_data.c +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include "bpf/libbpf.h" -#include "changes_pkt_data_freplace.skel.h" -#include "changes_pkt_data.skel.h" -#include - -static void print_verifier_log(const char *log) -{ - if (env.verbosity >= VERBOSE_VERY) - fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); -} - -static void test_aux(const char *main_prog_name, - const char *to_be_replaced, - const char *replacement, - bool expect_load) -{ - struct changes_pkt_data_freplace *freplace = NULL; - struct bpf_program *freplace_prog = NULL; - struct bpf_program *main_prog = NULL; - LIBBPF_OPTS(bpf_object_open_opts, opts); - struct changes_pkt_data *main = NULL; - char log[16*1024]; - int err; - - opts.kernel_log_buf = log; - opts.kernel_log_size = sizeof(log); - if (env.verbosity >= VERBOSE_SUPER) - opts.kernel_log_level = 1 | 2 | 4; - main = changes_pkt_data__open_opts(&opts); - if (!ASSERT_OK_PTR(main, "changes_pkt_data__open")) - goto out; - main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); - if (!ASSERT_OK_PTR(main_prog, "main_prog")) - goto out; - bpf_program__set_autoload(main_prog, true); - err = changes_pkt_data__load(main); - print_verifier_log(log); - if (!ASSERT_OK(err, "changes_pkt_data__load")) - goto out; - freplace = changes_pkt_data_freplace__open_opts(&opts); - if (!ASSERT_OK_PTR(freplace, "changes_pkt_data_freplace__open")) - goto out; - freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); - if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) - goto out; - bpf_program__set_autoload(freplace_prog, true); - bpf_program__set_autoattach(freplace_prog, true); - bpf_program__set_attach_target(freplace_prog, - bpf_program__fd(main_prog), - to_be_replaced); - err = changes_pkt_data_freplace__load(freplace); - print_verifier_log(log); - if (expect_load) { - ASSERT_OK(err, "changes_pkt_data_freplace__load"); - } else { - ASSERT_ERR(err, "changes_pkt_data_freplace__load"); - ASSERT_HAS_SUBSTR(log, "Extension program changes packet data", "error log"); - } - -out: - changes_pkt_data_freplace__destroy(freplace); - changes_pkt_data__destroy(main); -} - -/* There are two global subprograms in both changes_pkt_data.skel.h: - * - one changes packet data; - * - another does not. - * It is ok to freplace subprograms that change packet data with those - * that either do or do not. It is only ok to freplace subprograms - * that do not change packet data with those that do not as well. - * The below tests check outcomes for each combination of such freplace. - * Also test a case when main subprogram itself is replaced and is a single - * subprogram in a program. - */ -void test_changes_pkt_data_freplace(void) -{ - struct { - const char *main; - const char *to_be_replaced; - bool changes; - } mains[] = { - { "main_with_subprogs", "changes_pkt_data", true }, - { "main_with_subprogs", "does_not_change_pkt_data", false }, - { "main_changes", "main_changes", true }, - { "main_does_not_change", "main_does_not_change", false }, - }; - struct { - const char *func; - bool changes; - } replacements[] = { - { "changes_pkt_data", true }, - { "does_not_change_pkt_data", false } - }; - char buf[64]; - - for (int i = 0; i < ARRAY_SIZE(mains); ++i) { - for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { - snprintf(buf, sizeof(buf), "%s_with_%s", - mains[i].to_be_replaced, replacements[j].func); - if (!test__start_subtest(buf)) - continue; - test_aux(mains[i].main, mains[i].to_be_replaced, replacements[j].func, - mains[i].changes || !replacements[j].changes); - } - } -} diff --git a/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c b/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c new file mode 100644 index 0000000000000..285f20241fe14 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/compute_live_registers.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "compute_live_registers.skel.h" +#include "test_progs.h" + +void test_compute_live_registers(void) +{ + RUN_TESTS(compute_live_registers); +} diff --git a/tools/testing/selftests/bpf/prog_tests/cpumask.c b/tools/testing/selftests/bpf/prog_tests/cpumask.c index e58a04654238c..6c45330a5ca31 100644 --- a/tools/testing/selftests/bpf/prog_tests/cpumask.c +++ b/tools/testing/selftests/bpf/prog_tests/cpumask.c @@ -25,6 +25,10 @@ static const char * const cpumask_success_testcases[] = { "test_global_mask_nested_deep_rcu", "test_global_mask_nested_deep_array_rcu", "test_cpumask_weight", + "test_refcount_null_tracking", + "test_populate_reject_small_mask", + "test_populate_reject_unaligned", + "test_populate", }; static void verify_success(const char *prog_name) @@ -78,6 +82,5 @@ void test_cpumask(void) verify_success(cpumask_success_testcases[i]); } - RUN_TESTS(cpumask_success); RUN_TESTS(cpumask_failure); } diff --git a/tools/testing/selftests/bpf/prog_tests/dynptr.c b/tools/testing/selftests/bpf/prog_tests/dynptr.c index b614a5272dfd6..e29cc16124c2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/dynptr.c +++ b/tools/testing/selftests/bpf/prog_tests/dynptr.c @@ -10,6 +10,7 @@ enum test_setup_type { SETUP_SYSCALL_SLEEP, SETUP_SKB_PROG, SETUP_SKB_PROG_TP, + SETUP_XDP_PROG, }; static struct { @@ -18,6 +19,8 @@ static struct { } success_tests[] = { {"test_read_write", SETUP_SYSCALL_SLEEP}, {"test_dynptr_data", SETUP_SYSCALL_SLEEP}, + {"test_dynptr_copy", SETUP_SYSCALL_SLEEP}, + {"test_dynptr_copy_xdp", SETUP_XDP_PROG}, {"test_ringbuf", SETUP_SYSCALL_SLEEP}, {"test_skb_readonly", SETUP_SKB_PROG}, {"test_dynptr_skb_data", SETUP_SKB_PROG}, @@ -120,6 +123,24 @@ static void verify_success(const char *prog_name, enum test_setup_type setup_typ break; } + case SETUP_XDP_PROG: + { + char data[5000]; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, opts, + .data_in = &data, + .data_size_in = sizeof(data), + .repeat = 1, + ); + + prog_fd = bpf_program__fd(prog); + err = bpf_prog_test_run_opts(prog_fd, &opts); + + if (!ASSERT_OK(err, "test_run")) + goto cleanup; + + break; + } } ASSERT_EQ(skel->bss->err, 0, "err"); diff --git a/tools/testing/selftests/bpf/prog_tests/fd_array.c b/tools/testing/selftests/bpf/prog_tests/fd_array.c index a1d52e73fb162..9add890c2d372 100644 --- a/tools/testing/selftests/bpf/prog_tests/fd_array.c +++ b/tools/testing/selftests/bpf/prog_tests/fd_array.c @@ -83,8 +83,8 @@ static inline int bpf_prog_get_map_ids(int prog_fd, __u32 *nr_map_ids, __u32 *ma int err; memset(&info, 0, len); - info.nr_map_ids = *nr_map_ids, - info.map_ids = ptr_to_u64(map_ids), + info.nr_map_ids = *nr_map_ids; + info.map_ids = ptr_to_u64(map_ids); err = bpf_prog_get_info_by_fd(prog_fd, &info, &len); if (!ASSERT_OK(err, "bpf_prog_get_info_by_fd")) diff --git a/tools/testing/selftests/bpf/prog_tests/kernel_flag.c b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c new file mode 100644 index 0000000000000..a133354ac9bc2 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/kernel_flag.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Microsoft */ +#include +#include "kfunc_call_test.skel.h" +#include "kfunc_call_test.lskel.h" +#include "test_kernel_flag.skel.h" + +void test_kernel_flag(void) +{ + struct test_kernel_flag *lsm_skel; + struct kfunc_call_test *skel = NULL; + struct kfunc_call_test_lskel *lskel = NULL; + int ret; + + lsm_skel = test_kernel_flag__open_and_load(); + if (!ASSERT_OK_PTR(lsm_skel, "lsm_skel")) + return; + + lsm_skel->bss->monitored_tid = gettid(); + + ret = test_kernel_flag__attach(lsm_skel); + if (!ASSERT_OK(ret, "test_kernel_flag__attach")) + goto close_prog; + + /* Test with skel. This should pass the gatekeeper */ + skel = kfunc_call_test__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel")) + goto close_prog; + + /* Test with lskel. This should fail due to blocking kernel-based bpf() invocations */ + lskel = kfunc_call_test_lskel__open_and_load(); + if (!ASSERT_ERR_PTR(lskel, "lskel")) + goto close_prog; + +close_prog: + if (skel) + kfunc_call_test__destroy(skel); + if (lskel) + kfunc_call_test_lskel__destroy(lskel); + + lsm_skel->bss->monitored_tid = 0; + test_kernel_flag__destroy(lsm_skel); +} diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c new file mode 100644 index 0000000000000..b6391af5f6f96 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lwt_ip_encap.c @@ -0,0 +1,540 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +#include "network_helpers.h" +#include "test_progs.h" + +#define BPF_FILE "test_lwt_ip_encap.bpf.o" + +#define NETNS_NAME_SIZE 32 +#define NETNS_BASE "ns-lwt-ip-encap" + +#define IP4_ADDR_1 "172.16.1.100" +#define IP4_ADDR_2 "172.16.2.100" +#define IP4_ADDR_3 "172.16.3.100" +#define IP4_ADDR_4 "172.16.4.100" +#define IP4_ADDR_5 "172.16.5.100" +#define IP4_ADDR_6 "172.16.6.100" +#define IP4_ADDR_7 "172.16.7.100" +#define IP4_ADDR_8 "172.16.8.100" +#define IP4_ADDR_GRE "172.16.16.100" + +#define IP4_ADDR_SRC IP4_ADDR_1 +#define IP4_ADDR_DST IP4_ADDR_4 + +#define IP6_ADDR_1 "fb01::1" +#define IP6_ADDR_2 "fb02::1" +#define IP6_ADDR_3 "fb03::1" +#define IP6_ADDR_4 "fb04::1" +#define IP6_ADDR_5 "fb05::1" +#define IP6_ADDR_6 "fb06::1" +#define IP6_ADDR_7 "fb07::1" +#define IP6_ADDR_8 "fb08::1" +#define IP6_ADDR_GRE "fb10::1" + +#define IP6_ADDR_SRC IP6_ADDR_1 +#define IP6_ADDR_DST IP6_ADDR_4 + +/* Setup/topology: + * + * NS1 NS2 NS3 + * veth1 <---> veth2 veth3 <---> veth4 (the top route) + * veth5 <---> veth6 veth7 <---> veth8 (the bottom route) + * + * Each vethN gets IP[4|6]_ADDR_N address. + * + * IP*_ADDR_SRC = IP*_ADDR_1 + * IP*_ADDR_DST = IP*_ADDR_4 + * + * All tests test pings from IP*_ADDR__SRC to IP*_ADDR_DST. + * + * By default, routes are configured to allow packets to go + * IP*_ADDR_1 <=> IP*_ADDR_2 <=> IP*_ADDR_3 <=> IP*_ADDR_4 (the top route). + * + * A GRE device is installed in NS3 with IP*_ADDR_GRE, and + * NS1/NS2 are configured to route packets to IP*_ADDR_GRE via IP*_ADDR_8 + * (the bottom route). + * + * Tests: + * + * 1. Routes NS2->IP*_ADDR_DST are brought down, so the only way a ping + * from IP*_ADDR_SRC to IP*_ADDR_DST can work is via IP*_ADDR_GRE. + * + * 2a. In an egress test, a bpf LWT_XMIT program is installed on veth1 + * that encaps the packets with an IP/GRE header to route to IP*_ADDR_GRE. + * + * ping: SRC->[encap at veth1:egress]->GRE:decap->DST + * ping replies go DST->SRC directly + * + * 2b. In an ingress test, a bpf LWT_IN program is installed on veth2 + * that encaps the packets with an IP/GRE header to route to IP*_ADDR_GRE. + * + * ping: SRC->[encap at veth2:ingress]->GRE:decap->DST + * ping replies go DST->SRC directly + */ + +static int create_ns(char *name, size_t name_sz) +{ + if (!name) + goto fail; + + if (!ASSERT_OK(append_tid(name, name_sz), "append TID")) + goto fail; + + SYS(fail, "ip netns add %s", name); + + /* rp_filter gets confused by what these tests are doing, so disable it */ + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.conf.all.rp_filter=0", name); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.conf.default.rp_filter=0", name); + /* Disable IPv6 DAD because it sometimes takes too long and fails tests */ + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.all.accept_dad=0", name); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.default.accept_dad=0", name); + + return 0; +fail: + return -1; +} + +static int set_top_addr(const char *ns1, const char *ns2, const char *ns3) +{ + SYS(fail, "ip -n %s a add %s/24 dev veth1", ns1, IP4_ADDR_1); + SYS(fail, "ip -n %s a add %s/24 dev veth2", ns2, IP4_ADDR_2); + SYS(fail, "ip -n %s a add %s/24 dev veth3", ns2, IP4_ADDR_3); + SYS(fail, "ip -n %s a add %s/24 dev veth4", ns3, IP4_ADDR_4); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth1", ns1, IP6_ADDR_1); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth2", ns2, IP6_ADDR_2); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth3", ns2, IP6_ADDR_3); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth4", ns3, IP6_ADDR_4); + + SYS(fail, "ip -n %s link set dev veth1 up", ns1); + SYS(fail, "ip -n %s link set dev veth2 up", ns2); + SYS(fail, "ip -n %s link set dev veth3 up", ns2); + SYS(fail, "ip -n %s link set dev veth4 up", ns3); + + return 0; +fail: + return 1; +} + +static int set_bottom_addr(const char *ns1, const char *ns2, const char *ns3) +{ + SYS(fail, "ip -n %s a add %s/24 dev veth5", ns1, IP4_ADDR_5); + SYS(fail, "ip -n %s a add %s/24 dev veth6", ns2, IP4_ADDR_6); + SYS(fail, "ip -n %s a add %s/24 dev veth7", ns2, IP4_ADDR_7); + SYS(fail, "ip -n %s a add %s/24 dev veth8", ns3, IP4_ADDR_8); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth5", ns1, IP6_ADDR_5); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth6", ns2, IP6_ADDR_6); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth7", ns2, IP6_ADDR_7); + SYS(fail, "ip -n %s -6 a add %s/128 dev veth8", ns3, IP6_ADDR_8); + + SYS(fail, "ip -n %s link set dev veth5 up", ns1); + SYS(fail, "ip -n %s link set dev veth6 up", ns2); + SYS(fail, "ip -n %s link set dev veth7 up", ns2); + SYS(fail, "ip -n %s link set dev veth8 up", ns3); + + return 0; +fail: + return 1; +} + +static int configure_vrf(const char *ns1, const char *ns2) +{ + if (!ns1 || !ns2) + goto fail; + + SYS(fail, "ip -n %s link add red type vrf table 1001", ns1); + SYS(fail, "ip -n %s link set red up", ns1); + SYS(fail, "ip -n %s route add table 1001 unreachable default metric 8192", ns1); + SYS(fail, "ip -n %s -6 route add table 1001 unreachable default metric 8192", ns1); + SYS(fail, "ip -n %s link set veth1 vrf red", ns1); + SYS(fail, "ip -n %s link set veth5 vrf red", ns1); + + SYS(fail, "ip -n %s link add red type vrf table 1001", ns2); + SYS(fail, "ip -n %s link set red up", ns2); + SYS(fail, "ip -n %s route add table 1001 unreachable default metric 8192", ns2); + SYS(fail, "ip -n %s -6 route add table 1001 unreachable default metric 8192", ns2); + SYS(fail, "ip -n %s link set veth2 vrf red", ns2); + SYS(fail, "ip -n %s link set veth3 vrf red", ns2); + SYS(fail, "ip -n %s link set veth6 vrf red", ns2); + SYS(fail, "ip -n %s link set veth7 vrf red", ns2); + + return 0; +fail: + return -1; +} + +static int configure_ns1(const char *ns1, const char *vrf) +{ + struct nstoken *nstoken = NULL; + + if (!ns1 || !vrf) + goto fail; + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto fail; + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth1 %s", IP4_ADDR_2, vrf); + SYS(fail, "ip route add default dev veth1 via %s %s", IP4_ADDR_2, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth1 %s", IP6_ADDR_2, vrf); + SYS(fail, "ip -6 route add default dev veth1 via %s %s", IP6_ADDR_2, vrf); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth5 %s", IP4_ADDR_6, vrf); + SYS(fail, "ip route add %s/32 dev veth5 via %s %s", IP4_ADDR_7, IP4_ADDR_6, vrf); + SYS(fail, "ip route add %s/32 dev veth5 via %s %s", IP4_ADDR_8, IP4_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 %s", IP6_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 via %s %s", IP6_ADDR_7, IP6_ADDR_6, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth5 via %s %s", IP6_ADDR_8, IP6_ADDR_6, vrf); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int configure_ns2(const char *ns2, const char *vrf) +{ + struct nstoken *nstoken = NULL; + + if (!ns2 || !vrf) + goto fail; + + nstoken = open_netns(ns2); + if (!ASSERT_OK_PTR(nstoken, "open ns2")) + goto fail; + + SYS(fail, "ip netns exec %s sysctl -wq net.ipv4.ip_forward=1", ns2); + SYS(fail, "ip netns exec %s sysctl -wq net.ipv6.conf.all.forwarding=1", ns2); + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth2 %s", IP4_ADDR_1, vrf); + SYS(fail, "ip route add %s/32 dev veth3 %s", IP4_ADDR_4, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth2 %s", IP6_ADDR_1, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth3 %s", IP6_ADDR_4, vrf); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth6 %s", IP4_ADDR_5, vrf); + SYS(fail, "ip route add %s/32 dev veth7 %s", IP4_ADDR_8, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth6 %s", IP6_ADDR_5, vrf); + SYS(fail, "ip -6 route add %s/128 dev veth7 %s", IP6_ADDR_8, vrf); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int configure_ns3(const char *ns3) +{ + struct nstoken *nstoken = NULL; + + if (!ns3) + goto fail; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3")) + goto fail; + + /* Top route */ + SYS(fail, "ip route add %s/32 dev veth4", IP4_ADDR_3); + SYS(fail, "ip route add %s/32 dev veth4 via %s", IP4_ADDR_1, IP4_ADDR_3); + SYS(fail, "ip route add %s/32 dev veth4 via %s", IP4_ADDR_2, IP4_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4", IP6_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4 via %s", IP6_ADDR_1, IP6_ADDR_3); + SYS(fail, "ip -6 route add %s/128 dev veth4 via %s", IP6_ADDR_2, IP6_ADDR_3); + /* Bottom route */ + SYS(fail, "ip route add %s/32 dev veth8", IP4_ADDR_7); + SYS(fail, "ip route add %s/32 dev veth8 via %s", IP4_ADDR_5, IP4_ADDR_7); + SYS(fail, "ip route add %s/32 dev veth8 via %s", IP4_ADDR_6, IP4_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8", IP6_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8 via %s", IP6_ADDR_5, IP6_ADDR_7); + SYS(fail, "ip -6 route add %s/128 dev veth8 via %s", IP6_ADDR_6, IP6_ADDR_7); + + /* Configure IPv4 GRE device */ + SYS(fail, "ip tunnel add gre_dev mode gre remote %s local %s ttl 255", + IP4_ADDR_1, IP4_ADDR_GRE); + SYS(fail, "ip link set gre_dev up"); + SYS(fail, "ip a add %s dev gre_dev", IP4_ADDR_GRE); + + /* Configure IPv6 GRE device */ + SYS(fail, "ip tunnel add gre6_dev mode ip6gre remote %s local %s ttl 255", + IP6_ADDR_1, IP6_ADDR_GRE); + SYS(fail, "ip link set gre6_dev up"); + SYS(fail, "ip a add %s dev gre6_dev", IP6_ADDR_GRE); + + close_netns(nstoken); + return 0; +fail: + close_netns(nstoken); + return -1; +} + +static int setup_network(char *ns1, char *ns2, char *ns3, const char *vrf) +{ + if (!ns1 || !ns2 || !ns3 || !vrf) + goto fail; + + SYS(fail, "ip -n %s link add veth1 type veth peer name veth2 netns %s", ns1, ns2); + SYS(fail, "ip -n %s link add veth3 type veth peer name veth4 netns %s", ns2, ns3); + SYS(fail, "ip -n %s link add veth5 type veth peer name veth6 netns %s", ns1, ns2); + SYS(fail, "ip -n %s link add veth7 type veth peer name veth8 netns %s", ns2, ns3); + + if (vrf[0]) { + if (!ASSERT_OK(configure_vrf(ns1, ns2), "configure vrf")) + goto fail; + } + if (!ASSERT_OK(set_top_addr(ns1, ns2, ns3), "set top addresses")) + goto fail; + + if (!ASSERT_OK(set_bottom_addr(ns1, ns2, ns3), "set bottom addresses")) + goto fail; + + if (!ASSERT_OK(configure_ns1(ns1, vrf), "configure ns1 routes")) + goto fail; + + if (!ASSERT_OK(configure_ns2(ns2, vrf), "configure ns2 routes")) + goto fail; + + if (!ASSERT_OK(configure_ns3(ns3), "configure ns3 routes")) + goto fail; + + /* Link bottom route to the GRE tunnels */ + SYS(fail, "ip -n %s route add %s/32 dev veth5 via %s %s", + ns1, IP4_ADDR_GRE, IP4_ADDR_6, vrf); + SYS(fail, "ip -n %s route add %s/32 dev veth7 via %s %s", + ns2, IP4_ADDR_GRE, IP4_ADDR_8, vrf); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth5 via %s %s", + ns1, IP6_ADDR_GRE, IP6_ADDR_6, vrf); + SYS(fail, "ip -n %s -6 route add %s/128 dev veth7 via %s %s", + ns2, IP6_ADDR_GRE, IP6_ADDR_8, vrf); + + return 0; +fail: + return -1; +} + +static int remove_routes_to_gredev(const char *ns1, const char *ns2, const char *vrf) +{ + SYS(fail, "ip -n %s route del %s dev veth5 %s", ns1, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s route del %s dev veth7 %s", ns2, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route del %s/128 dev veth5 %s", ns1, IP6_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route del %s/128 dev veth7 %s", ns2, IP6_ADDR_GRE, vrf); + + return 0; +fail: + return -1; +} + +static int add_unreachable_routes_to_gredev(const char *ns1, const char *ns2, const char *vrf) +{ + SYS(fail, "ip -n %s route add unreachable %s/32 %s", ns1, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s route add unreachable %s/32 %s", ns2, IP4_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route add unreachable %s/128 %s", ns1, IP6_ADDR_GRE, vrf); + SYS(fail, "ip -n %s -6 route add unreachable %s/128 %s", ns2, IP6_ADDR_GRE, vrf); + + return 0; +fail: + return -1; +} + +#define GSO_SIZE 5000 +#define GSO_TCP_PORT 9000 +/* This tests the fix from commit ea0371f78799 ("net: fix GSO in bpf_lwt_push_ip_encap") */ +static int test_gso_fix(const char *ns1, const char *ns3, int family) +{ + const char *ip_addr = family == AF_INET ? IP4_ADDR_DST : IP6_ADDR_DST; + char gso_packet[GSO_SIZE] = {}; + struct nstoken *nstoken = NULL; + int sfd, cfd, afd; + ssize_t bytes; + int ret = -1; + + if (!ns1 || !ns3) + return ret; + + nstoken = open_netns(ns3); + if (!ASSERT_OK_PTR(nstoken, "open ns3")) + return ret; + + sfd = start_server_str(family, SOCK_STREAM, ip_addr, GSO_TCP_PORT, NULL); + if (!ASSERT_OK_FD(sfd, "start server")) + goto close_netns; + + close_netns(nstoken); + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto close_server; + + cfd = connect_to_addr_str(family, SOCK_STREAM, ip_addr, GSO_TCP_PORT, NULL); + if (!ASSERT_OK_FD(cfd, "connect to server")) + goto close_server; + + close_netns(nstoken); + nstoken = NULL; + + afd = accept(sfd, NULL, NULL); + if (!ASSERT_OK_FD(afd, "accept")) + goto close_client; + + /* Send a packet larger than MTU */ + bytes = send(cfd, gso_packet, GSO_SIZE, 0); + if (!ASSERT_EQ(bytes, GSO_SIZE, "send packet")) + goto close_accept; + + /* Verify we received all expected bytes */ + bytes = read(afd, gso_packet, GSO_SIZE); + if (!ASSERT_EQ(bytes, GSO_SIZE, "receive packet")) + goto close_accept; + + ret = 0; + +close_accept: + close(afd); +close_client: + close(cfd); +close_server: + close(sfd); +close_netns: + close_netns(nstoken); + + return ret; +} + +static int check_ping_ok(const char *ns1) +{ + SYS(fail, "ip netns exec %s ping -c 1 -W1 -I veth1 %s > /dev/null", ns1, IP4_ADDR_DST); + SYS(fail, "ip netns exec %s ping6 -c 1 -W1 -I veth1 %s > /dev/null", ns1, IP6_ADDR_DST); + return 0; +fail: + return -1; +} + +static int check_ping_fails(const char *ns1) +{ + int ret; + + ret = SYS_NOFAIL("ip netns exec %s ping -c 1 -W1 -I veth1 %s", ns1, IP4_ADDR_DST); + if (!ret) + return -1; + + ret = SYS_NOFAIL("ip netns exec %s ping6 -c 1 -W1 -I veth1 %s", ns1, IP6_ADDR_DST); + if (!ret) + return -1; + + return 0; +} + +#define EGRESS true +#define INGRESS false +#define IPV4_ENCAP true +#define IPV6_ENCAP false +static void lwt_ip_encap(bool ipv4_encap, bool egress, const char *vrf) +{ + char ns1[NETNS_NAME_SIZE] = NETNS_BASE "-1-"; + char ns2[NETNS_NAME_SIZE] = NETNS_BASE "-2-"; + char ns3[NETNS_NAME_SIZE] = NETNS_BASE "-3-"; + char *sec = ipv4_encap ? "encap_gre" : "encap_gre6"; + + if (!vrf) + return; + + if (!ASSERT_OK(create_ns(ns1, NETNS_NAME_SIZE), "create ns1")) + goto out; + if (!ASSERT_OK(create_ns(ns2, NETNS_NAME_SIZE), "create ns2")) + goto out; + if (!ASSERT_OK(create_ns(ns3, NETNS_NAME_SIZE), "create ns3")) + goto out; + + if (!ASSERT_OK(setup_network(ns1, ns2, ns3, vrf), "setup network")) + goto out; + + /* By default, pings work */ + if (!ASSERT_OK(check_ping_ok(ns1), "ping OK")) + goto out; + + /* Remove NS2->DST routes, ping fails */ + SYS(out, "ip -n %s route del %s/32 dev veth3 %s", ns2, IP4_ADDR_DST, vrf); + SYS(out, "ip -n %s -6 route del %s/128 dev veth3 %s", ns2, IP6_ADDR_DST, vrf); + if (!ASSERT_OK(check_ping_fails(ns1), "ping expected fail")) + goto out; + + /* Install replacement routes (LWT/eBPF), pings succeed */ + if (egress) { + SYS(out, "ip -n %s route add %s encap bpf xmit obj %s sec %s dev veth1 %s", + ns1, IP4_ADDR_DST, BPF_FILE, sec, vrf); + SYS(out, "ip -n %s -6 route add %s encap bpf xmit obj %s sec %s dev veth1 %s", + ns1, IP6_ADDR_DST, BPF_FILE, sec, vrf); + } else { + SYS(out, "ip -n %s route add %s encap bpf in obj %s sec %s dev veth2 %s", + ns2, IP4_ADDR_DST, BPF_FILE, sec, vrf); + SYS(out, "ip -n %s -6 route add %s encap bpf in obj %s sec %s dev veth2 %s", + ns2, IP6_ADDR_DST, BPF_FILE, sec, vrf); + } + + if (!ASSERT_OK(check_ping_ok(ns1), "ping OK")) + goto out; + + /* Skip GSO tests with VRF: VRF routing needs properly assigned + * source IP/device, which is easy to do with ping but hard with TCP. + */ + if (egress && !vrf[0]) { + if (!ASSERT_OK(test_gso_fix(ns1, ns3, AF_INET), "test GSO")) + goto out; + } + + /* Negative test: remove routes to GRE devices: ping fails */ + if (!ASSERT_OK(remove_routes_to_gredev(ns1, ns2, vrf), "remove routes to gredev")) + goto out; + if (!ASSERT_OK(check_ping_fails(ns1), "ping expected fail")) + goto out; + + /* Another negative test */ + if (!ASSERT_OK(add_unreachable_routes_to_gredev(ns1, ns2, vrf), + "add unreachable routes")) + goto out; + ASSERT_OK(check_ping_fails(ns1), "ping expected fail"); + +out: + SYS_NOFAIL("ip netns del %s", ns1); + SYS_NOFAIL("ip netns del %s", ns2); + SYS_NOFAIL("ip netns del %s", ns3); +} + +void test_lwt_ip_encap_vrf_ipv6(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV6_ENCAP, EGRESS, "vrf red"); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV6_ENCAP, INGRESS, "vrf red"); +} + +void test_lwt_ip_encap_vrf_ipv4(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV4_ENCAP, EGRESS, "vrf red"); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV4_ENCAP, INGRESS, "vrf red"); +} + +void test_lwt_ip_encap_ipv6(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV6_ENCAP, EGRESS, ""); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV6_ENCAP, INGRESS, ""); +} + +void test_lwt_ip_encap_ipv4(void) +{ + if (test__start_subtest("egress")) + lwt_ip_encap(IPV4_ENCAP, EGRESS, ""); + + if (test__start_subtest("ingress")) + lwt_ip_encap(IPV4_ENCAP, INGRESS, ""); +} diff --git a/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c new file mode 100644 index 0000000000000..3bc730b7c7fa0 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/lwt_seg6local.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* Connects 6 network namespaces through veths. + * Each NS may have different IPv6 global scope addresses : + * + * NS1 NS2 NS3 NS4 NS5 NS6 + * lo veth1 <-> veth2 veth3 <-> veth4 veth5 <-> veth6 lo veth7 <-> veth8 veth9 <-> veth10 lo + * fb00 ::1 ::12 ::21 ::34 ::43 ::56 ::65 ::78 ::87 ::910 ::109 ::6 + * fd00 ::4 + * fc42 ::1 + * + * All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a + * IPv6 header with a Segment Routing Header, with segments : + * fd00::1 -> fd00::2 -> fd00::3 -> fd00::4 + * + * 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions : + * - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1 + * - fd00::2 : remove the TLV, change the flags, add a tag + * - fd00::3 : apply an End.T action to fd00::4, through routing table 117 + * + * fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet. + * Each End.BPF action will validate the operations applied on the SRH by the + * previous BPF program in the chain, otherwise the packet is dropped. + * + * An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this + * datagram can be read on NS6 when binding to fb00::6. + */ + +#include "network_helpers.h" +#include "test_progs.h" + +#define NETNS_BASE "lwt-seg6local-" +#define BPF_FILE "test_lwt_seg6local.bpf.o" + +static void cleanup(void) +{ + int ns; + + for (ns = 1; ns < 7; ns++) + SYS_NOFAIL("ip netns del %s%d", NETNS_BASE, ns); +} + +static int setup(void) +{ + int ns; + + for (ns = 1; ns < 7; ns++) + SYS(fail, "ip netns add %s%d", NETNS_BASE, ns); + + SYS(fail, "ip -n %s6 link set dev lo up", NETNS_BASE); + + for (ns = 1; ns < 6; ns++) { + int local_id = ns * 2 - 1; + int peer_id = ns * 2; + int next_ns = ns + 1; + + SYS(fail, "ip -n %s%d link add veth%d type veth peer name veth%d netns %s%d", + NETNS_BASE, ns, local_id, peer_id, NETNS_BASE, next_ns); + + SYS(fail, "ip -n %s%d link set dev veth%d up", NETNS_BASE, ns, local_id); + SYS(fail, "ip -n %s%d link set dev veth%d up", NETNS_BASE, next_ns, peer_id); + + /* All link scope addresses to veths */ + SYS(fail, "ip -n %s%d -6 addr add fb00::%d%d/16 dev veth%d scope link", + NETNS_BASE, ns, local_id, peer_id, local_id); + SYS(fail, "ip -n %s%d -6 addr add fb00::%d%d/16 dev veth%d scope link", + NETNS_BASE, next_ns, peer_id, local_id, peer_id); + } + + + SYS(fail, "ip -n %s5 -6 route add fb00::109 table 117 dev veth9 scope link", NETNS_BASE); + + SYS(fail, "ip -n %s1 -6 addr add fb00::1/16 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s1 -6 route add fb00::6 dev veth1 via fb00::21", NETNS_BASE); + + SYS(fail, "ip -n %s2 -6 route add fb00::6 encap bpf in obj %s sec encap_srh dev veth2", + NETNS_BASE, BPF_FILE); + SYS(fail, "ip -n %s2 -6 route add fd00::1 dev veth3 via fb00::43 scope link", NETNS_BASE); + + SYS(fail, "ip -n %s3 -6 route add fc42::1 dev veth5 via fb00::65", NETNS_BASE); + SYS(fail, + "ip -n %s3 -6 route add fd00::1 encap seg6local action End.BPF endpoint obj %s sec add_egr_x dev veth4", + NETNS_BASE, BPF_FILE); + + SYS(fail, + "ip -n %s4 -6 route add fd00::2 encap seg6local action End.BPF endpoint obj %s sec pop_egr dev veth6", + NETNS_BASE, BPF_FILE); + SYS(fail, "ip -n %s4 -6 addr add fc42::1 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s4 -6 route add fd00::3 dev veth7 via fb00::87", NETNS_BASE); + + SYS(fail, "ip -n %s5 -6 route add fd00::4 table 117 dev veth9 via fb00::109", NETNS_BASE); + SYS(fail, + "ip -n %s5 -6 route add fd00::3 encap seg6local action End.BPF endpoint obj %s sec inspect_t dev veth8", + NETNS_BASE, BPF_FILE); + + SYS(fail, "ip -n %s6 -6 addr add fb00::6/16 dev lo", NETNS_BASE); + SYS(fail, "ip -n %s6 -6 addr add fd00::4/16 dev lo", NETNS_BASE); + + for (ns = 1; ns < 6; ns++) + SYS(fail, "ip netns exec %s%d sysctl -wq net.ipv6.conf.all.forwarding=1", + NETNS_BASE, ns); + + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.all.seg6_enabled=1", NETNS_BASE); + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.lo.seg6_enabled=1", NETNS_BASE); + SYS(fail, "ip netns exec %s6 sysctl -wq net.ipv6.conf.veth10.seg6_enabled=1", NETNS_BASE); + + return 0; +fail: + return -1; +} + +#define SERVER_PORT 7330 +#define CLIENT_PORT 2121 +void test_lwt_seg6local(void) +{ + struct sockaddr_in6 server_addr = {}; + const char *ns1 = NETNS_BASE "1"; + const char *ns6 = NETNS_BASE "6"; + struct nstoken *nstoken = NULL; + const char *foobar = "foobar"; + ssize_t bytes; + int sfd, cfd; + char buf[7]; + + if (!ASSERT_OK(setup(), "setup")) + goto out; + + nstoken = open_netns(ns6); + if (!ASSERT_OK_PTR(nstoken, "open ns6")) + goto out; + + sfd = start_server_str(AF_INET6, SOCK_DGRAM, "fb00::6", SERVER_PORT, NULL); + if (!ASSERT_OK_FD(sfd, "start server")) + goto close_netns; + + close_netns(nstoken); + + nstoken = open_netns(ns1); + if (!ASSERT_OK_PTR(nstoken, "open ns1")) + goto close_server; + + cfd = start_server_str(AF_INET6, SOCK_DGRAM, "fb00::1", CLIENT_PORT, NULL); + if (!ASSERT_OK_FD(cfd, "start client")) + goto close_server; + + close_netns(nstoken); + nstoken = NULL; + + /* Send a packet larger than MTU */ + server_addr.sin6_family = AF_INET6; + server_addr.sin6_port = htons(SERVER_PORT); + if (!ASSERT_EQ(inet_pton(AF_INET6, "fb00::6", &server_addr.sin6_addr), 1, + "build target addr")) + goto close_client; + + bytes = sendto(cfd, foobar, sizeof(foobar), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + if (!ASSERT_EQ(bytes, sizeof(foobar), "send packet")) + goto close_client; + + /* Verify we received all expected bytes */ + bytes = read(sfd, buf, sizeof(buf)); + if (!ASSERT_EQ(bytes, sizeof(buf), "receive packet")) + goto close_client; + ASSERT_STREQ(buf, foobar, "check udp packet"); + +close_client: + close(cfd); +close_server: + close(sfd); +close_netns: + close_netns(nstoken); + +out: + cleanup(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/prepare.c b/tools/testing/selftests/bpf/prog_tests/prepare.c new file mode 100644 index 0000000000000..fb5cdad971168 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/prepare.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta */ + +#include +#include +#include "prepare.skel.h" + +static bool check_prepared(struct bpf_object *obj) +{ + bool is_prepared = true; + const struct bpf_map *map; + + bpf_object__for_each_map(map, obj) { + if (bpf_map__fd(map) < 0) + is_prepared = false; + } + + return is_prepared; +} + +static void test_prepare_no_load(void) +{ + struct prepare *skel; + int err; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + ); + + skel = prepare__open(); + if (!ASSERT_OK_PTR(skel, "prepare__open")) + return; + + if (!ASSERT_FALSE(check_prepared(skel->obj), "not check_prepared")) + goto cleanup; + + err = bpf_object__prepare(skel->obj); + + if (!ASSERT_TRUE(check_prepared(skel->obj), "check_prepared")) + goto cleanup; + + if (!ASSERT_OK(err, "bpf_object__prepare")) + goto cleanup; + +cleanup: + prepare__destroy(skel); +} + +static void test_prepare_load(void) +{ + struct prepare *skel; + int err, prog_fd; + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + ); + + skel = prepare__open(); + if (!ASSERT_OK_PTR(skel, "prepare__open")) + return; + + if (!ASSERT_FALSE(check_prepared(skel->obj), "not check_prepared")) + goto cleanup; + + err = bpf_object__prepare(skel->obj); + if (!ASSERT_OK(err, "bpf_object__prepare")) + goto cleanup; + + err = prepare__load(skel); + if (!ASSERT_OK(err, "prepare__load")) + goto cleanup; + + if (!ASSERT_TRUE(check_prepared(skel->obj), "check_prepared")) + goto cleanup; + + prog_fd = bpf_program__fd(skel->progs.program); + if (!ASSERT_GE(prog_fd, 0, "prog_fd")) + goto cleanup; + + err = bpf_prog_test_run_opts(prog_fd, &topts); + if (!ASSERT_OK(err, "test_run_opts err")) + goto cleanup; + + if (!ASSERT_OK(topts.retval, "test_run_opts retval")) + goto cleanup; + + ASSERT_EQ(skel->bss->err, 0, "err"); + +cleanup: + prepare__destroy(skel); +} + +void test_prepare(void) +{ + if (test__start_subtest("prepare_load")) + test_prepare_load(); + if (test__start_subtest("prepare_no_load")) + test_prepare_no_load(); +} diff --git a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c index ebe0c12b55363..c9f855e5da240 100644 --- a/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/rcu_read_lock.c @@ -81,6 +81,9 @@ static const char * const inproper_region_tests[] = { "nested_rcu_region", "rcu_read_lock_global_subprog_lock", "rcu_read_lock_global_subprog_unlock", + "rcu_read_lock_sleepable_helper_global_subprog", + "rcu_read_lock_sleepable_kfunc_global_subprog", + "rcu_read_lock_sleepable_global_subprog_indirect", }; static void test_inproper_region(void) diff --git a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c index d56e18b255280..cf554e8fdf791 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_iter_batch.c @@ -6,15 +6,275 @@ #include "sock_iter_batch.skel.h" #define TEST_NS "sock_iter_batch_netns" +#define nr_soreuse 4 -static const int nr_soreuse = 4; +static const __u16 reuse_port = 10001; + +struct iter_out { + int idx; + __u64 cookie; +} __packed; + +struct sock_count { + __u64 cookie; + int count; +}; + +static int insert(__u64 cookie, struct sock_count counts[], int counts_len) +{ + int insert = -1; + int i = 0; + + for (; i < counts_len; i++) { + if (!counts[i].cookie) { + insert = i; + } else if (counts[i].cookie == cookie) { + insert = i; + break; + } + } + if (insert < 0) + return insert; + + counts[insert].cookie = cookie; + counts[insert].count++; + + return counts[insert].count; +} + +static int read_n(int iter_fd, int n, struct sock_count counts[], + int counts_len) +{ + struct iter_out out; + int nread = 1; + int i = 0; + + for (; nread > 0 && (n < 0 || i < n); i++) { + nread = read(iter_fd, &out, sizeof(out)); + if (!nread || !ASSERT_GE(nread, 1, "nread")) + break; + ASSERT_GE(insert(out.cookie, counts, counts_len), 0, "insert"); + } + + ASSERT_TRUE(n < 0 || i == n, "n < 0 || i == n"); + + return i; +} + +static __u64 socket_cookie(int fd) +{ + __u64 cookie; + socklen_t cookie_len = sizeof(cookie); + static __u32 duration; /* for CHECK macro */ + + if (CHECK(getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len) < 0, + "getsockopt(SO_COOKIE)", "%s\n", strerror(errno))) + return 0; + return cookie; +} + +static bool was_seen(int fd, struct sock_count counts[], int counts_len) +{ + __u64 cookie = socket_cookie(fd); + int i = 0; + + for (; cookie && i < counts_len; i++) + if (cookie == counts[i].cookie) + return true; + + return false; +} + +static int get_seen_socket(int *fds, struct sock_count counts[], int n) +{ + int i = 0; + + for (; i < n; i++) + if (was_seen(fds[i], counts, n)) + return i; + return -1; +} + +static int get_seen_count(int fd, struct sock_count counts[], int n) +{ + __u64 cookie = socket_cookie(fd); + int count = 0; + int i = 0; + + for (; cookie && !count && i < n; i++) + if (cookie == counts[i].cookie) + count = counts[i].count; + + return count; +} + +static void check_n_were_seen_once(int *fds, int fds_len, int n, + struct sock_count counts[], int counts_len) +{ + int seen_once = 0; + int seen_cnt; + int i = 0; + + for (; i < fds_len; i++) { + /* Skip any sockets that were closed or that weren't seen + * exactly once. + */ + if (fds[i] < 0) + continue; + seen_cnt = get_seen_count(fds[i], counts, counts_len); + if (seen_cnt && ASSERT_EQ(seen_cnt, 1, "seen_cnt")) + seen_once++; + } + + ASSERT_EQ(seen_once, n, "seen_once"); +} + +static void do_skip_test(int sock_type) +{ + struct sock_count counts[nr_soreuse] = {}; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int err, iter_fd = -1; + int close_idx; + int *fds; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare a bucket of sockets in the kernel hashtable */ + int local_port; + + fds = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", 0, 0, + nr_soreuse); + if (!ASSERT_OK_PTR(fds, "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[0] = ntohs(local_port); + skel->rodata->sf = AF_INET; + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Iterate through the first three sockets. */ + read_n(iter_fd, nr_soreuse - 1, counts, nr_soreuse); + + /* Make sure we saw three sockets from fds exactly once. */ + check_n_were_seen_once(fds, nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); + + /* Close a socket we've already seen to remove it from the bucket. */ + close_idx = get_seen_socket(fds, counts, nr_soreuse); + if (!ASSERT_GE(close_idx, 0, "close_idx")) + goto done; + close(fds[close_idx]); + fds[close_idx] = -1; + + /* Iterate through the rest of the sockets. */ + read_n(iter_fd, -1, counts, nr_soreuse); + + /* Make sure the last socket wasn't skipped and that there were no + * repeats. + */ + check_n_were_seen_once(fds, nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); +done: + free_fds(fds, nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} + +static void do_repeat_test(int sock_type) +{ + struct sock_count counts[nr_soreuse] = {}; + struct bpf_link *link = NULL; + struct sock_iter_batch *skel; + int err, i, iter_fd = -1; + int *fds[2] = {}; + + skel = sock_iter_batch__open(); + if (!ASSERT_OK_PTR(skel, "sock_iter_batch__open")) + return; + + /* Prepare a bucket of sockets in the kernel hashtable */ + int local_port; + + fds[0] = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", + reuse_port, 0, nr_soreuse); + if (!ASSERT_OK_PTR(fds[0], "start_reuseport_server")) + goto done; + local_port = get_socket_local_port(*fds[0]); + if (!ASSERT_GE(local_port, 0, "get_socket_local_port")) + goto done; + skel->rodata->ports[0] = ntohs(local_port); + skel->rodata->sf = AF_INET; + + err = sock_iter_batch__load(skel); + if (!ASSERT_OK(err, "sock_iter_batch__load")) + goto done; + + link = bpf_program__attach_iter(sock_type == SOCK_STREAM ? + skel->progs.iter_tcp_soreuse : + skel->progs.iter_udp_soreuse, + NULL); + if (!ASSERT_OK_PTR(link, "bpf_program__attach_iter")) + goto done; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (!ASSERT_GE(iter_fd, 0, "bpf_iter_create")) + goto done; + + /* Iterate through the first three sockets */ + read_n(iter_fd, nr_soreuse - 1, counts, nr_soreuse); + + /* Make sure we saw three sockets from fds exactly once. */ + check_n_were_seen_once(fds[0], nr_soreuse, nr_soreuse - 1, counts, + nr_soreuse); + + /* Add nr_soreuse more sockets to the bucket. */ + fds[1] = start_reuseport_server(AF_INET, sock_type, "127.0.0.1", + reuse_port, 0, nr_soreuse); + if (!ASSERT_OK_PTR(fds[1], "start_reuseport_server")) + goto done; + + /* Iterate through the rest of the sockets. */ + read_n(iter_fd, -1, counts, nr_soreuse); + + /* Make sure each socket from the first set was seen exactly once. */ + check_n_were_seen_once(fds[0], nr_soreuse, nr_soreuse, counts, + nr_soreuse); +done: + for (i = 0; i < ARRAY_SIZE(fds); i++) + free_fds(fds[i], nr_soreuse); + if (iter_fd < 0) + close(iter_fd); + bpf_link__destroy(link); + sock_iter_batch__destroy(skel); +} static void do_test(int sock_type, bool onebyone) { int err, i, nread, to_read, total_read, iter_fd = -1; - int first_idx, second_idx, indices[nr_soreuse]; + struct iter_out outputs[nr_soreuse]; struct bpf_link *link = NULL; struct sock_iter_batch *skel; + int first_idx, second_idx; int *fds[2] = {}; skel = sock_iter_batch__open(); @@ -34,6 +294,7 @@ static void do_test(int sock_type, bool onebyone) goto done; skel->rodata->ports[i] = ntohs(local_port); } + skel->rodata->sf = AF_INET6; err = sock_iter_batch__load(skel); if (!ASSERT_OK(err, "sock_iter_batch__load")) @@ -55,38 +316,38 @@ static void do_test(int sock_type, bool onebyone) * from a bucket and leave one socket out from * that bucket on purpose. */ - to_read = (nr_soreuse - 1) * sizeof(*indices); + to_read = (nr_soreuse - 1) * sizeof(*outputs); total_read = 0; first_idx = -1; do { - nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); - if (nread <= 0 || nread % sizeof(*indices)) + nread = read(iter_fd, outputs, onebyone ? sizeof(*outputs) : to_read); + if (nread <= 0 || nread % sizeof(*outputs)) break; total_read += nread; if (first_idx == -1) - first_idx = indices[0]; - for (i = 0; i < nread / sizeof(*indices); i++) - ASSERT_EQ(indices[i], first_idx, "first_idx"); + first_idx = outputs[0].idx; + for (i = 0; i < nread / sizeof(*outputs); i++) + ASSERT_EQ(outputs[i].idx, first_idx, "first_idx"); } while (total_read < to_read); - ASSERT_EQ(nread, onebyone ? sizeof(*indices) : to_read, "nread"); + ASSERT_EQ(nread, onebyone ? sizeof(*outputs) : to_read, "nread"); ASSERT_EQ(total_read, to_read, "total_read"); free_fds(fds[first_idx], nr_soreuse); fds[first_idx] = NULL; /* Read the "whole" second bucket */ - to_read = nr_soreuse * sizeof(*indices); + to_read = nr_soreuse * sizeof(*outputs); total_read = 0; second_idx = !first_idx; do { - nread = read(iter_fd, indices, onebyone ? sizeof(*indices) : to_read); - if (nread <= 0 || nread % sizeof(*indices)) + nread = read(iter_fd, outputs, onebyone ? sizeof(*outputs) : to_read); + if (nread <= 0 || nread % sizeof(*outputs)) break; total_read += nread; - for (i = 0; i < nread / sizeof(*indices); i++) - ASSERT_EQ(indices[i], second_idx, "second_idx"); + for (i = 0; i < nread / sizeof(*outputs); i++) + ASSERT_EQ(outputs[i].idx, second_idx, "second_idx"); } while (total_read <= to_read); ASSERT_EQ(nread, 0, "nread"); /* Both so_reuseport ports should be in different buckets, so @@ -123,10 +384,14 @@ void test_sock_iter_batch(void) if (test__start_subtest("tcp")) { do_test(SOCK_STREAM, true); do_test(SOCK_STREAM, false); + do_skip_test(SOCK_STREAM); + do_repeat_test(SOCK_STREAM); } if (test__start_subtest("udp")) { do_test(SOCK_DGRAM, true); do_test(SOCK_DGRAM, false); + do_skip_test(SOCK_DGRAM); + do_repeat_test(SOCK_DGRAM); } close_netns(nstoken); diff --git a/tools/testing/selftests/bpf/prog_tests/spin_lock.c b/tools/testing/selftests/bpf/prog_tests/spin_lock.c index 2b0068742ef9f..e3ea5dc2f697c 100644 --- a/tools/testing/selftests/bpf/prog_tests/spin_lock.c +++ b/tools/testing/selftests/bpf/prog_tests/spin_lock.c @@ -50,6 +50,9 @@ static struct { { "lock_id_mismatch_innermapval_mapval", "bpf_spin_unlock of different lock" }, { "lock_global_subprog_call1", "global function calls are not allowed while holding a lock" }, { "lock_global_subprog_call2", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_helper_subprog", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_kfunc_subprog", "global function calls are not allowed while holding a lock" }, + { "lock_global_sleepable_subprog_indirect", "global function calls are not allowed while holding a lock" }, }; static int match_regex(const char *pattern, const char *string) diff --git a/tools/testing/selftests/bpf/prog_tests/summarization.c b/tools/testing/selftests/bpf/prog_tests/summarization.c new file mode 100644 index 0000000000000..5dd6c120a8386 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/summarization.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bpf/libbpf.h" +#include "summarization_freplace.skel.h" +#include "summarization.skel.h" +#include + +static void print_verifier_log(const char *log) +{ + if (env.verbosity >= VERBOSE_VERY) + fprintf(stdout, "VERIFIER LOG:\n=============\n%s=============\n", log); +} + +static void test_aux(const char *main_prog_name, + const char *to_be_replaced, + const char *replacement, + bool expect_load, + const char *err_msg) +{ + struct summarization_freplace *freplace = NULL; + struct bpf_program *freplace_prog = NULL; + struct bpf_program *main_prog = NULL; + LIBBPF_OPTS(bpf_object_open_opts, opts); + struct summarization *main = NULL; + char log[16*1024]; + int err; + + opts.kernel_log_buf = log; + opts.kernel_log_size = sizeof(log); + if (env.verbosity >= VERBOSE_SUPER) + opts.kernel_log_level = 1 | 2 | 4; + main = summarization__open_opts(&opts); + if (!ASSERT_OK_PTR(main, "summarization__open")) + goto out; + main_prog = bpf_object__find_program_by_name(main->obj, main_prog_name); + if (!ASSERT_OK_PTR(main_prog, "main_prog")) + goto out; + bpf_program__set_autoload(main_prog, true); + err = summarization__load(main); + print_verifier_log(log); + if (!ASSERT_OK(err, "summarization__load")) + goto out; + freplace = summarization_freplace__open_opts(&opts); + if (!ASSERT_OK_PTR(freplace, "summarization_freplace__open")) + goto out; + freplace_prog = bpf_object__find_program_by_name(freplace->obj, replacement); + if (!ASSERT_OK_PTR(freplace_prog, "freplace_prog")) + goto out; + bpf_program__set_autoload(freplace_prog, true); + bpf_program__set_autoattach(freplace_prog, true); + bpf_program__set_attach_target(freplace_prog, + bpf_program__fd(main_prog), + to_be_replaced); + err = summarization_freplace__load(freplace); + print_verifier_log(log); + + /* The might_sleep extension doesn't work yet as sleepable calls are not + * allowed, but preserve the check in case it's supported later and then + * this particular combination can be enabled. + */ + if (!strcmp("might_sleep", replacement) && err) { + ASSERT_HAS_SUBSTR(log, "helper call might sleep in a non-sleepable prog", "error log"); + ASSERT_EQ(err, -EINVAL, "err"); + test__skip(); + goto out; + } + + if (expect_load) { + ASSERT_OK(err, "summarization_freplace__load"); + } else { + ASSERT_ERR(err, "summarization_freplace__load"); + ASSERT_HAS_SUBSTR(log, err_msg, "error log"); + } + +out: + summarization_freplace__destroy(freplace); + summarization__destroy(main); +} + +/* There are two global subprograms in both summarization.skel.h: + * - one changes packet data; + * - another does not. + * It is ok to freplace subprograms that change packet data with those + * that either do or do not. It is only ok to freplace subprograms + * that do not change packet data with those that do not as well. + * The below tests check outcomes for each combination of such freplace. + * Also test a case when main subprogram itself is replaced and is a single + * subprogram in a program. + * + * This holds for might_sleep programs. It is ok to replace might_sleep with + * might_sleep and with does_not_sleep, but does_not_sleep cannot be replaced + * with might_sleep. + */ +void test_summarization_freplace(void) +{ + struct { + const char *main; + const char *to_be_replaced; + bool has_side_effect; + } mains[2][4] = { + { + { "main_changes_with_subprogs", "changes_pkt_data", true }, + { "main_changes_with_subprogs", "does_not_change_pkt_data", false }, + { "main_changes", "main_changes", true }, + { "main_does_not_change", "main_does_not_change", false }, + }, + { + { "main_might_sleep_with_subprogs", "might_sleep", true }, + { "main_might_sleep_with_subprogs", "does_not_sleep", false }, + { "main_might_sleep", "main_might_sleep", true }, + { "main_does_not_sleep", "main_does_not_sleep", false }, + }, + }; + const char *pkt_err = "Extension program changes packet data"; + const char *slp_err = "Extension program may sleep"; + struct { + const char *func; + bool has_side_effect; + const char *err_msg; + } replacements[2][2] = { + { + { "changes_pkt_data", true, pkt_err }, + { "does_not_change_pkt_data", false, pkt_err }, + }, + { + { "might_sleep", true, slp_err }, + { "does_not_sleep", false, slp_err }, + }, + }; + char buf[64]; + + for (int t = 0; t < 2; t++) { + for (int i = 0; i < ARRAY_SIZE(mains); ++i) { + for (int j = 0; j < ARRAY_SIZE(replacements); ++j) { + snprintf(buf, sizeof(buf), "%s_with_%s", + mains[t][i].to_be_replaced, replacements[t][j].func); + if (!test__start_subtest(buf)) + continue; + test_aux(mains[t][i].main, mains[t][i].to_be_replaced, replacements[t][j].func, + mains[t][i].has_side_effect || !replacements[t][j].has_side_effect, + replacements[t][j].err_msg); + } + } + } +} diff --git a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c index cec746e77cd3a..bae0e9de277d2 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_tunnel.c +++ b/tools/testing/selftests/bpf/prog_tests/test_tunnel.c @@ -71,6 +71,8 @@ #define IP4_ADDR2_VETH1 "172.16.1.20" #define IP4_ADDR_TUNL_DEV0 "10.1.1.100" #define IP4_ADDR_TUNL_DEV1 "10.1.1.200" +#define IP6_ADDR_TUNL_DEV0 "fc80::100" +#define IP6_ADDR_TUNL_DEV1 "fc80::200" #define IP6_ADDR_VETH0 "::11" #define IP6_ADDR1_VETH1 "::22" @@ -98,6 +100,27 @@ #define XFRM_SPI_IN_TO_OUT 0x1 #define XFRM_SPI_OUT_TO_IN 0x2 +#define GRE_TUNL_DEV0 "gre00" +#define GRE_TUNL_DEV1 "gre11" + +#define IP6GRE_TUNL_DEV0 "ip6gre00" +#define IP6GRE_TUNL_DEV1 "ip6gre11" + +#define ERSPAN_TUNL_DEV0 "erspan00" +#define ERSPAN_TUNL_DEV1 "erspan11" + +#define IP6ERSPAN_TUNL_DEV0 "ip6erspan00" +#define IP6ERSPAN_TUNL_DEV1 "ip6erspan11" + +#define GENEVE_TUNL_DEV0 "geneve00" +#define GENEVE_TUNL_DEV1 "geneve11" + +#define IP6GENEVE_TUNL_DEV0 "ip6geneve00" +#define IP6GENEVE_TUNL_DEV1 "ip6geneve11" + +#define IP6TNL_TUNL_DEV0 "ip6tnl00" +#define IP6TNL_TUNL_DEV1 "ip6tnl11" + #define PING_ARGS "-i 0.01 -c 3 -w 10 -q" static int config_device(void) @@ -216,6 +239,18 @@ static int set_ipip_encap(const char *ipproto, const char *type) return -1; } +static int set_ipv4_addr(const char *dev0, const char *dev1) +{ + SYS(fail, "ip -n at_ns0 link set dev %s up", dev0); + SYS(fail, "ip -n at_ns0 addr add dev %s %s/24", dev0, IP4_ADDR_TUNL_DEV0); + SYS(fail, "ip link set dev %s up", dev1); + SYS(fail, "ip addr add dev %s %s/24", dev1, IP4_ADDR_TUNL_DEV1); + + return 0; +fail: + return 1; +} + static int add_ipip_tunnel(enum ipip_encap encap) { int err; @@ -356,6 +391,99 @@ static void delete_xfrm_tunnel(void) IP4_ADDR1_VETH1, IP4_ADDR_VETH0, XFRM_SPI_OUT_TO_IN); } +static int add_ipv4_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s %s local %s remote %s", + dev0, type, opt, IP4_ADDR_VETH0, IP4_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s external", dev1, type); + + return set_ipv4_addr(dev0, dev1); +fail: + return -1; +} + +static void delete_tunnel(const char *dev0, const char *dev1) +{ + if (!dev0 || !dev1) + return; + + SYS_NOFAIL("ip netns exec at_ns0 ip link delete dev %s", dev0); + SYS_NOFAIL("ip link delete dev %s", dev1); +} + +static int set_ipv6_addr(const char *dev0, const char *dev1) +{ + /* disable IPv6 DAD because it might take too long and fail tests */ + SYS(fail, "ip -n at_ns0 addr add %s/96 dev veth0 nodad", IP6_ADDR_VETH0); + SYS(fail, "ip -n at_ns0 link set dev veth0 up"); + SYS(fail, "ip addr add %s/96 dev veth1 nodad", IP6_ADDR1_VETH1); + SYS(fail, "ip link set dev veth1 up"); + + SYS(fail, "ip -n at_ns0 addr add dev %s %s/24", dev0, IP4_ADDR_TUNL_DEV0); + SYS(fail, "ip -n at_ns0 addr add dev %s %s/96 nodad", dev0, IP6_ADDR_TUNL_DEV0); + SYS(fail, "ip -n at_ns0 link set dev %s up", dev0); + + SYS(fail, "ip addr add dev %s %s/24", dev1, IP4_ADDR_TUNL_DEV1); + SYS(fail, "ip addr add dev %s %s/96 nodad", dev1, IP6_ADDR_TUNL_DEV1); + SYS(fail, "ip link set dev %s up", dev1); + return 0; +fail: + return 1; +} + +static int add_ipv6_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s %s local %s remote %s", + dev0, type, opt, IP6_ADDR_VETH0, IP6_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s external", dev1, type); + + return set_ipv6_addr(dev0, dev1); +fail: + return -1; +} + +static int add_geneve_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s id 2 %s remote %s", + dev0, type, opt, IP4_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s %s external", dev1, type, opt); + + return set_ipv4_addr(dev0, dev1); +fail: + return -1; +} + +static int add_ip6geneve_tunnel(const char *dev0, const char *dev1, + const char *type, const char *opt) +{ + if (!type || !opt || !dev0 || !dev1) + return -1; + + SYS(fail, "ip -n at_ns0 link add dev %s type %s id 22 %s remote %s", + dev0, type, opt, IP6_ADDR1_VETH1); + + SYS(fail, "ip link add dev %s type %s %s external", dev1, type, opt); + + return set_ipv6_addr(dev0, dev1); +fail: + return -1; +} + static int test_ping(int family, const char *addr) { SYS(fail, "%s %s %s > /dev/null", ping_command(family), PING_ARGS, addr); @@ -364,32 +492,76 @@ static int test_ping(int family, const char *addr) return -1; } -static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd) +static void ping_dev0(void) { + /* ping from root namespace test */ + test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); +} + +static void ping_dev1(void) +{ + struct nstoken *nstoken; + + /* ping from at_ns0 namespace test */ + nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + return; + + test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); + close_netns(nstoken); +} + +static void ping6_veth0(void) +{ + test_ping(AF_INET6, IP6_ADDR_VETH0); +} + +static void ping6_dev0(void) +{ + test_ping(AF_INET6, IP6_ADDR_TUNL_DEV0); +} + +static void ping6_dev1(void) +{ + struct nstoken *nstoken; + + /* ping from at_ns0 namespace test */ + nstoken = open_netns("at_ns0"); + if (!ASSERT_OK_PTR(nstoken, "setns")) + return; + + test_ping(AF_INET, IP6_ADDR_TUNL_DEV1); + close_netns(nstoken); +} + +static int attach_tc_prog(int ifindex, int igr_fd, int egr_fd) +{ + DECLARE_LIBBPF_OPTS(bpf_tc_hook, hook, .ifindex = ifindex, + .attach_point = BPF_TC_INGRESS | BPF_TC_EGRESS); DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts1, .handle = 1, .priority = 1, .prog_fd = igr_fd); DECLARE_LIBBPF_OPTS(bpf_tc_opts, opts2, .handle = 1, .priority = 1, .prog_fd = egr_fd); int ret; - ret = bpf_tc_hook_create(hook); + ret = bpf_tc_hook_create(&hook); if (!ASSERT_OK(ret, "create tc hook")) return ret; if (igr_fd >= 0) { - hook->attach_point = BPF_TC_INGRESS; - ret = bpf_tc_attach(hook, &opts1); + hook.attach_point = BPF_TC_INGRESS; + ret = bpf_tc_attach(&hook, &opts1); if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(hook); + bpf_tc_hook_destroy(&hook); return ret; } } if (egr_fd >= 0) { - hook->attach_point = BPF_TC_EGRESS; - ret = bpf_tc_attach(hook, &opts2); + hook.attach_point = BPF_TC_EGRESS; + ret = bpf_tc_attach(&hook, &opts2); if (!ASSERT_OK(ret, "bpf_tc_attach")) { - bpf_tc_hook_destroy(hook); + bpf_tc_hook_destroy(&hook); return ret; } } @@ -397,6 +569,50 @@ static int attach_tc_prog(struct bpf_tc_hook *hook, int igr_fd, int egr_fd) return 0; } +static int generic_attach(const char *dev, int igr_fd, int egr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) + return -1; + if (!ASSERT_OK_FD(egr_fd, "check egress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, igr_fd, egr_fd); +} + +static int generic_attach_igr(const char *dev, int igr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(igr_fd, "check ingress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, igr_fd, -1); +} + +static int generic_attach_egr(const char *dev, int egr_fd) +{ + int ifindex; + + if (!ASSERT_OK_FD(egr_fd, "check egress fd")) + return -1; + + ifindex = if_nametoindex(dev); + if (!ASSERT_NEQ(ifindex, 0, "get ifindex")) + return -1; + + return attach_tc_prog(ifindex, -1, egr_fd); +} + static void test_vxlan_tunnel(void) { struct test_tunnel_kern *skel = NULL; @@ -404,11 +620,9 @@ static void test_vxlan_tunnel(void) int local_ip_map_fd = -1; int set_src_prog_fd, get_src_prog_fd; int set_dst_prog_fd; - int key = 0, ifindex = -1; + int key = 0; uint local_ip; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add vxlan tunnel */ err = add_vxlan_tunnel(); @@ -419,42 +633,22 @@ static void test_vxlan_tunnel(void) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(VXLAN_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "vxlan11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; get_src_prog_fd = bpf_program__fd(skel->progs.vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_src); - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach bpf prog to veth dev tc hook point */ - ifindex = if_nametoindex("veth1"); - if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.veth_set_outer_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, set_dst_prog_fd, -1)) + if (generic_attach_igr("veth1", set_dst_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ nstoken = open_netns("at_ns0"); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; - ifindex = if_nametoindex(VXLAN_TUNL_DEV0); - if (!ASSERT_NEQ(ifindex, 0, "vxlan00 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.vxlan_set_tunnel_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, -1, set_dst_prog_fd)) + if (generic_attach_egr(VXLAN_TUNL_DEV0, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -468,9 +662,7 @@ static void test_vxlan_tunnel(void) goto done; /* ping test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev0(); done: /* delete vxlan tunnel */ @@ -488,11 +680,9 @@ static void test_ip6vxlan_tunnel(void) int local_ip_map_fd = -1; int set_src_prog_fd, get_src_prog_fd; int set_dst_prog_fd; - int key = 0, ifindex = -1; + int key = 0; uint local_ip; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add vxlan tunnel */ err = add_ip6vxlan_tunnel(); @@ -503,31 +693,17 @@ static void test_ip6vxlan_tunnel(void) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(IP6VXLAN_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "ip6vxlan11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; get_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_get_tunnel_src); set_src_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_src); - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(IP6VXLAN_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; /* load and attach prog set_md to tunnel dev tc hook point at_ns0 */ nstoken = open_netns("at_ns0"); if (!ASSERT_OK_PTR(nstoken, "setns src")) goto done; - ifindex = if_nametoindex(IP6VXLAN_TUNL_DEV0); - if (!ASSERT_NEQ(ifindex, 0, "ip6vxlan00 ifindex")) - goto done; - tc_hook.ifindex = ifindex; set_dst_prog_fd = bpf_program__fd(skel->progs.ip6vxlan_set_tunnel_dst); - if (!ASSERT_GE(set_dst_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, -1, set_dst_prog_fd)) + if (generic_attach_egr(IP6VXLAN_TUNL_DEV0, set_dst_prog_fd)) goto done; close_netns(nstoken); @@ -541,9 +717,7 @@ static void test_ip6vxlan_tunnel(void) goto done; /* ping test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev0(); done: /* delete ipv6 vxlan tunnel */ @@ -557,12 +731,8 @@ static void test_ip6vxlan_tunnel(void) static void test_ipip_tunnel(enum ipip_encap encap) { struct test_tunnel_kern *skel = NULL; - struct nstoken *nstoken; int set_src_prog_fd, get_src_prog_fd; - int ifindex = -1; int err; - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); /* add ipip tunnel */ err = add_ipip_tunnel(encap); @@ -573,10 +743,6 @@ static void test_ipip_tunnel(enum ipip_encap encap) skel = test_tunnel_kern__open_and_load(); if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex(IPIP_TUNL_DEV1); - if (!ASSERT_NEQ(ifindex, 0, "ipip11 ifindex")) - goto done; - tc_hook.ifindex = ifindex; switch (encap) { case FOU: @@ -598,26 +764,11 @@ static void test_ipip_tunnel(enum ipip_encap encap) skel->progs.ipip_set_tunnel); } - if (!ASSERT_GE(set_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (!ASSERT_GE(get_src_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, get_src_prog_fd, set_src_prog_fd)) + if (generic_attach(IPIP_TUNL_DEV1, get_src_prog_fd, set_src_prog_fd)) goto done; - /* ping from root namespace test */ - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV0); - if (!ASSERT_OK(err, "test_ping")) - goto done; - - /* ping from at_ns0 namespace test */ - nstoken = open_netns("at_ns0"); - if (!ASSERT_OK_PTR(nstoken, "setns")) - goto done; - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); - if (!ASSERT_OK(err, "test_ping")) - goto done; - close_netns(nstoken); + ping_dev0(); + ping_dev1(); done: /* delete ipip tunnel */ @@ -628,11 +779,8 @@ static void test_ipip_tunnel(enum ipip_encap encap) static void test_xfrm_tunnel(void) { - DECLARE_LIBBPF_OPTS(bpf_tc_hook, tc_hook, - .attach_point = BPF_TC_INGRESS); LIBBPF_OPTS(bpf_xdp_attach_opts, opts); struct test_tunnel_kern *skel = NULL; - struct nstoken *nstoken; int xdp_prog_fd; int tc_prog_fd; int ifindex; @@ -646,19 +794,16 @@ static void test_xfrm_tunnel(void) if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) goto done; - ifindex = if_nametoindex("veth1"); - if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) - goto done; /* attach tc prog to tunnel dev */ - tc_hook.ifindex = ifindex; tc_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state); - if (!ASSERT_GE(tc_prog_fd, 0, "bpf_program__fd")) - goto done; - if (attach_tc_prog(&tc_hook, tc_prog_fd, -1)) + if (generic_attach_igr("veth1", tc_prog_fd)) goto done; /* attach xdp prog to tunnel dev */ + ifindex = if_nametoindex("veth1"); + if (!ASSERT_NEQ(ifindex, 0, "veth1 ifindex")) + goto done; xdp_prog_fd = bpf_program__fd(skel->progs.xfrm_get_state_xdp); if (!ASSERT_GE(xdp_prog_fd, 0, "bpf_program__fd")) goto done; @@ -666,14 +811,7 @@ static void test_xfrm_tunnel(void) if (!ASSERT_OK(err, "bpf_xdp_attach")) goto done; - /* ping from at_ns0 namespace test */ - nstoken = open_netns("at_ns0"); - if (!ASSERT_OK_PTR(nstoken, "setns")) - goto done; - err = test_ping(AF_INET, IP4_ADDR_TUNL_DEV1); - close_netns(nstoken); - if (!ASSERT_OK(err, "test_ping")) - goto done; + ping_dev1(); if (!ASSERT_EQ(skel->bss->xfrm_reqid, 1, "req_id")) goto done; @@ -690,6 +828,281 @@ static void test_xfrm_tunnel(void) test_tunnel_kern__destroy(skel); } +enum gre_test { + GRE, + GRE_NOKEY, + GRETAP, + GRETAP_NOKEY, +}; + +static void test_gre_tunnel(enum gre_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case GRE: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gre", "seq"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel_no_key); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRE_NOKEY: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gre", "seq key 2"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRETAP: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gretap", "seq"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel_no_key); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + case GRETAP_NOKEY: + err = add_ipv4_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1, "gretap", "seq key 2"); + set_fd = bpf_program__fd(skel->progs.gre_set_tunnel); + get_fd = bpf_program__fd(skel->progs.gre_get_tunnel); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + if (generic_attach(GRE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); + +done: + delete_tunnel(GRE_TUNL_DEV0, GRE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +enum ip6gre_test { + IP6GRE, + IP6GRETAP +}; + +static void test_ip6gre_tunnel(enum ip6gre_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case IP6GRE: + err = add_ipv6_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1, + "ip6gre", "flowlabel 0xbcdef key 2"); + break; + case IP6GRETAP: + err = add_ipv6_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1, + "ip6gretap", "flowlabel 0xbcdef key 2"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip6gretap_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6gretap_get_tunnel); + if (generic_attach(IP6GRE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + ping6_dev1(); + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(IP6GRE_TUNL_DEV0, IP6GRE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +enum erspan_test { + V1, + V2 +}; + +static void test_erspan_tunnel(enum erspan_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case V1: + err = add_ipv4_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1, + "erspan", "seq key 2 erspan_ver 1 erspan 123"); + break; + case V2: + err = add_ipv4_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1, + "erspan", + "seq key 2 erspan_ver 2 erspan_dir egress erspan_hwid 3"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.erspan_set_tunnel); + get_fd = bpf_program__fd(skel->progs.erspan_get_tunnel); + if (generic_attach(ERSPAN_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(ERSPAN_TUNL_DEV0, ERSPAN_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +static void test_ip6erspan_tunnel(enum erspan_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + switch (test) { + case V1: + err = add_ipv6_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1, + "ip6erspan", "seq key 2 erspan_ver 1 erspan 123"); + break; + case V2: + err = add_ipv6_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1, + "ip6erspan", + "seq key 2 erspan_ver 2 erspan_dir egress erspan_hwid 7"); + break; + } + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip4ip6erspan_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip4ip6erspan_get_tunnel); + if (generic_attach(IP6ERSPAN_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + ping_dev1(); +done: + delete_tunnel(IP6ERSPAN_TUNL_DEV0, IP6ERSPAN_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +static void test_geneve_tunnel(void) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_geneve_tunnel(GENEVE_TUNL_DEV0, GENEVE_TUNL_DEV1, + "geneve", "dstport 6081"); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.geneve_set_tunnel); + get_fd = bpf_program__fd(skel->progs.geneve_get_tunnel); + if (generic_attach(GENEVE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(GENEVE_TUNL_DEV0, GENEVE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +static void test_ip6geneve_tunnel(void) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_ip6geneve_tunnel(IP6GENEVE_TUNL_DEV0, IP6GENEVE_TUNL_DEV1, + "geneve", ""); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + set_fd = bpf_program__fd(skel->progs.ip6geneve_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6geneve_get_tunnel); + if (generic_attach(IP6GENEVE_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping_dev0(); + ping_dev1(); +done: + delete_tunnel(IP6GENEVE_TUNL_DEV0, IP6GENEVE_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + +enum ip6tnl_test { + IPIP6, + IP6IP6 +}; + +static void test_ip6tnl_tunnel(enum ip6tnl_test test) +{ + struct test_tunnel_kern *skel; + int set_fd, get_fd; + int err; + + skel = test_tunnel_kern__open_and_load(); + if (!ASSERT_OK_PTR(skel, "test_tunnel_kern__open_and_load")) + return; + + err = add_ipv6_tunnel(IP6TNL_TUNL_DEV0, IP6TNL_TUNL_DEV1, "ip6tnl", ""); + if (!ASSERT_OK(err, "add tunnel")) + goto done; + + switch (test) { + case IPIP6: + set_fd = bpf_program__fd(skel->progs.ipip6_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ipip6_get_tunnel); + break; + case IP6IP6: + set_fd = bpf_program__fd(skel->progs.ip6ip6_set_tunnel); + get_fd = bpf_program__fd(skel->progs.ip6ip6_get_tunnel); + break; + } + if (generic_attach(IP6TNL_TUNL_DEV1, get_fd, set_fd)) + goto done; + + ping6_veth0(); + switch (test) { + case IPIP6: + ping_dev0(); + ping_dev1(); + break; + case IP6IP6: + ping6_dev0(); + ping6_dev1(); + break; + } + +done: + delete_tunnel(IP6TNL_TUNL_DEV0, IP6TNL_TUNL_DEV1); + test_tunnel_kern__destroy(skel); +} + #define RUN_TEST(name, ...) \ ({ \ if (test__start_subtest(#name)) { \ @@ -707,6 +1120,20 @@ static void *test_tunnel_run_tests(void *arg) RUN_TEST(ipip_tunnel, FOU); RUN_TEST(ipip_tunnel, GUE); RUN_TEST(xfrm_tunnel); + RUN_TEST(gre_tunnel, GRE); + RUN_TEST(gre_tunnel, GRE_NOKEY); + RUN_TEST(gre_tunnel, GRETAP); + RUN_TEST(gre_tunnel, GRETAP_NOKEY); + RUN_TEST(ip6gre_tunnel, IP6GRE); + RUN_TEST(ip6gre_tunnel, IP6GRETAP); + RUN_TEST(erspan_tunnel, V1); + RUN_TEST(erspan_tunnel, V2); + RUN_TEST(ip6erspan_tunnel, V1); + RUN_TEST(ip6erspan_tunnel, V2); + RUN_TEST(geneve_tunnel); + RUN_TEST(ip6geneve_tunnel); + RUN_TEST(ip6tnl_tunnel, IPIP6); + RUN_TEST(ip6tnl_tunnel, IP6IP6); return NULL; } diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c index 8a0e1ff8a2dc6..e66a57970d28c 100644 --- a/tools/testing/selftests/bpf/prog_tests/verifier.c +++ b/tools/testing/selftests/bpf/prog_tests/verifier.c @@ -45,6 +45,7 @@ #include "verifier_ldsx.skel.h" #include "verifier_leak_ptr.skel.h" #include "verifier_linked_scalars.skel.h" +#include "verifier_load_acquire.skel.h" #include "verifier_loops1.skel.h" #include "verifier_lwt.skel.h" #include "verifier_map_in_map.skel.h" @@ -80,6 +81,7 @@ #include "verifier_spill_fill.skel.h" #include "verifier_spin_lock.skel.h" #include "verifier_stack_ptr.skel.h" +#include "verifier_store_release.skel.h" #include "verifier_subprog_precision.skel.h" #include "verifier_subreg.skel.h" #include "verifier_tailcall_jit.skel.h" @@ -121,7 +123,7 @@ static void run_tests_aux(const char *skel_name, /* test_verifier tests are executed w/o CAP_SYS_ADMIN, do the same here */ err = cap_disable_effective(1ULL << CAP_SYS_ADMIN, &old_caps); if (err) { - PRINT_FAIL("failed to drop CAP_SYS_ADMIN: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to drop CAP_SYS_ADMIN: %i, %s\n", err, strerror(-err)); return; } @@ -131,7 +133,7 @@ static void run_tests_aux(const char *skel_name, err = cap_enable_effective(old_caps, NULL); if (err) - PRINT_FAIL("failed to restore CAP_SYS_ADMIN: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to restore CAP_SYS_ADMIN: %i, %s\n", err, strerror(-err)); } #define RUN(skel) run_tests_aux(#skel, skel##__elf_bytes, NULL) @@ -173,6 +175,7 @@ void test_verifier_int_ptr(void) { RUN(verifier_int_ptr); } void test_verifier_iterating_callbacks(void) { RUN(verifier_iterating_callbacks); } void test_verifier_jeq_infer_not_null(void) { RUN(verifier_jeq_infer_not_null); } void test_verifier_jit_convergence(void) { RUN(verifier_jit_convergence); } +void test_verifier_load_acquire(void) { RUN(verifier_load_acquire); } void test_verifier_ld_ind(void) { RUN(verifier_ld_ind); } void test_verifier_ldsx(void) { RUN(verifier_ldsx); } void test_verifier_leak_ptr(void) { RUN(verifier_leak_ptr); } @@ -211,6 +214,7 @@ void test_verifier_sockmap_mutate(void) { RUN(verifier_sockmap_mutate); } void test_verifier_spill_fill(void) { RUN(verifier_spill_fill); } void test_verifier_spin_lock(void) { RUN(verifier_spin_lock); } void test_verifier_stack_ptr(void) { RUN(verifier_stack_ptr); } +void test_verifier_store_release(void) { RUN(verifier_store_release); } void test_verifier_subprog_precision(void) { RUN(verifier_subprog_precision); } void test_verifier_subreg(void) { RUN(verifier_subreg); } void test_verifier_tailcall_jit(void) { RUN(verifier_tailcall_jit); } diff --git a/tools/testing/selftests/bpf/progs/arena_atomics.c b/tools/testing/selftests/bpf/progs/arena_atomics.c index 40dd57fca5cc2..a52feff981126 100644 --- a/tools/testing/selftests/bpf/progs/arena_atomics.c +++ b/tools/testing/selftests/bpf/progs/arena_atomics.c @@ -6,6 +6,8 @@ #include #include #include "bpf_arena_common.h" +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" struct { __uint(type, BPF_MAP_TYPE_ARENA); @@ -19,9 +21,17 @@ struct { } arena SEC(".maps"); #if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) -bool skip_tests __attribute((__section__(".data"))) = false; +bool skip_all_tests __attribute((__section__(".data"))) = false; #else -bool skip_tests = true; +bool skip_all_tests = true; +#endif + +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +bool skip_lacq_srel_tests __attribute((__section__(".data"))) = false; +#else +bool skip_lacq_srel_tests = true; #endif __u32 pid = 0; @@ -274,4 +284,111 @@ int uaf(const void *ctx) return 0; } +#if __clang_major__ >= 18 +__u8 __arena_global load_acquire8_value = 0x12; +__u16 __arena_global load_acquire16_value = 0x1234; +__u32 __arena_global load_acquire32_value = 0x12345678; +__u64 __arena_global load_acquire64_value = 0x1234567890abcdef; + +__u8 __arena_global load_acquire8_result = 0; +__u16 __arena_global load_acquire16_result = 0; +__u32 __arena_global load_acquire32_result = 0; +__u64 __arena_global load_acquire64_result = 0; +#else +/* clang-17 crashes if the .addr_space.1 ELF section has holes. Work around + * this issue by defining the below variables as 64-bit. + */ +__u64 __arena_global load_acquire8_value; +__u64 __arena_global load_acquire16_value; +__u64 __arena_global load_acquire32_value; +__u64 __arena_global load_acquire64_value; + +__u64 __arena_global load_acquire8_result; +__u64 __arena_global load_acquire16_result; +__u64 __arena_global load_acquire32_result; +__u64 __arena_global load_acquire64_result; +#endif + +SEC("raw_tp/sys_enter") +int load_acquire(const void *ctx) +{ +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +#define LOAD_ACQUIRE_ARENA(SIZEOP, SIZE, SRC, DST) \ + { asm volatile ( \ + "r1 = %[" #SRC "] ll;" \ + "r1 = addr_space_cast(r1, 0x0, 0x1);" \ + ".8byte %[load_acquire_insn];" \ + "r3 = %[" #DST "] ll;" \ + "r3 = addr_space_cast(r3, 0x0, 0x1);" \ + "*(" #SIZE " *)(r3 + 0) = r2;" \ + : \ + : __imm_addr(SRC), \ + __imm_insn(load_acquire_insn, \ + BPF_ATOMIC_OP(BPF_##SIZEOP, BPF_LOAD_ACQ, \ + BPF_REG_2, BPF_REG_1, 0)), \ + __imm_addr(DST) \ + : __clobber_all); } \ + + LOAD_ACQUIRE_ARENA(B, u8, load_acquire8_value, load_acquire8_result) + LOAD_ACQUIRE_ARENA(H, u16, load_acquire16_value, + load_acquire16_result) + LOAD_ACQUIRE_ARENA(W, u32, load_acquire32_value, + load_acquire32_result) + LOAD_ACQUIRE_ARENA(DW, u64, load_acquire64_value, + load_acquire64_result) +#undef LOAD_ACQUIRE_ARENA + +#endif + return 0; +} + +#if __clang_major__ >= 18 +__u8 __arena_global store_release8_result = 0; +__u16 __arena_global store_release16_result = 0; +__u32 __arena_global store_release32_result = 0; +__u64 __arena_global store_release64_result = 0; +#else +/* clang-17 crashes if the .addr_space.1 ELF section has holes. Work around + * this issue by defining the below variables as 64-bit. + */ +__u64 __arena_global store_release8_result; +__u64 __arena_global store_release16_result; +__u64 __arena_global store_release32_result; +__u64 __arena_global store_release64_result; +#endif + +SEC("raw_tp/sys_enter") +int store_release(const void *ctx) +{ +#if defined(ENABLE_ATOMICS_TESTS) && \ + defined(__BPF_FEATURE_ADDR_SPACE_CAST) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +#define STORE_RELEASE_ARENA(SIZEOP, DST, VAL) \ + { asm volatile ( \ + "r1 = " VAL ";" \ + "r2 = %[" #DST "] ll;" \ + "r2 = addr_space_cast(r2, 0x0, 0x1);" \ + ".8byte %[store_release_insn];" \ + : \ + : __imm_addr(DST), \ + __imm_insn(store_release_insn, \ + BPF_ATOMIC_OP(BPF_##SIZEOP, BPF_STORE_REL, \ + BPF_REG_2, BPF_REG_1, 0)) \ + : __clobber_all); } \ + + STORE_RELEASE_ARENA(B, store_release8_result, "0x12") + STORE_RELEASE_ARENA(H, store_release16_result, "0x1234") + STORE_RELEASE_ARENA(W, store_release32_result, "0x12345678") + STORE_RELEASE_ARENA(DW, store_release64_result, + "0x1234567890abcdef ll") +#undef STORE_RELEASE_ARENA + +#endif + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/arena_spin_lock.c b/tools/testing/selftests/bpf/progs/arena_spin_lock.c new file mode 100644 index 0000000000000..c4500c37f85e0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/arena_spin_lock.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include +#include "bpf_misc.h" +#include "bpf_arena_spin_lock.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 100); /* number of pages */ +#ifdef __TARGET_ARCH_arm64 + __ulong(map_extra, 0x1ull << 32); /* start of mmap() region */ +#else + __ulong(map_extra, 0x1ull << 44); /* start of mmap() region */ +#endif +} arena SEC(".maps"); + +int cs_count; + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) +arena_spinlock_t __arena lock; +int test_skip = 1; +#else +int test_skip = 2; +#endif + +int counter; +int limit; + +SEC("tc") +int prog(void *ctx) +{ + int ret = -2; + +#if defined(ENABLE_ATOMICS_TESTS) && defined(__BPF_FEATURE_ADDR_SPACE_CAST) + unsigned long flags; + + if ((ret = arena_spin_lock_irqsave(&lock, flags))) + return ret; + if (counter != limit) + counter++; + bpf_repeat(cs_count); + ret = 0; + arena_spin_unlock_irqrestore(&lock, flags); +#endif + return ret; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 34f555da546fb..13a2e22f54658 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -213,4 +213,21 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #endif +#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ + (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ + defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ + defined(__TARGET_ARCH_loongarch)) && \ + __clang_major__ >= 18 +#define CAN_USE_GOTOL +#endif + +#if _clang_major__ >= 18 +#define CAN_USE_BPF_ST +#endif + +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) +#define CAN_USE_LOAD_ACQ_STORE_REL +#endif + #endif diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h index 59843b430f76a..82928cc5d87b7 100644 --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h @@ -123,6 +123,7 @@ #define sk_refcnt __sk_common.skc_refcnt #define sk_state __sk_common.skc_state #define sk_net __sk_common.skc_net +#define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_flags __sk_common.skc_flags diff --git a/tools/testing/selftests/bpf/progs/cgroup_preorder.c b/tools/testing/selftests/bpf/progs/cgroup_preorder.c new file mode 100644 index 0000000000000..4ef6202baa0a4 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cgroup_preorder.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta Platforms, Inc. and affiliates. */ +#include +#include + +char _license[] SEC("license") = "GPL"; + +unsigned int idx; +__u8 result[4]; + +SEC("cgroup/getsockopt") +int child(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 1; + return 1; +} + +SEC("cgroup/getsockopt") +int child_2(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 2; + return 1; +} + +SEC("cgroup/getsockopt") +int parent(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 3; + return 1; +} + +SEC("cgroup/getsockopt") +int parent_2(struct bpf_sockopt *ctx) +{ + if (idx < 4) + result[idx++] = 4; + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data.c b/tools/testing/selftests/bpf/progs/changes_pkt_data.c deleted file mode 100644 index 43cada48b28ad..0000000000000 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include -#include - -__noinline -long changes_pkt_data(struct __sk_buff *sk) -{ - return bpf_skb_pull_data(sk, 0); -} - -__noinline __weak -long does_not_change_pkt_data(struct __sk_buff *sk) -{ - return 0; -} - -SEC("?tc") -int main_with_subprogs(struct __sk_buff *sk) -{ - changes_pkt_data(sk); - does_not_change_pkt_data(sk); - return 0; -} - -SEC("?tc") -int main_changes(struct __sk_buff *sk) -{ - bpf_skb_pull_data(sk, 0); - return 0; -} - -SEC("?tc") -int main_does_not_change(struct __sk_buff *sk) -{ - return 0; -} - -char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/compute_live_registers.c b/tools/testing/selftests/bpf/progs/compute_live_registers.c new file mode 100644 index 0000000000000..f3d79aecbf935 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/compute_live_registers.c @@ -0,0 +1,424 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_arena_common.h" +#include "bpf_misc.h" + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} test_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARENA); + __uint(map_flags, BPF_F_MMAPABLE); + __uint(max_entries, 1); +} arena SEC(".maps"); + +SEC("socket") +__log_level(2) +__msg(" 0: .......... (b7) r0 = 42") +__msg(" 1: 0......... (bf) r1 = r0") +__msg(" 2: .1........ (bf) r2 = r1") +__msg(" 3: ..2....... (bf) r3 = r2") +__msg(" 4: ...3...... (bf) r4 = r3") +__msg(" 5: ....4..... (bf) r5 = r4") +__msg(" 6: .....5.... (bf) r6 = r5") +__msg(" 7: ......6... (bf) r7 = r6") +__msg(" 8: .......7.. (bf) r8 = r7") +__msg(" 9: ........8. (bf) r9 = r8") +__msg("10: .........9 (bf) r0 = r9") +__msg("11: 0......... (95) exit") +__naked void assign_chain(void) +{ + asm volatile ( + "r0 = 42;" + "r1 = r0;" + "r2 = r1;" + "r3 = r2;" + "r4 = r3;" + "r5 = r4;" + "r6 = r5;" + "r7 = r6;" + "r8 = r7;" + "r9 = r8;" + "r0 = r9;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .......... (b7) r1 = 7") +__msg("1: .1........ (07) r1 += 7") +__msg("2: .......... (b7) r2 = 7") +__msg("3: ..2....... (b7) r3 = 42") +__msg("4: ..23...... (0f) r2 += r3") +__msg("5: .......... (b7) r0 = 0") +__msg("6: 0......... (95) exit") +__naked void arithmetics(void) +{ + asm volatile ( + "r1 = 7;" + "r1 += 7;" + "r2 = 7;" + "r3 = 42;" + "r2 += r3;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +#ifdef CAN_USE_BPF_ST +SEC("socket") +__log_level(2) +__msg(" 1: .1........ (07) r1 += -8") +__msg(" 2: .1........ (7a) *(u64 *)(r1 +0) = 7") +__msg(" 3: .1........ (b7) r2 = 42") +__msg(" 4: .12....... (7b) *(u64 *)(r1 +0) = r2") +__msg(" 5: .12....... (7b) *(u64 *)(r1 +0) = r2") +__msg(" 6: .......... (b7) r0 = 0") +__naked void store(void) +{ + asm volatile ( + "r1 = r10;" + "r1 += -8;" + "*(u64 *)(r1 +0) = 7;" + "r2 = 42;" + "*(u64 *)(r1 +0) = r2;" + "*(u64 *)(r1 +0) = r2;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} +#endif + +SEC("socket") +__log_level(2) +__msg("1: ....4..... (07) r4 += -8") +__msg("2: ....4..... (79) r5 = *(u64 *)(r4 +0)") +__msg("3: ....45.... (07) r4 += -8") +__naked void load(void) +{ + asm volatile ( + "r4 = r10;" + "r4 += -8;" + "r5 = *(u64 *)(r4 +0);" + "r4 += -8;" + "r0 = r5;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .1........ (61) r2 = *(u32 *)(r1 +0)") +__msg("1: ..2....... (d4) r2 = le64 r2") +__msg("2: ..2....... (bf) r0 = r2") +__naked void endian(void) +{ + asm volatile ( + "r2 = *(u32 *)(r1 +0);" + "r2 = le64 r2;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg(" 8: 0......... (b7) r1 = 1") +__msg(" 9: 01........ (db) r1 = atomic64_fetch_add((u64 *)(r0 +0), r1)") +__msg("10: 01........ (c3) lock *(u32 *)(r0 +0) += r1") +__msg("11: 01........ (db) r1 = atomic64_xchg((u64 *)(r0 +0), r1)") +__msg("12: 01........ (bf) r2 = r0") +__msg("13: .12....... (bf) r0 = r1") +__msg("14: 012....... (db) r0 = atomic64_cmpxchg((u64 *)(r2 +0), r0, r1)") +__naked void atomic(void) +{ + asm volatile ( + "r2 = r10;" + "r2 += -8;" + "r1 = 0;" + "*(u64 *)(r2 +0) = r1;" + "r1 = %[test_map] ll;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto 1f;" + "r1 = 1;" + "r1 = atomic_fetch_add((u64 *)(r0 +0), r1);" + ".8byte %[add_nofetch];" /* same as "lock *(u32 *)(r0 +0) += r1;" */ + "r1 = xchg_64(r0 + 0, r1);" + "r2 = r0;" + "r0 = r1;" + "r0 = cmpxchg_64(r2 + 0, r0, r1);" + "1: exit;" + : + : __imm(bpf_map_lookup_elem), + __imm_addr(test_map), + __imm_insn(add_nofetch, BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +#ifdef CAN_USE_LOAD_ACQ_STORE_REL + +SEC("socket") +__log_level(2) +__msg("2: .12....... (db) store_release((u64 *)(r2 -8), r1)") +__msg("3: .......... (bf) r3 = r10") +__msg("4: ...3...... (db) r4 = load_acquire((u64 *)(r3 -8))") +__naked void atomic_load_acq_store_rel(void) +{ + asm volatile ( + "r1 = 42;" + "r2 = r10;" + ".8byte %[store_release_insn];" /* store_release((u64 *)(r2 - 8), r1); */ + "r3 = r10;" + ".8byte %[load_acquire_insn];" /* r4 = load_acquire((u64 *)(r3 + 0)); */ + "r0 = r4;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_2, BPF_REG_1, -8)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_4, BPF_REG_3, -8)) + : __clobber_all); +} + +#endif /* CAN_USE_LOAD_ACQ_STORE_REL */ + +SEC("socket") +__log_level(2) +__msg("4: .12....7.. (85) call bpf_trace_printk#6") +__msg("5: 0......7.. (0f) r0 += r7") +__naked void regular_call(void) +{ + asm volatile ( + "r7 = 1;" + "r1 = r10;" + "r1 += -8;" + "r2 = 1;" + "call %[bpf_trace_printk];" + "r0 += r7;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("2: 012....... (25) if r1 > 0x7 goto pc+1") +__msg("3: ..2....... (bf) r0 = r2") +__naked void if1(void) +{ + asm volatile ( + "r0 = 1;" + "r2 = 2;" + "if r1 > 0x7 goto +1;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("3: 0123...... (2d) if r1 > r3 goto pc+1") +__msg("4: ..2....... (bf) r0 = r2") +__naked void if2(void) +{ + asm volatile ( + "r0 = 1;" + "r2 = 2;" + "r3 = 7;" + "if r1 > r3 goto +1;" + "r0 = r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: .......... (b7) r1 = 0") +__msg("1: .1........ (b7) r2 = 7") +__msg("2: .12....... (25) if r1 > 0x7 goto pc+4") +__msg("3: .12....... (07) r1 += 1") +__msg("4: .12....... (27) r2 *= 2") +__msg("5: .12....... (05) goto pc+0") +__msg("6: .12....... (05) goto pc-5") +__msg("7: .......... (b7) r0 = 0") +__msg("8: 0......... (95) exit") +__naked void loop(void) +{ + asm volatile ( + "r1 = 0;" + "r2 = 7;" + "if r1 > 0x7 goto +4;" + "r1 += 1;" + "r2 *= 2;" + "goto +0;" + "goto -5;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} + +#ifdef CAN_USE_GOTOL +SEC("socket") +__log_level(2) +__msg("2: .123...... (25) if r1 > 0x7 goto pc+2") +__msg("3: ..2....... (bf) r0 = r2") +__msg("4: 0......... (06) gotol pc+1") +__msg("5: ...3...... (bf) r0 = r3") +__msg("6: 0......... (95) exit") +__naked void gotol(void) +{ + asm volatile ( + "r2 = 42;" + "r3 = 24;" + "if r1 > 0x7 goto +2;" + "r0 = r2;" + "gotol +1;" + "r0 = r3;" + "exit;" + : + : __imm(bpf_trace_printk) + : __clobber_all); +} +#endif + +SEC("socket") +__log_level(2) +__msg("0: .......... (b7) r1 = 1") +__msg("1: .1........ (e5) may_goto pc+1") +__msg("2: .......... (05) goto pc-3") +__msg("3: .1........ (bf) r0 = r1") +__msg("4: 0......... (95) exit") +__naked void may_goto(void) +{ + asm volatile ( + "1: r1 = 1;" + ".8byte %[may_goto];" + "goto 1b;" + "r0 = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("1: 0......... (18) r2 = 0x7") +__msg("3: 0.2....... (0f) r0 += r2") +__naked void ldimm64(void) +{ + asm volatile ( + "r0 = 0;" + "r2 = 0x7 ll;" + "r0 += r2;" + "exit;" + : + :: __clobber_all); +} + +/* No rules specific for LD_ABS/LD_IND, default behaviour kicks in */ +SEC("socket") +__log_level(2) +__msg("2: 0123456789 (30) r0 = *(u8 *)skb[42]") +__msg("3: 012.456789 (0f) r7 += r0") +__msg("4: 012.456789 (b7) r3 = 42") +__msg("5: 0123456789 (50) r0 = *(u8 *)skb[r3 + 0]") +__msg("6: 0......7.. (0f) r7 += r0") +__naked void ldabs(void) +{ + asm volatile ( + "r6 = r1;" + "r7 = 0;" + "r0 = *(u8 *)skb[42];" + "r7 += r0;" + "r3 = 42;" + ".8byte %[ld_ind];" /* same as "r0 = *(u8 *)skb[r3];" */ + "r7 += r0;" + "r0 = r7;" + "exit;" + : + : __imm_insn(ld_ind, BPF_LD_IND(BPF_B, BPF_REG_3, 0)) + : __clobber_all); +} + + +#ifdef __BPF_FEATURE_ADDR_SPACE_CAST +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +__log_level(2) +__msg(" 6: .12345.... (85) call bpf_arena_alloc_pages") +__msg(" 7: 0......... (bf) r1 = addr_space_cast(r0, 0, 1)") +__msg(" 8: .1........ (b7) r2 = 42") +__naked void addr_space_cast(void) +{ + asm volatile ( + "r1 = %[arena] ll;" + "r2 = 0;" + "r3 = 1;" + "r4 = 0;" + "r5 = 0;" + "call %[bpf_arena_alloc_pages];" + "r1 = addr_space_cast(r0, 0, 1);" + "r2 = 42;" + "*(u64 *)(r1 +0) = r2;" + "r0 = 0;" + "exit;" + : + : __imm(bpf_arena_alloc_pages), + __imm_addr(arena) + : __clobber_all); +} +#endif + +static __used __naked int aux1(void) +{ + asm volatile ( + "r0 = r1;" + "r0 += r2;" + "exit;" + ::: __clobber_all); +} + +SEC("socket") +__log_level(2) +__msg("0: ....45.... (b7) r1 = 1") +__msg("1: .1..45.... (b7) r2 = 2") +__msg("2: .12.45.... (b7) r3 = 3") +/* Conservative liveness for subprog parameters. */ +__msg("3: .12345.... (85) call pc+2") +__msg("4: .......... (b7) r0 = 0") +__msg("5: 0......... (95) exit") +__msg("6: .12....... (bf) r0 = r1") +__msg("7: 0.2....... (0f) r0 += r2") +/* Conservative liveness for subprog return value. */ +__msg("8: 0......... (95) exit") +__naked void subprog1(void) +{ + asm volatile ( + "r1 = 1;" + "r2 = 2;" + "r3 = 3;" + "call aux1;" + "r0 = 0;" + "exit;" + ::: __clobber_all); +} + +/* to retain debug info for BTF generation */ +void kfunc_root(void) +{ + bpf_arena_alloc_pages(0, 0, 0, 0, 0); +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/connect4_dropper.c b/tools/testing/selftests/bpf/progs/connect4_dropper.c index d3f4c5e4fb696..a3819a5d09c8f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_dropper.c +++ b/tools/testing/selftests/bpf/progs/connect4_dropper.c @@ -13,12 +13,14 @@ #define VERDICT_REJECT 0 #define VERDICT_PROCEED 1 +int port; + SEC("cgroup/connect4") int connect_v4_dropper(struct bpf_sock_addr *ctx) { if (ctx->type != SOCK_STREAM) return VERDICT_PROCEED; - if (ctx->user_port == bpf_htons(60120)) + if (ctx->user_port == bpf_htons(port)) return VERDICT_REJECT; return VERDICT_PROCEED; } diff --git a/tools/testing/selftests/bpf/progs/cpumask_common.h b/tools/testing/selftests/bpf/progs/cpumask_common.h index 4ece7873ba609..86085b79f5cac 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_common.h +++ b/tools/testing/selftests/bpf/progs/cpumask_common.h @@ -61,6 +61,7 @@ u32 bpf_cpumask_any_distribute(const struct cpumask *src) __ksym __weak; u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, const struct cpumask *src2) __ksym __weak; u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym __weak; +int bpf_cpumask_populate(struct cpumask *cpumask, void *src, size_t src__sz) __ksym __weak; void bpf_rcu_read_lock(void) __ksym __weak; void bpf_rcu_read_unlock(void) __ksym __weak; diff --git a/tools/testing/selftests/bpf/progs/cpumask_failure.c b/tools/testing/selftests/bpf/progs/cpumask_failure.c index b40b52548ffb0..8a2fd596c8a39 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_failure.c +++ b/tools/testing/selftests/bpf/progs/cpumask_failure.c @@ -222,3 +222,41 @@ int BPF_PROG(test_invalid_nested_array, struct task_struct *task, u64 clone_flag return 0; } + +SEC("tp_btf/task_newtask") +__failure __msg("type=scalar expected=fp") +int BPF_PROG(test_populate_invalid_destination, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *invalid = (struct bpf_cpumask *)0x123456; + u64 bits; + int ret; + + ret = bpf_cpumask_populate((struct cpumask *)invalid, &bits, sizeof(bits)); + if (!ret) + err = 2; + + return 0; +} + +SEC("tp_btf/task_newtask") +__failure __msg("leads to invalid memory access") +int BPF_PROG(test_populate_invalid_source, struct task_struct *task, u64 clone_flags) +{ + void *garbage = (void *)0x123456; + struct bpf_cpumask *local; + int ret; + + local = create_cpumask(); + if (!local) { + err = 1; + return 0; + } + + ret = bpf_cpumask_populate((struct cpumask *)local, garbage, 8); + if (!ret) + err = 2; + + bpf_cpumask_release(local); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/cpumask_success.c b/tools/testing/selftests/bpf/progs/cpumask_success.c index 80ee469b0b602..0e04c31b91c0c 100644 --- a/tools/testing/selftests/bpf/progs/cpumask_success.c +++ b/tools/testing/selftests/bpf/progs/cpumask_success.c @@ -749,7 +749,6 @@ int BPF_PROG(test_cpumask_weight, struct task_struct *task, u64 clone_flags) } SEC("tp_btf/task_newtask") -__success int BPF_PROG(test_refcount_null_tracking, struct task_struct *task, u64 clone_flags) { struct bpf_cpumask *mask1, *mask2; @@ -770,3 +769,122 @@ int BPF_PROG(test_refcount_null_tracking, struct task_struct *task, u64 clone_fl bpf_cpumask_release(mask2); return 0; } + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate_reject_small_mask, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *local; + u8 toofewbits; + int ret; + + if (!is_test_task()) + return 0; + + local = create_cpumask(); + if (!local) + return 0; + + /* The kfunc should prevent this operation */ + ret = bpf_cpumask_populate((struct cpumask *)local, &toofewbits, sizeof(toofewbits)); + if (ret != -EACCES) + err = 2; + + bpf_cpumask_release(local); + + return 0; +} + +/* Mask is guaranteed to be large enough for bpf_cpumask_t. */ +#define CPUMASK_TEST_MASKLEN (sizeof(cpumask_t)) + +/* Add an extra word for the test_populate_reject_unaligned test. */ +u64 bits[CPUMASK_TEST_MASKLEN / 8 + 1]; +extern bool CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS __kconfig __weak; + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate_reject_unaligned, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *mask; + char *src; + int ret; + + if (!is_test_task()) + return 0; + + /* Skip if unaligned accesses are fine for this arch. */ + if (CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) + return 0; + + mask = bpf_cpumask_create(); + if (!mask) { + err = 1; + return 0; + } + + /* Misalign the source array by a byte. */ + src = &((char *)bits)[1]; + + ret = bpf_cpumask_populate((struct cpumask *)mask, src, CPUMASK_TEST_MASKLEN); + if (ret != -EINVAL) + err = 2; + + bpf_cpumask_release(mask); + + return 0; +} + + +SEC("tp_btf/task_newtask") +int BPF_PROG(test_populate, struct task_struct *task, u64 clone_flags) +{ + struct bpf_cpumask *mask; + bool bit; + int ret; + int i; + + if (!is_test_task()) + return 0; + + /* Set only odd bits. */ + __builtin_memset(bits, 0xaa, CPUMASK_TEST_MASKLEN); + + mask = bpf_cpumask_create(); + if (!mask) { + err = 1; + return 0; + } + + /* Pass the entire bits array, the kfunc will only copy the valid bits. */ + ret = bpf_cpumask_populate((struct cpumask *)mask, bits, CPUMASK_TEST_MASKLEN); + if (ret) { + err = 2; + goto out; + } + + /* + * Test is there to appease the verifier. We cannot directly + * access NR_CPUS, the upper bound for nr_cpus, so we infer + * it from the size of cpumask_t. + */ + if (nr_cpus < 0 || nr_cpus >= CPUMASK_TEST_MASKLEN * 8) { + err = 3; + goto out; + } + + bpf_for(i, 0, nr_cpus) { + /* Odd-numbered bits should be set, even ones unset. */ + bit = bpf_cpumask_test_cpu(i, (const struct cpumask *)mask); + if (bit == (i % 2 != 0)) + continue; + + err = 4; + break; + } + +out: + bpf_cpumask_release(mask); + + return 0; +} + +#undef CPUMASK_TEST_MASKLEN diff --git a/tools/testing/selftests/bpf/progs/dynptr_success.c b/tools/testing/selftests/bpf/progs/dynptr_success.c index bfcc85686cf04..e1fba28e4a868 100644 --- a/tools/testing/selftests/bpf/progs/dynptr_success.c +++ b/tools/testing/selftests/bpf/progs/dynptr_success.c @@ -1,20 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2022 Facebook */ +#include #include #include -#include #include #include #include "bpf_misc.h" -#include "bpf_kfuncs.h" #include "errno.h" char _license[] SEC("license") = "GPL"; int pid, err, val; -struct sample { +struct ringbuf_sample { int pid; int seq; long value; @@ -121,7 +120,7 @@ int test_dynptr_data(void *ctx) static int ringbuf_callback(__u32 index, void *data) { - struct sample *sample; + struct ringbuf_sample *sample; struct bpf_dynptr *ptr = (struct bpf_dynptr *)data; @@ -138,7 +137,7 @@ SEC("?tp/syscalls/sys_enter_nanosleep") int test_ringbuf(void *ctx) { struct bpf_dynptr ptr; - struct sample *sample; + struct ringbuf_sample *sample; if (bpf_get_current_pid_tgid() >> 32 != pid) return 0; @@ -567,3 +566,117 @@ int BPF_PROG(test_dynptr_skb_tp_btf, void *skb, void *location) return 1; } + +static inline int bpf_memcmp(const char *a, const char *b, u32 size) +{ + int i; + + bpf_for(i, 0, size) { + if (a[i] != b[i]) + return a[i] < b[i] ? -1 : 1; + } + return 0; +} + +SEC("?tp/syscalls/sys_enter_nanosleep") +int test_dynptr_copy(void *ctx) +{ + char data[] = "hello there, world!!"; + char buf[32] = {'\0'}; + __u32 sz = sizeof(data); + struct bpf_dynptr src, dst; + + bpf_ringbuf_reserve_dynptr(&ringbuf, sz, 0, &src); + bpf_ringbuf_reserve_dynptr(&ringbuf, sz, 0, &dst); + + /* Test basic case of copying contiguous memory backed dynptrs */ + err = bpf_dynptr_write(&src, 0, data, sz, 0); + err = err ?: bpf_dynptr_copy(&dst, 0, &src, 0, sz); + err = err ?: bpf_dynptr_read(buf, sz, &dst, 0, 0); + err = err ?: bpf_memcmp(data, buf, sz); + + /* Test that offsets are handled correctly */ + err = err ?: bpf_dynptr_copy(&dst, 3, &src, 5, sz - 5); + err = err ?: bpf_dynptr_read(buf, sz - 5, &dst, 3, 0); + err = err ?: bpf_memcmp(data + 5, buf, sz - 5); + + bpf_ringbuf_discard_dynptr(&src, 0); + bpf_ringbuf_discard_dynptr(&dst, 0); + return 0; +} + +SEC("xdp") +int test_dynptr_copy_xdp(struct xdp_md *xdp) +{ + struct bpf_dynptr ptr_buf, ptr_xdp; + char data[] = "qwertyuiopasdfghjkl"; + char buf[32] = {'\0'}; + __u32 len = sizeof(data); + int i, chunks = 200; + + /* ptr_xdp is backed by non-contiguous memory */ + bpf_dynptr_from_xdp(xdp, 0, &ptr_xdp); + bpf_ringbuf_reserve_dynptr(&ringbuf, len * chunks, 0, &ptr_buf); + + /* Destination dynptr is backed by non-contiguous memory */ + bpf_for(i, 0, chunks) { + err = bpf_dynptr_write(&ptr_buf, i * len, data, len, 0); + if (err) + goto out; + } + + err = bpf_dynptr_copy(&ptr_xdp, 0, &ptr_buf, 0, len * chunks); + if (err) + goto out; + + bpf_for(i, 0, chunks) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_xdp, i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + /* Source dynptr is backed by non-contiguous memory */ + __builtin_memset(buf, 0, sizeof(buf)); + bpf_for(i, 0, chunks) { + err = bpf_dynptr_write(&ptr_buf, i * len, buf, len, 0); + if (err) + goto out; + } + + err = bpf_dynptr_copy(&ptr_buf, 0, &ptr_xdp, 0, len * chunks); + if (err) + goto out; + + bpf_for(i, 0, chunks) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_buf, i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + /* Both source and destination dynptrs are backed by non-contiguous memory */ + err = bpf_dynptr_copy(&ptr_xdp, 2, &ptr_xdp, len, len * (chunks - 1)); + if (err) + goto out; + + bpf_for(i, 0, chunks - 1) { + __builtin_memset(buf, 0, sizeof(buf)); + err = bpf_dynptr_read(&buf, len, &ptr_xdp, 2 + i * len, 0); + if (err) + goto out; + if (bpf_memcmp(data, buf, len) != 0) + goto out; + } + + if (bpf_dynptr_copy(&ptr_xdp, 2000, &ptr_xdp, 0, len * chunks) != -E2BIG) + err = 1; + +out: + bpf_ringbuf_discard_dynptr(&ptr_buf, 0); + return XDP_DROP; +} diff --git a/tools/testing/selftests/bpf/progs/irq.c b/tools/testing/selftests/bpf/progs/irq.c index b0b53d9809642..298d48d7886d9 100644 --- a/tools/testing/selftests/bpf/progs/irq.c +++ b/tools/testing/selftests/bpf/progs/irq.c @@ -222,7 +222,7 @@ int __noinline global_local_irq_balance(void) } SEC("?tc") -__failure __msg("global function calls are not allowed with IRQs disabled") +__success int irq_global_subprog(struct __sk_buff *ctx) { unsigned long flags; @@ -441,4 +441,73 @@ int irq_ooo_refs_array(struct __sk_buff *ctx) return 0; } +int __noinline +global_subprog(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +__success +int irq_non_sleepable_global_subprog(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_subprog(0); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int irq_sleepable_helper_global_subprog(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_sleepable_helper_subprog(0); + bpf_local_irq_restore(&flags); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int irq_sleepable_global_subprog_indirect(void *ctx) +{ + unsigned long flags; + + bpf_local_irq_save(&flags); + global_subprog_calling_sleepable_global(0); + bpf_local_irq_restore(&flags); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/preempt_lock.c b/tools/testing/selftests/bpf/progs/preempt_lock.c index 6c5797bf0ead5..7d04254e61f1c 100644 --- a/tools/testing/selftests/bpf/progs/preempt_lock.c +++ b/tools/testing/selftests/bpf/progs/preempt_lock.c @@ -134,7 +134,7 @@ int __noinline preempt_global_subprog(void) } SEC("?tc") -__failure __msg("global function calls are not allowed with preemption disabled") +__success int preempt_global_subprog_test(struct __sk_buff *ctx) { preempt_disable(); @@ -143,4 +143,70 @@ int preempt_global_subprog_test(struct __sk_buff *ctx) return 0; } +int __noinline +global_subprog(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_helper_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_sleepable_helper_subprog(ctx->mark); + preempt_enable(); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_kfunc_subprog(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_sleepable_kfunc_subprog(ctx->mark); + preempt_enable(); + return 0; +} + +SEC("?syscall") +__failure __msg("global functions that may sleep are not allowed in non-sleepable context") +int preempt_global_sleepable_subprog_indirect(struct __sk_buff *ctx) +{ + preempt_disable(); + if (ctx->mark) + global_subprog_calling_sleepable_global(ctx->mark); + preempt_enable(); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/prepare.c b/tools/testing/selftests/bpf/progs/prepare.c new file mode 100644 index 0000000000000..1f1dd547e4ee2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/prepare.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Meta */ +#include +#include +//#include + +char _license[] SEC("license") = "GPL"; + +int err; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 4096); +} ringbuf SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u32); +} array_map SEC(".maps"); + +SEC("cgroup_skb/egress") +int program(struct __sk_buff *skb) +{ + err = 0; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/rcu_read_lock.c b/tools/testing/selftests/bpf/progs/rcu_read_lock.c index ab3a532b7dd6d..43637ee2cdcd2 100644 --- a/tools/testing/selftests/bpf/progs/rcu_read_lock.c +++ b/tools/testing/selftests/bpf/progs/rcu_read_lock.c @@ -242,7 +242,8 @@ int inproper_sleepable_helper(void *ctx) } SEC("?lsm.s/bpf") -int BPF_PROG(inproper_sleepable_kfunc, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(inproper_sleepable_kfunc, int cmd, union bpf_attr *attr, unsigned int size, + bool kernel) { struct bpf_key *bkey; @@ -439,3 +440,61 @@ int rcu_read_lock_global_subprog_unlock(void *ctx) ret += global_subprog_unlock(ret); return 0; } + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_helper_global_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_sleepable_helper_subprog(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_kfunc_global_subprog(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_sleepable_kfunc_subprog(ret); + bpf_rcu_read_unlock(); + return 0; +} + +SEC("?fentry.s/" SYS_PREFIX "sys_getpgid") +int rcu_read_lock_sleepable_global_subprog_indirect(void *ctx) +{ + volatile int ret = 0; + + bpf_rcu_read_lock(); + ret += global_subprog_calling_sleepable_global(ret); + bpf_rcu_read_unlock(); + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/sock_iter_batch.c b/tools/testing/selftests/bpf/progs/sock_iter_batch.c index 96531b0d9d55b..8f483337e103c 100644 --- a/tools/testing/selftests/bpf/progs/sock_iter_batch.c +++ b/tools/testing/selftests/bpf/progs/sock_iter_batch.c @@ -17,6 +17,12 @@ static bool ipv6_addr_loopback(const struct in6_addr *a) a->s6_addr32[2] | (a->s6_addr32[3] ^ bpf_htonl(1))) == 0; } +static bool ipv4_addr_loopback(__be32 a) +{ + return a == bpf_ntohl(0x7f000001); +} + +volatile const unsigned int sf; volatile const __u16 ports[2]; unsigned int bucket[2]; @@ -26,16 +32,20 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) struct sock *sk = (struct sock *)ctx->sk_common; struct inet_hashinfo *hinfo; unsigned int hash; + __u64 sock_cookie; struct net *net; int idx; if (!sk) return 0; + sock_cookie = bpf_get_socket_cookie(sk); sk = bpf_core_cast(sk, struct sock); - if (sk->sk_family != AF_INET6 || + if (sk->sk_family != sf || sk->sk_state != TCP_LISTEN || - !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + sk->sk_family == AF_INET6 ? + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : + !ipv4_addr_loopback(sk->sk_rcv_saddr)) return 0; if (sk->sk_num == ports[0]) @@ -52,6 +62,7 @@ int iter_tcp_soreuse(struct bpf_iter__tcp *ctx) hinfo = net->ipv4.tcp_death_row.hashinfo; bucket[idx] = hash & hinfo->lhash2_mask; bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie)); return 0; } @@ -63,14 +74,18 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx) { struct sock *sk = (struct sock *)ctx->udp_sk; struct udp_table *udptable; + __u64 sock_cookie; int idx; if (!sk) return 0; + sock_cookie = bpf_get_socket_cookie(sk); sk = bpf_core_cast(sk, struct sock); - if (sk->sk_family != AF_INET6 || - !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr)) + if (sk->sk_family != sf || + sk->sk_family == AF_INET6 ? + !ipv6_addr_loopback(&sk->sk_v6_rcv_saddr) : + !ipv4_addr_loopback(sk->sk_rcv_saddr)) return 0; if (sk->sk_num == ports[0]) @@ -84,6 +99,7 @@ int iter_udp_soreuse(struct bpf_iter__udp *ctx) udptable = sk->sk_net.net->ipv4.udp_table; bucket[idx] = udp_sk(sk)->udp_portaddr_hash & udptable->mask; bpf_seq_write(ctx->meta->seq, &idx, sizeof(idx)); + bpf_seq_write(ctx->meta->seq, &sock_cookie, sizeof(sock_cookie)); return 0; } diff --git a/tools/testing/selftests/bpf/progs/summarization.c b/tools/testing/selftests/bpf/progs/summarization.c new file mode 100644 index 0000000000000..f89effe82c9e2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/summarization.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include "bpf_misc.h" + +__noinline +long changes_pkt_data(struct __sk_buff *sk) +{ + return bpf_skb_pull_data(sk, 0); +} + +__noinline __weak +long does_not_change_pkt_data(struct __sk_buff *sk) +{ + return 0; +} + +SEC("?tc") +int main_changes_with_subprogs(struct __sk_buff *sk) +{ + changes_pkt_data(sk); + does_not_change_pkt_data(sk); + return 0; +} + +SEC("?tc") +int main_changes(struct __sk_buff *sk) +{ + bpf_skb_pull_data(sk, 0); + return 0; +} + +SEC("?tc") +int main_does_not_change(struct __sk_buff *sk) +{ + return 0; +} + +__noinline +long might_sleep(struct pt_regs *ctx __arg_ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +__noinline __weak +long does_not_sleep(struct pt_regs *ctx __arg_ctx) +{ + return 0; +} + +SEC("?uprobe.s") +int main_might_sleep_with_subprogs(struct pt_regs *ctx) +{ + might_sleep(ctx); + does_not_sleep(ctx); + return 0; +} + +SEC("?uprobe.s") +int main_might_sleep(struct pt_regs *ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +SEC("?uprobe.s") +int main_does_not_sleep(struct pt_regs *ctx) +{ + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c b/tools/testing/selftests/bpf/progs/summarization_freplace.c similarity index 57% rename from tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c rename to tools/testing/selftests/bpf/progs/summarization_freplace.c index f9a622705f1b3..935f00e0e9ea0 100644 --- a/tools/testing/selftests/bpf/progs/changes_pkt_data_freplace.c +++ b/tools/testing/selftests/bpf/progs/summarization_freplace.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include +#include #include SEC("?freplace") @@ -15,4 +15,19 @@ long does_not_change_pkt_data(struct __sk_buff *sk) return 0; } +SEC("?freplace") +long might_sleep(struct pt_regs *ctx) +{ + int i; + + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +SEC("?freplace") +long does_not_sleep(struct pt_regs *ctx) +{ + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c b/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c index 44628865fe1d4..4fee0fdc76076 100644 --- a/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c +++ b/tools/testing/selftests/bpf/progs/test_cgroup1_hierarchy.c @@ -51,13 +51,13 @@ static int bpf_link_create_verify(int cmd) } SEC("lsm/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return bpf_link_create_verify(cmd); } SEC("lsm.s/bpf") -int BPF_PROG(lsm_s_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_s_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { return bpf_link_create_verify(cmd); } diff --git a/tools/testing/selftests/bpf/progs/test_kernel_flag.c b/tools/testing/selftests/bpf/progs/test_kernel_flag.c new file mode 100644 index 0000000000000..b45fab3be3528 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_kernel_flag.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (C) 2025 Microsoft Corporation + * + * Author: Blaise Boscaccy + */ + +#include "vmlinux.h" +#include +#include +#include + +char _license[] SEC("license") = "GPL"; + +__u32 monitored_tid; + +SEC("lsm.s/bpf") +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) +{ + __u32 tid; + + tid = bpf_get_current_pid_tgid() & 0xFFFFFFFF; + if (!kernel || tid != monitored_tid) + return 0; + else + return -EINVAL; +} diff --git a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c index cd4d752bd089c..061befb004c24 100644 --- a/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c +++ b/tools/testing/selftests/bpf/progs/test_kfunc_dynptr_param.c @@ -36,7 +36,7 @@ char _license[] SEC("license") = "GPL"; SEC("?lsm.s/bpf") __failure __msg("cannot pass in dynptr at an offset=-8") -int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { unsigned long val; @@ -46,7 +46,7 @@ int BPF_PROG(not_valid_dynptr, int cmd, union bpf_attr *attr, unsigned int size) SEC("?lsm.s/bpf") __failure __msg("arg#0 expected pointer to stack or const struct bpf_dynptr") -int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { unsigned long val = 0; @@ -55,7 +55,7 @@ int BPF_PROG(not_ptr_to_stack, int cmd, union bpf_attr *attr, unsigned int size) } SEC("lsm.s/bpf") -int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(dynptr_data_null, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_key *trusted_keyring; struct bpf_dynptr ptr; diff --git a/tools/testing/selftests/bpf/progs/test_lookup_key.c b/tools/testing/selftests/bpf/progs/test_lookup_key.c index c73776990ae30..cdbbb12f1491a 100644 --- a/tools/testing/selftests/bpf/progs/test_lookup_key.c +++ b/tools/testing/selftests/bpf/progs/test_lookup_key.c @@ -23,7 +23,7 @@ extern struct bpf_key *bpf_lookup_system_key(__u64 id) __ksym; extern void bpf_key_put(struct bpf_key *key) __ksym; SEC("lsm.s/bpf") -int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_key *bkey; __u32 pid; diff --git a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c index 2fdc44e766248..89b0cd5a3e06e 100644 --- a/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c +++ b/tools/testing/selftests/bpf/progs/test_ptr_untrusted.c @@ -7,7 +7,7 @@ char tp_name[128]; SEC("lsm.s/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { switch (cmd) { case BPF_RAW_TRACEPOINT_OPEN: diff --git a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c index 5eb25c6ad75b1..a5be3267dbb01 100644 --- a/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c +++ b/tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2018 Facebook */ -#include #include #include #include diff --git a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c index 1c8b678e2e9a3..f678ee6bd7eaf 100644 --- a/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c +++ b/tools/testing/selftests/bpf/progs/test_spin_lock_fail.c @@ -245,4 +245,73 @@ int lock_global_subprog_call2(struct __sk_buff *ctx) return ret; } +int __noinline +global_subprog_int(int i) +{ + if (i) + bpf_printk("%p", &i); + return i; +} + +int __noinline +global_sleepable_helper_subprog(int i) +{ + if (i) + bpf_copy_from_user(&i, sizeof(i), NULL); + return i; +} + +int __noinline +global_sleepable_kfunc_subprog(int i) +{ + if (i) + bpf_copy_from_user_str(&i, sizeof(i), NULL, 0); + global_subprog_int(i); + return i; +} + +int __noinline +global_subprog_calling_sleepable_global(int i) +{ + if (!i) + global_sleepable_kfunc_subprog(i); + return i; +} + +SEC("?syscall") +int lock_global_sleepable_helper_subprog(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_sleepable_helper_subprog(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("?syscall") +int lock_global_sleepable_kfunc_subprog(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_sleepable_kfunc_subprog(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + +SEC("?syscall") +int lock_global_sleepable_subprog_indirect(struct __sk_buff *ctx) +{ + int ret = 0; + + bpf_spin_lock(&lockA); + if (ctx->mark == 42) + ret = global_subprog_calling_sleepable_global(ctx->mark); + bpf_spin_unlock(&lockA); + return ret; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c index 7e750309ce274..0b74b8bd22e87 100644 --- a/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c +++ b/tools/testing/selftests/bpf/progs/test_task_under_cgroup.c @@ -49,7 +49,7 @@ int BPF_PROG(tp_btf_run, struct task_struct *task, u64 clone_flags) } SEC("lsm.s/bpf") -int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(lsm_run, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct cgroup *cgrp = NULL; struct task_struct *task; diff --git a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c index 12034a73ee2d2..e96d09e111155 100644 --- a/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c +++ b/tools/testing/selftests/bpf/progs/test_verify_pkcs7_sig.c @@ -37,7 +37,7 @@ struct { char _license[] SEC("license") = "GPL"; SEC("lsm.s/bpf") -int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size) +int BPF_PROG(bpf, int cmd, union bpf_attr *attr, unsigned int size, bool kernel) { struct bpf_dynptr data_ptr, sig_ptr; struct data *data_val; diff --git a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c index 5094c288cfd7b..a9be6ae494545 100644 --- a/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c +++ b/tools/testing/selftests/bpf/progs/verifier_bpf_fastcall.c @@ -620,23 +620,61 @@ __naked void helper_call_does_not_prevent_bpf_fastcall(void) SEC("raw_tp") __arch_x86_64 +__log_level(4) __msg("stack depth 24") +/* may_goto counter at -24 */ +__xlated("0: *(u64 *)(r10 -24) =") +/* may_goto timestamp at -16 */ +__xlated("1: *(u64 *)(r10 -16) =") +__xlated("2: r1 = 1") +__xlated("...") +__xlated("4: r0 = &(void __percpu *)(r0)") +__xlated("...") +/* may_goto expansion starts */ +__xlated("6: r11 = *(u64 *)(r10 -24)") +__xlated("7: if r11 == 0x0 goto pc+6") +__xlated("8: r11 -= 1") +__xlated("9: if r11 != 0x0 goto pc+2") +__xlated("10: r11 = -24") +__xlated("11: call unknown") +__xlated("12: *(u64 *)(r10 -24) = r11") +/* may_goto expansion ends */ +__xlated("13: *(u64 *)(r10 -8) = r1") +__xlated("14: exit") +__success +__naked void may_goto_interaction_x86_64(void) +{ + asm volatile ( + "r1 = 1;" + "*(u64 *)(r10 - 16) = r1;" + "call %[bpf_get_smp_processor_id];" + "r1 = *(u64 *)(r10 - 16);" + ".8byte %[may_goto];" + /* just touch some stack at -8 */ + "*(u64 *)(r10 - 8) = r1;" + "exit;" + : + : __imm(bpf_get_smp_processor_id), + __imm_insn(may_goto, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, +1 /* offset */, 0)) + : __clobber_all); +} + +SEC("raw_tp") +__arch_arm64 __log_level(4) __msg("stack depth 16") /* may_goto counter at -16 */ __xlated("0: *(u64 *)(r10 -16) =") __xlated("1: r1 = 1") -__xlated("...") -__xlated("3: r0 = &(void __percpu *)(r0)") -__xlated("...") +__xlated("2: call bpf_get_smp_processor_id") /* may_goto expansion starts */ -__xlated("5: r11 = *(u64 *)(r10 -16)") -__xlated("6: if r11 == 0x0 goto pc+3") -__xlated("7: r11 -= 1") -__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("3: r11 = *(u64 *)(r10 -16)") +__xlated("4: if r11 == 0x0 goto pc+3") +__xlated("5: r11 -= 1") +__xlated("6: *(u64 *)(r10 -16) = r11") /* may_goto expansion ends */ -__xlated("9: *(u64 *)(r10 -8) = r1") -__xlated("10: exit") +__xlated("7: *(u64 *)(r10 -8) = r1") +__xlated("8: exit") __success -__naked void may_goto_interaction(void) +__naked void may_goto_interaction_arm64(void) { asm volatile ( "r1 = 1;" diff --git a/tools/testing/selftests/bpf/progs/verifier_gotol.c b/tools/testing/selftests/bpf/progs/verifier_gotol.c index 05a329ee45ee9..d5d8f24df3949 100644 --- a/tools/testing/selftests/bpf/progs/verifier_gotol.c +++ b/tools/testing/selftests/bpf/progs/verifier_gotol.c @@ -4,11 +4,7 @@ #include #include "bpf_misc.h" -#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ - defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ - defined(__TARGET_ARCH_loongarch)) && \ - __clang_major__ >= 18 +#ifdef CAN_USE_GOTOL SEC("socket") __description("gotol, small_imm") diff --git a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c index e54bb5385bc1b..75dd922e4e9f8 100644 --- a/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c +++ b/tools/testing/selftests/bpf/progs/verifier_iterating_callbacks.c @@ -407,11 +407,7 @@ l0_%=: call %[bpf_jiffies64]; \ : __clobber_all); } -#if (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86) || \ - (defined(__TARGET_ARCH_riscv) && __riscv_xlen == 64) || \ - defined(__TARGET_ARCH_arm) || defined(__TARGET_ARCH_s390) || \ - defined(__TARGET_ARCH_loongarch)) && \ - __clang_major__ >= 18 +#ifdef CAN_USE_GOTOL SEC("socket") __success __retval(0) __naked void gotol_and_may_goto(void) diff --git a/tools/testing/selftests/bpf/progs/verifier_load_acquire.c b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c new file mode 100644 index 0000000000000..6080974538327 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_load_acquire.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Google LLC. */ + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +#ifdef CAN_USE_LOAD_ACQ_STORE_REL + +SEC("socket") +__description("load-acquire, 8-bit") +__success __success_unpriv __retval(0x12) +__naked void load_acquire_8(void) +{ + asm volatile ( + "w1 = 0x12;" + "*(u8 *)(r10 - 1) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r10 - 1)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -1)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 16-bit") +__success __success_unpriv __retval(0x1234) +__naked void load_acquire_16(void) +{ + asm volatile ( + "w1 = 0x1234;" + "*(u16 *)(r10 - 2) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u16 *)(r10 - 2)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_H, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -2)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 32-bit") +__success __success_unpriv __retval(0x12345678) +__naked void load_acquire_32(void) +{ + asm volatile ( + "w1 = 0x12345678;" + "*(u32 *)(r10 - 4) = w1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u32 *)(r10 - 4)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -4)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire, 64-bit") +__success __success_unpriv __retval(0x1234567890abcdef) +__naked void load_acquire_64(void) +{ + asm volatile ( + "r1 = 0x1234567890abcdef ll;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r10 - 8)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -8)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire with uninitialized src_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void load_acquire_with_uninitialized_src_reg(void) +{ + asm volatile ( + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r2 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire with non-pointer src_reg") +__failure __failure_unpriv __msg("R1 invalid mem access 'scalar'") +__naked void load_acquire_with_non_pointer_src_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[load_acquire_insn];" // r0 = load_acquire((u64 *)(r1 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("socket") +__description("misaligned load-acquire") +__failure __failure_unpriv __msg("misaligned stack access off") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void load_acquire_misaligned(void) +{ + asm volatile ( + "r1 = 0;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u32 *)(r10 - 5)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_W, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_10, -5)) + : __clobber_all); +} + +SEC("socket") +__description("load-acquire from ctx pointer") +__failure __failure_unpriv __msg("BPF_ATOMIC loads from R1 ctx is not allowed") +__naked void load_acquire_from_ctx_pointer(void) +{ + asm volatile ( + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r1 + 0)); + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("xdp") +__description("load-acquire from pkt pointer") +__failure __msg("BPF_ATOMIC loads from R2 pkt is not allowed") +__naked void load_acquire_from_pkt_pointer(void) +{ + asm volatile ( + "r2 = *(u32 *)(r1 + %[xdp_md_data]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("load-acquire from flow_keys pointer") +__failure __msg("BPF_ATOMIC loads from R2 flow_keys is not allowed") +__naked void load_acquire_from_flow_keys_pointer(void) +{ + asm volatile ( + "r2 = *(u64 *)(r1 + %[__sk_buff_flow_keys]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(__sk_buff_flow_keys, + offsetof(struct __sk_buff, flow_keys)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +SEC("sk_reuseport") +__description("load-acquire from sock pointer") +__failure __msg("BPF_ATOMIC loads from R2 sock is not allowed") +__naked void load_acquire_from_sock_pointer(void) +{ + asm volatile ( + "r2 = *(u64 *)(r1 + %[sk_reuseport_md_sk]);" + ".8byte %[load_acquire_insn];" // w0 = load_acquire((u8 *)(r2 + 0)); + "exit;" + : + : __imm_const(sk_reuseport_md_sk, offsetof(struct sk_reuseport_md, sk)), + __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_B, BPF_LOAD_ACQ, BPF_REG_0, BPF_REG_2, 0)) + : __clobber_all); +} + +#else /* CAN_USE_LOAD_ACQ_STORE_REL */ + +SEC("socket") +__description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support load-acquire, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif /* CAN_USE_LOAD_ACQ_STORE_REL */ + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c index e81097c96fe27..3966d827f2889 100644 --- a/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c +++ b/tools/testing/selftests/bpf/progs/verifier_may_goto_1.c @@ -69,8 +69,38 @@ __naked void may_goto_batch_1(void) } SEC("raw_tp") -__description("may_goto batch with offsets 2/0") +__description("may_goto batch with offsets 2/0 - x86_64") __arch_x86_64 +__xlated("0: *(u64 *)(r10 -16) = 65535") +__xlated("1: *(u64 *)(r10 -8) = 0") +__xlated("2: r11 = *(u64 *)(r10 -16)") +__xlated("3: if r11 == 0x0 goto pc+6") +__xlated("4: r11 -= 1") +__xlated("5: if r11 != 0x0 goto pc+2") +__xlated("6: r11 = -16") +__xlated("7: call unknown") +__xlated("8: *(u64 *)(r10 -16) = r11") +__xlated("9: r0 = 1") +__xlated("10: r0 = 2") +__xlated("11: exit") +__success +__naked void may_goto_batch_2_x86_64(void) +{ + asm volatile ( + ".8byte %[may_goto1];" + ".8byte %[may_goto3];" + "r0 = 1;" + "r0 = 2;" + "exit;" + : + : __imm_insn(may_goto1, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 2 /* offset */, 0)), + __imm_insn(may_goto3, BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0 /* offset */, 0)) + : __clobber_all); +} + +SEC("raw_tp") +__description("may_goto batch with offsets 2/0 - arm64") +__arch_arm64 __xlated("0: *(u64 *)(r10 -8) = 8388608") __xlated("1: r11 = *(u64 *)(r10 -8)") __xlated("2: if r11 == 0x0 goto pc+3") @@ -80,7 +110,7 @@ __xlated("5: r0 = 1") __xlated("6: r0 = 2") __xlated("7: exit") __success -__naked void may_goto_batch_2(void) +__naked void may_goto_batch_2_arm64(void) { asm volatile ( ".8byte %[may_goto1];" diff --git a/tools/testing/selftests/bpf/progs/verifier_precision.c b/tools/testing/selftests/bpf/progs/verifier_precision.c index 6b564d4c09866..6662d4b39969a 100644 --- a/tools/testing/selftests/bpf/progs/verifier_precision.c +++ b/tools/testing/selftests/bpf/progs/verifier_precision.c @@ -2,6 +2,7 @@ /* Copyright (C) 2023 SUSE LLC */ #include #include +#include "../../../include/linux/filter.h" #include "bpf_misc.h" SEC("?raw_tp") @@ -90,6 +91,54 @@ __naked int bpf_end_bswap(void) ::: __clobber_all); } +#if defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r2 stack= before 3: (bf) r3 = r10") +__msg("mark_precise: frame0: regs=r2 stack= before 2: (db) r2 = load_acquire((u64 *)(r10 -8))") +__msg("mark_precise: frame0: regs= stack=-8 before 1: (7b) *(u64 *)(r10 -8) = r1") +__msg("mark_precise: frame0: regs=r1 stack= before 0: (b7) r1 = 8") +__naked int bpf_load_acquire(void) +{ + asm volatile ( + "r1 = 8;" + "*(u64 *)(r10 - 8) = r1;" + ".8byte %[load_acquire_insn];" /* r2 = load_acquire((u64 *)(r10 - 8)); */ + "r3 = r10;" + "r3 += r2;" /* mark_precise */ + "r0 = 0;" + "exit;" + : + : __imm_insn(load_acquire_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_LOAD_ACQ, BPF_REG_2, BPF_REG_10, -8)) + : __clobber_all); +} + +SEC("?raw_tp") +__success __log_level(2) +__msg("mark_precise: frame0: regs=r1 stack= before 3: (bf) r2 = r10") +__msg("mark_precise: frame0: regs=r1 stack= before 2: (79) r1 = *(u64 *)(r10 -8)") +__msg("mark_precise: frame0: regs= stack=-8 before 1: (db) store_release((u64 *)(r10 -8), r1)") +__msg("mark_precise: frame0: regs=r1 stack= before 0: (b7) r1 = 8") +__naked int bpf_store_release(void) +{ + asm volatile ( + "r1 = 8;" + ".8byte %[store_release_insn];" /* store_release((u64 *)(r10 - 8), r1); */ + "r1 = *(u64 *)(r10 - 8);" + "r2 = r10;" + "r2 += r1;" /* mark_precise */ + "r0 = 0;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +#endif /* load-acquire, store-release */ #endif /* v4 instruction */ SEC("?raw_tp") diff --git a/tools/testing/selftests/bpf/progs/verifier_store_release.c b/tools/testing/selftests/bpf/progs/verifier_store_release.c new file mode 100644 index 0000000000000..f1c64c08f25fb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/verifier_store_release.c @@ -0,0 +1,264 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Google LLC. */ + +#include +#include +#include "../../../include/linux/filter.h" +#include "bpf_misc.h" + +#if __clang_major__ >= 18 && defined(ENABLE_ATOMICS_TESTS) && \ + (defined(__TARGET_ARCH_arm64) || defined(__TARGET_ARCH_x86)) + +SEC("socket") +__description("store-release, 8-bit") +__success __success_unpriv __retval(0x12) +__naked void store_release_8(void) +{ + asm volatile ( + "w1 = 0x12;" + ".8byte %[store_release_insn];" // store_release((u8 *)(r10 - 1), w1); + "w0 = *(u8 *)(r10 - 1);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -1)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 16-bit") +__success __success_unpriv __retval(0x1234) +__naked void store_release_16(void) +{ + asm volatile ( + "w1 = 0x1234;" + ".8byte %[store_release_insn];" // store_release((u16 *)(r10 - 2), w1); + "w0 = *(u16 *)(r10 - 2);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_H, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -2)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 32-bit") +__success __success_unpriv __retval(0x12345678) +__naked void store_release_32(void) +{ + asm volatile ( + "w1 = 0x12345678;" + ".8byte %[store_release_insn];" // store_release((u32 *)(r10 - 4), w1); + "w0 = *(u32 *)(r10 - 4);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_W, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -4)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, 64-bit") +__success __success_unpriv __retval(0x1234567890abcdef) +__naked void store_release_64(void) +{ + asm volatile ( + "r1 = 0x1234567890abcdef ll;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r1); + "r0 = *(u64 *)(r10 - 8);" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with uninitialized src_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void store_release_with_uninitialized_src_reg(void) +{ + asm volatile ( + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r2); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_2, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with uninitialized dst_reg") +__failure __failure_unpriv __msg("R2 !read_ok") +__naked void store_release_with_uninitialized_dst_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r2 - 8), r1); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_2, BPF_REG_1, -8)) + : __clobber_all); +} + +SEC("socket") +__description("store-release with non-pointer dst_reg") +__failure __failure_unpriv __msg("R1 invalid mem access 'scalar'") +__naked void store_release_with_non_pointer_dst_reg(void) +{ + asm volatile ( + "r1 = 0;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r1 + 0), r1); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_1, BPF_REG_1, 0)) + : __clobber_all); +} + +SEC("socket") +__description("misaligned store-release") +__failure __failure_unpriv __msg("misaligned stack access off") +__flag(BPF_F_ANY_ALIGNMENT) +__naked void store_release_misaligned(void) +{ + asm volatile ( + "w0 = 0;" + ".8byte %[store_release_insn];" // store_release((u32 *)(r10 - 5), w0); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_W, BPF_STORE_REL, BPF_REG_10, BPF_REG_0, -5)) + : __clobber_all); +} + +SEC("socket") +__description("store-release to ctx pointer") +__failure __failure_unpriv __msg("BPF_ATOMIC stores into R1 ctx is not allowed") +__naked void store_release_to_ctx_pointer(void) +{ + asm volatile ( + "w0 = 0;" + ".8byte %[store_release_insn];" // store_release((u8 *)(r1 + 0), w0); + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_1, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("xdp") +__description("store-release to pkt pointer") +__failure __msg("BPF_ATOMIC stores into R2 pkt is not allowed") +__naked void store_release_to_pkt_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u32 *)(r1 + %[xdp_md_data]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(xdp_md_data, offsetof(struct xdp_md, data)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("flow_dissector") +__description("store-release to flow_keys pointer") +__failure __msg("BPF_ATOMIC stores into R2 flow_keys is not allowed") +__naked void store_release_to_flow_keys_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u64 *)(r1 + %[__sk_buff_flow_keys]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(__sk_buff_flow_keys, + offsetof(struct __sk_buff, flow_keys)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("sk_reuseport") +__description("store-release to sock pointer") +__failure __msg("BPF_ATOMIC stores into R2 sock is not allowed") +__naked void store_release_to_sock_pointer(void) +{ + asm volatile ( + "w0 = 0;" + "r2 = *(u64 *)(r1 + %[sk_reuseport_md_sk]);" + ".8byte %[store_release_insn];" // store_release((u8 *)(r2 + 0), w0); + "exit;" + : + : __imm_const(sk_reuseport_md_sk, offsetof(struct sk_reuseport_md, sk)), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_B, BPF_STORE_REL, BPF_REG_2, BPF_REG_0, 0)) + : __clobber_all); +} + +SEC("socket") +__description("store-release, leak pointer to stack") +__success __success_unpriv __retval(0) +__naked void store_release_leak_pointer_to_stack(void) +{ + asm volatile ( + ".8byte %[store_release_insn];" // store_release((u64 *)(r10 - 8), r1); + "r0 = 0;" + "exit;" + : + : __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_10, BPF_REG_1, -8)) + : __clobber_all); +} + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 1); + __type(key, long long); + __type(value, long long); +} map_hash_8b SEC(".maps"); + +SEC("socket") +__description("store-release, leak pointer to map") +__success __retval(0) +__failure_unpriv __msg_unpriv("R6 leaks addr into map") +__naked void store_release_leak_pointer_to_map(void) +{ + asm volatile ( + "r6 = r1;" + "r1 = %[map_hash_8b] ll;" + "r2 = 0;" + "*(u64 *)(r10 - 8) = r2;" + "r2 = r10;" + "r2 += -8;" + "call %[bpf_map_lookup_elem];" + "if r0 == 0 goto l0_%=;" + ".8byte %[store_release_insn];" // store_release((u64 *)(r0 + 0), r6); +"l0_%=:" + "r0 = 0;" + "exit;" + : + : __imm_addr(map_hash_8b), + __imm(bpf_map_lookup_elem), + __imm_insn(store_release_insn, + BPF_ATOMIC_OP(BPF_DW, BPF_STORE_REL, BPF_REG_0, BPF_REG_6, 0)) + : __clobber_all); +} + +#else + +SEC("socket") +__description("Clang version < 18, ENABLE_ATOMICS_TESTS not defined, and/or JIT doesn't support store-release, use a dummy test") +__success +int dummy_test(void) +{ + return 0; +} + +#endif + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_loader.c b/tools/testing/selftests/bpf/test_loader.c index 4d23a9c463ee2..49f2fc61061f5 100644 --- a/tools/testing/selftests/bpf/test_loader.c +++ b/tools/testing/selftests/bpf/test_loader.c @@ -793,7 +793,7 @@ static int drop_capabilities(struct cap_state *caps) err = cap_disable_effective(caps_to_drop, &caps->old_caps); if (err) { - PRINT_FAIL("failed to drop capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to drop capabilities: %i, %s\n", err, strerror(-err)); return err; } @@ -810,7 +810,7 @@ static int restore_capabilities(struct cap_state *caps) err = cap_enable_effective(caps->old_caps, NULL); if (err) - PRINT_FAIL("failed to restore capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to restore capabilities: %i, %s\n", err, strerror(-err)); caps->initialized = false; return err; } @@ -985,7 +985,7 @@ void run_subtest(struct test_loader *tester, if (subspec->caps) { err = cap_enable_effective(subspec->caps, NULL); if (err) { - PRINT_FAIL("failed to set capabilities: %i, %s\n", err, strerror(err)); + PRINT_FAIL("failed to set capabilities: %i, %s\n", err, strerror(-err)); goto subtest_cleanup; } } diff --git a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh b/tools/testing/selftests/bpf/test_lwt_ip_encap.sh deleted file mode 100755 index 1e565f47aca94..0000000000000 --- a/tools/testing/selftests/bpf/test_lwt_ip_encap.sh +++ /dev/null @@ -1,476 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Setup/topology: -# -# NS1 NS2 NS3 -# veth1 <---> veth2 veth3 <---> veth4 (the top route) -# veth5 <---> veth6 veth7 <---> veth8 (the bottom route) -# -# each vethN gets IPv[4|6]_N address -# -# IPv*_SRC = IPv*_1 -# IPv*_DST = IPv*_4 -# -# all tests test pings from IPv*_SRC to IPv*_DST -# -# by default, routes are configured to allow packets to go -# IP*_1 <=> IP*_2 <=> IP*_3 <=> IP*_4 (the top route) -# -# a GRE device is installed in NS3 with IPv*_GRE, and -# NS1/NS2 are configured to route packets to IPv*_GRE via IP*_8 -# (the bottom route) -# -# Tests: -# -# 1. routes NS2->IPv*_DST are brought down, so the only way a ping -# from IP*_SRC to IP*_DST can work is via IPv*_GRE -# -# 2a. in an egress test, a bpf LWT_XMIT program is installed on veth1 -# that encaps the packets with an IP/GRE header to route to IPv*_GRE -# -# ping: SRC->[encap at veth1:egress]->GRE:decap->DST -# ping replies go DST->SRC directly -# -# 2b. in an ingress test, a bpf LWT_IN program is installed on veth2 -# that encaps the packets with an IP/GRE header to route to IPv*_GRE -# -# ping: SRC->[encap at veth2:ingress]->GRE:decap->DST -# ping replies go DST->SRC directly - -BPF_FILE="test_lwt_ip_encap.bpf.o" -if [[ $EUID -ne 0 ]]; then - echo "This script must be run as root" - echo "FAIL" - exit 1 -fi - -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" - -readonly IPv4_1="172.16.1.100" -readonly IPv4_2="172.16.2.100" -readonly IPv4_3="172.16.3.100" -readonly IPv4_4="172.16.4.100" -readonly IPv4_5="172.16.5.100" -readonly IPv4_6="172.16.6.100" -readonly IPv4_7="172.16.7.100" -readonly IPv4_8="172.16.8.100" -readonly IPv4_GRE="172.16.16.100" - -readonly IPv4_SRC=$IPv4_1 -readonly IPv4_DST=$IPv4_4 - -readonly IPv6_1="fb01::1" -readonly IPv6_2="fb02::1" -readonly IPv6_3="fb03::1" -readonly IPv6_4="fb04::1" -readonly IPv6_5="fb05::1" -readonly IPv6_6="fb06::1" -readonly IPv6_7="fb07::1" -readonly IPv6_8="fb08::1" -readonly IPv6_GRE="fb10::1" - -readonly IPv6_SRC=$IPv6_1 -readonly IPv6_DST=$IPv6_4 - -TEST_STATUS=0 -TESTS_SUCCEEDED=0 -TESTS_FAILED=0 - -TMPFILE="" - -process_test_results() -{ - if [[ "${TEST_STATUS}" -eq 0 ]] ; then - echo "PASS" - TESTS_SUCCEEDED=$((TESTS_SUCCEEDED+1)) - else - echo "FAIL" - TESTS_FAILED=$((TESTS_FAILED+1)) - fi -} - -print_test_summary_and_exit() -{ - echo "passed tests: ${TESTS_SUCCEEDED}" - echo "failed tests: ${TESTS_FAILED}" - if [ "${TESTS_FAILED}" -eq "0" ] ; then - exit 0 - else - exit 1 - fi -} - -setup() -{ - set -e # exit on error - TEST_STATUS=0 - - # create devices and namespaces - ip netns add "${NS1}" - ip netns add "${NS2}" - ip netns add "${NS3}" - - # rp_filter gets confused by what these tests are doing, so disable it - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.all.rp_filter=0 - ip netns exec ${NS1} sysctl -wq net.ipv4.conf.default.rp_filter=0 - ip netns exec ${NS2} sysctl -wq net.ipv4.conf.default.rp_filter=0 - ip netns exec ${NS3} sysctl -wq net.ipv4.conf.default.rp_filter=0 - - # disable IPv6 DAD because it sometimes takes too long and fails tests - ip netns exec ${NS1} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS3} sysctl -wq net.ipv6.conf.all.accept_dad=0 - ip netns exec ${NS1} sysctl -wq net.ipv6.conf.default.accept_dad=0 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.default.accept_dad=0 - ip netns exec ${NS3} sysctl -wq net.ipv6.conf.default.accept_dad=0 - - ip link add veth1 type veth peer name veth2 - ip link add veth3 type veth peer name veth4 - ip link add veth5 type veth peer name veth6 - ip link add veth7 type veth peer name veth8 - - ip netns exec ${NS2} sysctl -wq net.ipv4.ip_forward=1 - ip netns exec ${NS2} sysctl -wq net.ipv6.conf.all.forwarding=1 - - ip link set veth1 netns ${NS1} - ip link set veth2 netns ${NS2} - ip link set veth3 netns ${NS2} - ip link set veth4 netns ${NS3} - ip link set veth5 netns ${NS1} - ip link set veth6 netns ${NS2} - ip link set veth7 netns ${NS2} - ip link set veth8 netns ${NS3} - - if [ ! -z "${VRF}" ] ; then - ip -netns ${NS1} link add red type vrf table 1001 - ip -netns ${NS1} link set red up - ip -netns ${NS1} route add table 1001 unreachable default metric 8192 - ip -netns ${NS1} -6 route add table 1001 unreachable default metric 8192 - ip -netns ${NS1} link set veth1 vrf red - ip -netns ${NS1} link set veth5 vrf red - - ip -netns ${NS2} link add red type vrf table 1001 - ip -netns ${NS2} link set red up - ip -netns ${NS2} route add table 1001 unreachable default metric 8192 - ip -netns ${NS2} -6 route add table 1001 unreachable default metric 8192 - ip -netns ${NS2} link set veth2 vrf red - ip -netns ${NS2} link set veth3 vrf red - ip -netns ${NS2} link set veth6 vrf red - ip -netns ${NS2} link set veth7 vrf red - fi - - # configure addesses: the top route (1-2-3-4) - ip -netns ${NS1} addr add ${IPv4_1}/24 dev veth1 - ip -netns ${NS2} addr add ${IPv4_2}/24 dev veth2 - ip -netns ${NS2} addr add ${IPv4_3}/24 dev veth3 - ip -netns ${NS3} addr add ${IPv4_4}/24 dev veth4 - ip -netns ${NS1} -6 addr add ${IPv6_1}/128 nodad dev veth1 - ip -netns ${NS2} -6 addr add ${IPv6_2}/128 nodad dev veth2 - ip -netns ${NS2} -6 addr add ${IPv6_3}/128 nodad dev veth3 - ip -netns ${NS3} -6 addr add ${IPv6_4}/128 nodad dev veth4 - - # configure addresses: the bottom route (5-6-7-8) - ip -netns ${NS1} addr add ${IPv4_5}/24 dev veth5 - ip -netns ${NS2} addr add ${IPv4_6}/24 dev veth6 - ip -netns ${NS2} addr add ${IPv4_7}/24 dev veth7 - ip -netns ${NS3} addr add ${IPv4_8}/24 dev veth8 - ip -netns ${NS1} -6 addr add ${IPv6_5}/128 nodad dev veth5 - ip -netns ${NS2} -6 addr add ${IPv6_6}/128 nodad dev veth6 - ip -netns ${NS2} -6 addr add ${IPv6_7}/128 nodad dev veth7 - ip -netns ${NS3} -6 addr add ${IPv6_8}/128 nodad dev veth8 - - ip -netns ${NS1} link set dev veth1 up - ip -netns ${NS2} link set dev veth2 up - ip -netns ${NS2} link set dev veth3 up - ip -netns ${NS3} link set dev veth4 up - ip -netns ${NS1} link set dev veth5 up - ip -netns ${NS2} link set dev veth6 up - ip -netns ${NS2} link set dev veth7 up - ip -netns ${NS3} link set dev veth8 up - - # configure routes: IP*_SRC -> veth1/IP*_2 (= top route) default; - # the bottom route to specific bottom addresses - - # NS1 - # top route - ip -netns ${NS1} route add ${IPv4_2}/32 dev veth1 ${VRF} - ip -netns ${NS1} route add default dev veth1 via ${IPv4_2} ${VRF} # go top by default - ip -netns ${NS1} -6 route add ${IPv6_2}/128 dev veth1 ${VRF} - ip -netns ${NS1} -6 route add default dev veth1 via ${IPv6_2} ${VRF} # go top by default - # bottom route - ip -netns ${NS1} route add ${IPv4_6}/32 dev veth5 ${VRF} - ip -netns ${NS1} route add ${IPv4_7}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS1} route add ${IPv4_8}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_6}/128 dev veth5 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_7}/128 dev veth5 via ${IPv6_6} ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_8}/128 dev veth5 via ${IPv6_6} ${VRF} - - # NS2 - # top route - ip -netns ${NS2} route add ${IPv4_1}/32 dev veth2 ${VRF} - ip -netns ${NS2} route add ${IPv4_4}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_1}/128 dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_4}/128 dev veth3 ${VRF} - # bottom route - ip -netns ${NS2} route add ${IPv4_5}/32 dev veth6 ${VRF} - ip -netns ${NS2} route add ${IPv4_8}/32 dev veth7 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_5}/128 dev veth6 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_8}/128 dev veth7 ${VRF} - - # NS3 - # top route - ip -netns ${NS3} route add ${IPv4_3}/32 dev veth4 - ip -netns ${NS3} route add ${IPv4_1}/32 dev veth4 via ${IPv4_3} - ip -netns ${NS3} route add ${IPv4_2}/32 dev veth4 via ${IPv4_3} - ip -netns ${NS3} -6 route add ${IPv6_3}/128 dev veth4 - ip -netns ${NS3} -6 route add ${IPv6_1}/128 dev veth4 via ${IPv6_3} - ip -netns ${NS3} -6 route add ${IPv6_2}/128 dev veth4 via ${IPv6_3} - # bottom route - ip -netns ${NS3} route add ${IPv4_7}/32 dev veth8 - ip -netns ${NS3} route add ${IPv4_5}/32 dev veth8 via ${IPv4_7} - ip -netns ${NS3} route add ${IPv4_6}/32 dev veth8 via ${IPv4_7} - ip -netns ${NS3} -6 route add ${IPv6_7}/128 dev veth8 - ip -netns ${NS3} -6 route add ${IPv6_5}/128 dev veth8 via ${IPv6_7} - ip -netns ${NS3} -6 route add ${IPv6_6}/128 dev veth8 via ${IPv6_7} - - # configure IPv4 GRE device in NS3, and a route to it via the "bottom" route - ip -netns ${NS3} tunnel add gre_dev mode gre remote ${IPv4_1} local ${IPv4_GRE} ttl 255 - ip -netns ${NS3} link set gre_dev up - ip -netns ${NS3} addr add ${IPv4_GRE} dev gre_dev - ip -netns ${NS1} route add ${IPv4_GRE}/32 dev veth5 via ${IPv4_6} ${VRF} - ip -netns ${NS2} route add ${IPv4_GRE}/32 dev veth7 via ${IPv4_8} ${VRF} - - - # configure IPv6 GRE device in NS3, and a route to it via the "bottom" route - ip -netns ${NS3} -6 tunnel add name gre6_dev mode ip6gre remote ${IPv6_1} local ${IPv6_GRE} ttl 255 - ip -netns ${NS3} link set gre6_dev up - ip -netns ${NS3} -6 addr add ${IPv6_GRE} nodad dev gre6_dev - ip -netns ${NS1} -6 route add ${IPv6_GRE}/128 dev veth5 via ${IPv6_6} ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_GRE}/128 dev veth7 via ${IPv6_8} ${VRF} - - TMPFILE=$(mktemp /tmp/test_lwt_ip_encap.XXXXXX) - - sleep 1 # reduce flakiness - set +e -} - -cleanup() -{ - if [ -f ${TMPFILE} ] ; then - rm ${TMPFILE} - fi - - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null -} - -trap cleanup EXIT - -remove_routes_to_gredev() -{ - ip -netns ${NS1} route del ${IPv4_GRE} dev veth5 ${VRF} - ip -netns ${NS2} route del ${IPv4_GRE} dev veth7 ${VRF} - ip -netns ${NS1} -6 route del ${IPv6_GRE}/128 dev veth5 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_GRE}/128 dev veth7 ${VRF} -} - -add_unreachable_routes_to_gredev() -{ - ip -netns ${NS1} route add unreachable ${IPv4_GRE}/32 ${VRF} - ip -netns ${NS2} route add unreachable ${IPv4_GRE}/32 ${VRF} - ip -netns ${NS1} -6 route add unreachable ${IPv6_GRE}/128 ${VRF} - ip -netns ${NS2} -6 route add unreachable ${IPv6_GRE}/128 ${VRF} -} - -test_ping() -{ - local readonly PROTO=$1 - local readonly EXPECTED=$2 - local RET=0 - - if [ "${PROTO}" == "IPv4" ] ; then - ip netns exec ${NS1} ping -c 1 -W 1 -I veth1 ${IPv4_DST} 2>&1 > /dev/null - RET=$? - elif [ "${PROTO}" == "IPv6" ] ; then - ip netns exec ${NS1} ping6 -c 1 -W 1 -I veth1 ${IPv6_DST} 2>&1 > /dev/null - RET=$? - else - echo " test_ping: unknown PROTO: ${PROTO}" - TEST_STATUS=1 - fi - - if [ "0" != "${RET}" ]; then - RET=1 - fi - - if [ "${EXPECTED}" != "${RET}" ] ; then - echo " test_ping failed: expected: ${EXPECTED}; got ${RET}" - TEST_STATUS=1 - fi -} - -test_gso() -{ - local readonly PROTO=$1 - local readonly PKT_SZ=5000 - local IP_DST="" - : > ${TMPFILE} # trim the capture file - - # check that nc is present - command -v nc >/dev/null 2>&1 || \ - { echo >&2 "nc is not available: skipping TSO tests"; return; } - - # listen on port 9000, capture TCP into $TMPFILE - if [ "${PROTO}" == "IPv4" ] ; then - IP_DST=${IPv4_DST} - ip netns exec ${NS3} bash -c \ - "nc -4 -l -p 9000 > ${TMPFILE} &" - elif [ "${PROTO}" == "IPv6" ] ; then - IP_DST=${IPv6_DST} - ip netns exec ${NS3} bash -c \ - "nc -6 -l -p 9000 > ${TMPFILE} &" - RET=$? - else - echo " test_gso: unknown PROTO: ${PROTO}" - TEST_STATUS=1 - fi - sleep 1 # let nc start listening - - # send a packet larger than MTU - ip netns exec ${NS1} bash -c \ - "dd if=/dev/zero bs=$PKT_SZ count=1 > /dev/tcp/${IP_DST}/9000 2>/dev/null" - sleep 2 # let the packet get delivered - - # verify we received all expected bytes - SZ=$(stat -c %s ${TMPFILE}) - if [ "$SZ" != "$PKT_SZ" ] ; then - echo " test_gso failed: ${PROTO}" - TEST_STATUS=1 - fi -} - -test_egress() -{ - local readonly ENCAP=$1 - echo "starting egress ${ENCAP} encap test ${VRF}" - setup - - # by default, pings work - test_ping IPv4 0 - test_ping IPv6 0 - - # remove NS2->DST routes, ping fails - ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF} - test_ping IPv4 1 - test_ping IPv6 1 - - # install replacement routes (LWT/eBPF), pings succeed - if [ "${ENCAP}" == "IPv4" ] ; then - ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre dev veth1 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre dev veth1 ${VRF} - elif [ "${ENCAP}" == "IPv6" ] ; then - ip -netns ${NS1} route add ${IPv4_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre6 dev veth1 ${VRF} - ip -netns ${NS1} -6 route add ${IPv6_DST} encap bpf xmit obj \ - ${BPF_FILE} sec encap_gre6 dev veth1 ${VRF} - else - echo " unknown encap ${ENCAP}" - TEST_STATUS=1 - fi - test_ping IPv4 0 - test_ping IPv6 0 - - # skip GSO tests with VRF: VRF routing needs properly assigned - # source IP/device, which is easy to do with ping and hard with dd/nc. - if [ -z "${VRF}" ] ; then - test_gso IPv4 - test_gso IPv6 - fi - - # a negative test: remove routes to GRE devices: ping fails - remove_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - # another negative test - add_unreachable_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - cleanup - process_test_results -} - -test_ingress() -{ - local readonly ENCAP=$1 - echo "starting ingress ${ENCAP} encap test ${VRF}" - setup - - # need to wait a bit for IPv6 to autoconf, otherwise - # ping6 sometimes fails with "unable to bind to address" - - # by default, pings work - test_ping IPv4 0 - test_ping IPv6 0 - - # remove NS2->DST routes, pings fail - ip -netns ${NS2} route del ${IPv4_DST}/32 dev veth3 ${VRF} - ip -netns ${NS2} -6 route del ${IPv6_DST}/128 dev veth3 ${VRF} - test_ping IPv4 1 - test_ping IPv6 1 - - # install replacement routes (LWT/eBPF), pings succeed - if [ "${ENCAP}" == "IPv4" ] ; then - ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre dev veth2 ${VRF} - elif [ "${ENCAP}" == "IPv6" ] ; then - ip -netns ${NS2} route add ${IPv4_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre6 dev veth2 ${VRF} - ip -netns ${NS2} -6 route add ${IPv6_DST} encap bpf in obj \ - ${BPF_FILE} sec encap_gre6 dev veth2 ${VRF} - else - echo "FAIL: unknown encap ${ENCAP}" - TEST_STATUS=1 - fi - test_ping IPv4 0 - test_ping IPv6 0 - - # a negative test: remove routes to GRE devices: ping fails - remove_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - # another negative test - add_unreachable_routes_to_gredev - test_ping IPv4 1 - test_ping IPv6 1 - - cleanup - process_test_results -} - -VRF="" -test_egress IPv4 -test_egress IPv6 -test_ingress IPv4 -test_ingress IPv6 - -VRF="vrf red" -test_egress IPv4 -test_egress IPv6 -test_ingress IPv4 -test_ingress IPv6 - -print_test_summary_and_exit diff --git a/tools/testing/selftests/bpf/test_lwt_seg6local.sh b/tools/testing/selftests/bpf/test_lwt_seg6local.sh deleted file mode 100755 index 0efea2292d6aa..0000000000000 --- a/tools/testing/selftests/bpf/test_lwt_seg6local.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash -# Connects 6 network namespaces through veths. -# Each NS may have different IPv6 global scope addresses : -# NS1 ---- NS2 ---- NS3 ---- NS4 ---- NS5 ---- NS6 -# fb00::1 fd00::1 fd00::2 fd00::3 fb00::6 -# fc42::1 fd00::4 -# -# All IPv6 packets going to fb00::/16 through NS2 will be encapsulated in a -# IPv6 header with a Segment Routing Header, with segments : -# fd00::1 -> fd00::2 -> fd00::3 -> fd00::4 -# -# 3 fd00::/16 IPv6 addresses are binded to seg6local End.BPF actions : -# - fd00::1 : add a TLV, change the flags and apply a End.X action to fc42::1 -# - fd00::2 : remove the TLV, change the flags, add a tag -# - fd00::3 : apply an End.T action to fd00::4, through routing table 117 -# -# fd00::4 is a simple Segment Routing node decapsulating the inner IPv6 packet. -# Each End.BPF action will validate the operations applied on the SRH by the -# previous BPF program in the chain, otherwise the packet is dropped. -# -# An UDP datagram is sent from fb00::1 to fb00::6. The test succeeds if this -# datagram can be read on NS6 when binding to fb00::6. - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 -BPF_FILE="test_lwt_seg6local.bpf.o" -readonly NS1="ns1-$(mktemp -u XXXXXX)" -readonly NS2="ns2-$(mktemp -u XXXXXX)" -readonly NS3="ns3-$(mktemp -u XXXXXX)" -readonly NS4="ns4-$(mktemp -u XXXXXX)" -readonly NS5="ns5-$(mktemp -u XXXXXX)" -readonly NS6="ns6-$(mktemp -u XXXXXX)" - -msg="skip all tests:" -if [ $UID != 0 ]; then - echo $msg please run this as root >&2 - exit $ksft_skip -fi - -TMP_FILE="/tmp/selftest_lwt_seg6local.txt" - -cleanup() -{ - if [ "$?" = "0" ]; then - echo "selftests: test_lwt_seg6local [PASS]"; - else - echo "selftests: test_lwt_seg6local [FAILED]"; - fi - - set +e - ip netns del ${NS1} 2> /dev/null - ip netns del ${NS2} 2> /dev/null - ip netns del ${NS3} 2> /dev/null - ip netns del ${NS4} 2> /dev/null - ip netns del ${NS5} 2> /dev/null - ip netns del ${NS6} 2> /dev/null - rm -f $TMP_FILE -} - -set -e - -ip netns add ${NS1} -ip netns add ${NS2} -ip netns add ${NS3} -ip netns add ${NS4} -ip netns add ${NS5} -ip netns add ${NS6} - -trap cleanup 0 2 3 6 9 - -ip link add veth1 type veth peer name veth2 -ip link add veth3 type veth peer name veth4 -ip link add veth5 type veth peer name veth6 -ip link add veth7 type veth peer name veth8 -ip link add veth9 type veth peer name veth10 - -ip link set veth1 netns ${NS1} -ip link set veth2 netns ${NS2} -ip link set veth3 netns ${NS2} -ip link set veth4 netns ${NS3} -ip link set veth5 netns ${NS3} -ip link set veth6 netns ${NS4} -ip link set veth7 netns ${NS4} -ip link set veth8 netns ${NS5} -ip link set veth9 netns ${NS5} -ip link set veth10 netns ${NS6} - -ip netns exec ${NS1} ip link set dev veth1 up -ip netns exec ${NS2} ip link set dev veth2 up -ip netns exec ${NS2} ip link set dev veth3 up -ip netns exec ${NS3} ip link set dev veth4 up -ip netns exec ${NS3} ip link set dev veth5 up -ip netns exec ${NS4} ip link set dev veth6 up -ip netns exec ${NS4} ip link set dev veth7 up -ip netns exec ${NS5} ip link set dev veth8 up -ip netns exec ${NS5} ip link set dev veth9 up -ip netns exec ${NS6} ip link set dev veth10 up -ip netns exec ${NS6} ip link set dev lo up - -# All link scope addresses and routes required between veths -ip netns exec ${NS1} ip -6 addr add fb00::12/16 dev veth1 scope link -ip netns exec ${NS1} ip -6 route add fb00::21 dev veth1 scope link -ip netns exec ${NS2} ip -6 addr add fb00::21/16 dev veth2 scope link -ip netns exec ${NS2} ip -6 addr add fb00::34/16 dev veth3 scope link -ip netns exec ${NS2} ip -6 route add fb00::43 dev veth3 scope link -ip netns exec ${NS3} ip -6 route add fb00::65 dev veth5 scope link -ip netns exec ${NS3} ip -6 addr add fb00::43/16 dev veth4 scope link -ip netns exec ${NS3} ip -6 addr add fb00::56/16 dev veth5 scope link -ip netns exec ${NS4} ip -6 addr add fb00::65/16 dev veth6 scope link -ip netns exec ${NS4} ip -6 addr add fb00::78/16 dev veth7 scope link -ip netns exec ${NS4} ip -6 route add fb00::87 dev veth7 scope link -ip netns exec ${NS5} ip -6 addr add fb00::87/16 dev veth8 scope link -ip netns exec ${NS5} ip -6 addr add fb00::910/16 dev veth9 scope link -ip netns exec ${NS5} ip -6 route add fb00::109 dev veth9 scope link -ip netns exec ${NS5} ip -6 route add fb00::109 table 117 dev veth9 scope link -ip netns exec ${NS6} ip -6 addr add fb00::109/16 dev veth10 scope link - -ip netns exec ${NS1} ip -6 addr add fb00::1/16 dev lo -ip netns exec ${NS1} ip -6 route add fb00::6 dev veth1 via fb00::21 - -ip netns exec ${NS2} ip -6 route add fb00::6 encap bpf in obj ${BPF_FILE} sec encap_srh dev veth2 -ip netns exec ${NS2} ip -6 route add fd00::1 dev veth3 via fb00::43 scope link - -ip netns exec ${NS3} ip -6 route add fc42::1 dev veth5 via fb00::65 -ip netns exec ${NS3} ip -6 route add fd00::1 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec add_egr_x dev veth4 - -ip netns exec ${NS4} ip -6 route add fd00::2 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec pop_egr dev veth6 -ip netns exec ${NS4} ip -6 addr add fc42::1 dev lo -ip netns exec ${NS4} ip -6 route add fd00::3 dev veth7 via fb00::87 - -ip netns exec ${NS5} ip -6 route add fd00::4 table 117 dev veth9 via fb00::109 -ip netns exec ${NS5} ip -6 route add fd00::3 encap seg6local action End.BPF endpoint obj ${BPF_FILE} sec inspect_t dev veth8 - -ip netns exec ${NS6} ip -6 addr add fb00::6/16 dev lo -ip netns exec ${NS6} ip -6 addr add fd00::4/16 dev lo - -ip netns exec ${NS1} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS2} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS3} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS4} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null -ip netns exec ${NS5} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null - -ip netns exec ${NS6} sysctl net.ipv6.conf.all.seg6_enabled=1 > /dev/null -ip netns exec ${NS6} sysctl net.ipv6.conf.lo.seg6_enabled=1 > /dev/null -ip netns exec ${NS6} sysctl net.ipv6.conf.veth10.seg6_enabled=1 > /dev/null - -ip netns exec ${NS6} nc -l -6 -u -d 7330 > $TMP_FILE & -ip netns exec ${NS1} bash -c "echo 'foobar' | nc -w0 -6 -u -p 2121 -s fb00::1 fb00::6 7330" -sleep 5 # wait enough time to ensure the UDP datagram arrived to the last segment -kill -TERM $! - -if [[ $(< $TMP_FILE) != "foobar" ]]; then - exit 1 -fi - -exit 0 diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 8b40e9496af16..986ce32b113a3 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1396,9 +1396,10 @@ static void test_map_stress(void) #define MAX_DELAY_US 50000 #define MIN_DELAY_RANGE_US 5000 -static bool retry_for_again_or_busy(int err) +static bool can_retry(int err) { - return (err == EAGAIN || err == EBUSY); + return (err == EAGAIN || err == EBUSY || + (err == ENOMEM && map_opts.map_flags == BPF_F_NO_PREALLOC)); } int map_update_retriable(int map_fd, const void *key, const void *value, int flags, int attempts, @@ -1451,12 +1452,12 @@ static void test_update_delete(unsigned int fn, void *data) if (do_update) { err = map_update_retriable(fd, &key, &value, BPF_NOEXIST, MAP_RETRIES, - retry_for_again_or_busy); + can_retry); if (err) printf("error %d %d\n", err, errno); assert(err == 0); err = map_update_retriable(fd, &key, &value, BPF_EXIST, MAP_RETRIES, - retry_for_again_or_busy); + can_retry); if (err) printf("error %d %d\n", err, errno); assert(err == 0); diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 0cb759632225d..d4ec9586b98ca 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -88,7 +88,9 @@ static void stdio_hijack(char **log_buf, size_t *log_cnt) #endif } -static void stdio_restore_cleanup(void) +static pthread_mutex_t stdout_lock = PTHREAD_MUTEX_INITIALIZER; + +static void stdio_restore(void) { #ifdef __GLIBC__ if (verbose() && env.worker_id == -1) { @@ -98,6 +100,8 @@ static void stdio_restore_cleanup(void) fflush(stdout); + pthread_mutex_lock(&stdout_lock); + if (env.subtest_state) { fclose(env.subtest_state->stdout_saved); env.subtest_state->stdout_saved = NULL; @@ -106,26 +110,21 @@ static void stdio_restore_cleanup(void) } else { fclose(env.test_state->stdout_saved); env.test_state->stdout_saved = NULL; + stdout = env.stdout_saved; + stderr = env.stderr_saved; } + + pthread_mutex_unlock(&stdout_lock); #endif } -static void stdio_restore(void) +static int traffic_monitor_print_fn(const char *format, va_list args) { -#ifdef __GLIBC__ - if (verbose() && env.worker_id == -1) { - /* nothing to do, output to stdout by default */ - return; - } - - if (stdout == env.stdout_saved) - return; - - stdio_restore_cleanup(); + pthread_mutex_lock(&stdout_lock); + vfprintf(stdout, format, args); + pthread_mutex_unlock(&stdout_lock); - stdout = env.stdout_saved; - stderr = env.stderr_saved; -#endif + return 0; } /* Adapted from perf/util/string.c */ @@ -474,8 +473,6 @@ static void dump_test_log(const struct prog_test_def *test, print_test_result(test, test_state); } -static void stdio_restore(void); - /* A bunch of tests set custom affinity per-thread and/or per-process. Reset * it after each test/sub-test. */ @@ -490,13 +487,11 @@ static void reset_affinity(void) err = sched_setaffinity(0, sizeof(cpuset), &cpuset); if (err < 0) { - stdio_restore(); fprintf(stderr, "Failed to reset process affinity: %d!\n", err); exit(EXIT_ERR_SETUP_INFRA); } err = pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); if (err < 0) { - stdio_restore(); fprintf(stderr, "Failed to reset thread affinity: %d!\n", err); exit(EXIT_ERR_SETUP_INFRA); } @@ -514,7 +509,6 @@ static void save_netns(void) static void restore_netns(void) { if (setns(env.saved_netns_fd, CLONE_NEWNET) == -1) { - stdio_restore(); perror("setns(CLONE_NEWNS)"); exit(EXIT_ERR_SETUP_INFRA); } @@ -541,7 +535,8 @@ void test__end_subtest(void) test_result(subtest_state->error_cnt, subtest_state->skipped)); - stdio_restore_cleanup(); + stdio_restore(); + env.subtest_state = NULL; } @@ -1270,8 +1265,10 @@ void crash_handler(int signum) sz = backtrace(bt, ARRAY_SIZE(bt)); - if (env.stdout_saved) - stdio_restore(); + fflush(stdout); + stdout = env.stdout_saved; + stderr = env.stderr_saved; + if (env.test) { env.test_state->error_cnt++; dump_test_log(env.test, env.test_state, true, false, NULL); @@ -1400,6 +1397,8 @@ static void run_one_test(int test_num) state->tested = true; + stdio_restore(); + if (verbose() && env.worker_id == -1) print_test_result(test, state); @@ -1408,7 +1407,6 @@ static void run_one_test(int test_num) if (test->need_cgroup_cleanup) cleanup_cgroup_environment(); - stdio_restore(); free(stop_libbpf_log_capture()); dump_test_log(test, state, false, false, NULL); @@ -1943,6 +1941,9 @@ int main(int argc, char **argv) sigaction(SIGSEGV, &sigact, NULL); + env.stdout_saved = stdout; + env.stderr_saved = stderr; + env.secs_till_notify = 10; env.secs_till_kill = 120; err = argp_parse(&argp, argc, argv, 0, NULL, &env); @@ -1959,6 +1960,8 @@ int main(int argc, char **argv) libbpf_set_strict_mode(LIBBPF_STRICT_ALL); libbpf_set_print(libbpf_print_fn); + traffic_monitor_set_print(traffic_monitor_print_fn); + srand(time(NULL)); env.jit_enabled = is_jit_enabled(); @@ -1969,9 +1972,6 @@ int main(int argc, char **argv) return -1; } - env.stdout_saved = stdout; - env.stderr_saved = stderr; - env.has_testmod = true; if (!env.list_test_names) { /* ensure previous instance of the module is unloaded */ diff --git a/tools/testing/selftests/bpf/test_tunnel.sh b/tools/testing/selftests/bpf/test_tunnel.sh deleted file mode 100755 index d9661b9988ba9..0000000000000 --- a/tools/testing/selftests/bpf/test_tunnel.sh +++ /dev/null @@ -1,645 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# End-to-end eBPF tunnel test suite -# The script tests BPF network tunnel implementation. -# -# Topology: -# --------- -# root namespace | at_ns0 namespace -# | -# ----------- | ----------- -# | tnl dev | | | tnl dev | (overlay network) -# ----------- | ----------- -# metadata-mode | native-mode -# with bpf | -# | -# ---------- | ---------- -# | veth1 | --------- | veth0 | (underlay network) -# ---------- peer ---------- -# -# -# Device Configuration -# -------------------- -# Root namespace with metadata-mode tunnel + BPF -# Device names and addresses: -# veth1 IP: 172.16.1.200, IPv6: 00::22 (underlay) -# tunnel dev 11, ex: gre11, IPv4: 10.1.1.200, IPv6: 1::22 (overlay) -# -# Namespace at_ns0 with native tunnel -# Device names and addresses: -# veth0 IPv4: 172.16.1.100, IPv6: 00::11 (underlay) -# tunnel dev 00, ex: gre00, IPv4: 10.1.1.100, IPv6: 1::11 (overlay) -# -# -# End-to-end ping packet flow -# --------------------------- -# Most of the tests start by namespace creation, device configuration, -# then ping the underlay and overlay network. When doing 'ping 10.1.1.100' -# from root namespace, the following operations happen: -# 1) Route lookup shows 10.1.1.100/24 belongs to tnl dev, fwd to tnl dev. -# 2) Tnl device's egress BPF program is triggered and set the tunnel metadata, -# with remote_ip=172.16.1.100 and others. -# 3) Outer tunnel header is prepended and route the packet to veth1's egress -# 4) veth0's ingress queue receive the tunneled packet at namespace at_ns0 -# 5) Tunnel protocol handler, ex: vxlan_rcv, decap the packet -# 6) Forward the packet to the overlay tnl dev - -BPF_FILE="test_tunnel_kern.bpf.o" -BPF_PIN_TUNNEL_DIR="/sys/fs/bpf/tc/tunnel" -PING_ARG="-c 3 -w 10 -q" -ret=0 -GREEN='\033[0;92m' -RED='\033[0;31m' -NC='\033[0m' # No Color - -config_device() -{ - ip netns add at_ns0 - ip link add veth0 type veth peer name veth1 - ip link set veth0 netns at_ns0 - ip netns exec at_ns0 ip addr add 172.16.1.100/24 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip link set dev veth1 up mtu 1500 - ip addr add dev veth1 172.16.1.200/24 -} - -add_gre_tunnel() -{ - tun_key= - if [ -n "$1" ]; then - tun_key="key $1" - fi - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq $tun_key \ - local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE $tun_key external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -add_ip6gretap_tunnel() -{ - - # assign ipv6 address - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq flowlabel 0xbcdef key 2 \ - local ::11 remote ::22 - - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip addr add dev $DEV_NS fc80::100/96 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip addr add dev $DEV fc80::200/24 - ip link set dev $DEV up -} - -add_erspan_tunnel() -{ - # at_ns0 namespace - if [ "$1" == "v1" ]; then - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local 172.16.1.100 remote 172.16.1.200 \ - erspan_ver 1 erspan 123 - else - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local 172.16.1.100 remote 172.16.1.200 \ - erspan_ver 2 erspan_dir egress erspan_hwid 3 - fi - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -add_ip6erspan_tunnel() -{ - - # assign ipv6 address - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - if [ "$1" == "v1" ]; then - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local ::11 remote ::22 \ - erspan_ver 1 erspan 123 - else - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE seq key 2 \ - local ::11 remote ::22 \ - erspan_ver 2 erspan_dir egress erspan_hwid 7 - fi - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip link set dev $DEV up -} - -add_geneve_tunnel() -{ - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - id 2 dstport 6081 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE dstport 6081 external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -add_ip6geneve_tunnel() -{ - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE id 22 \ - remote ::22 # geneve has no local option - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip link set dev $DEV up -} - -add_ipip_tunnel() -{ - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - local 172.16.1.100 remote 172.16.1.200 - ip netns exec at_ns0 ip link set dev $DEV_NS up - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - - # root namespace - ip link add dev $DEV type $TYPE external - ip link set dev $DEV up - ip addr add dev $DEV 10.1.1.200/24 -} - -add_ip6tnl_tunnel() -{ - ip netns exec at_ns0 ip addr add ::11/96 dev veth0 - ip netns exec at_ns0 ip link set dev veth0 up - ip addr add dev veth1 ::22/96 - ip link set dev veth1 up - - # at_ns0 namespace - ip netns exec at_ns0 \ - ip link add dev $DEV_NS type $TYPE \ - local ::11 remote ::22 - ip netns exec at_ns0 ip addr add dev $DEV_NS 10.1.1.100/24 - ip netns exec at_ns0 ip addr add dev $DEV_NS 1::11/96 - ip netns exec at_ns0 ip link set dev $DEV_NS up - - # root namespace - ip link add dev $DEV type $TYPE external - ip addr add dev $DEV 10.1.1.200/24 - ip addr add dev $DEV 1::22/96 - ip link set dev $DEV up -} - -test_gre() -{ - TYPE=gretap - DEV_NS=gretap00 - DEV=gretap11 - ret=0 - - check $TYPE - config_device - add_gre_tunnel 2 - attach_bpf $DEV gre_set_tunnel gre_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_gre_no_tunnel_key() -{ - TYPE=gre - DEV_NS=gre00 - DEV=gre11 - ret=0 - - check $TYPE - config_device - add_gre_tunnel - attach_bpf $DEV gre_set_tunnel_no_key gre_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6gre() -{ - TYPE=ip6gre - DEV_NS=ip6gre00 - DEV=ip6gre11 - ret=0 - - check $TYPE - config_device - # reuse the ip6gretap function - add_ip6gretap_tunnel - attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # overlay: ipv4 over ipv6 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - ping $PING_ARG 10.1.1.100 - check_err $? - # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6gretap() -{ - TYPE=ip6gretap - DEV_NS=ip6gretap00 - DEV=ip6gretap11 - ret=0 - - check $TYPE - config_device - add_ip6gretap_tunnel - attach_bpf $DEV ip6gretap_set_tunnel ip6gretap_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # overlay: ipv4 over ipv6 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - ping $PING_ARG 10.1.1.100 - check_err $? - # overlay: ipv6 over ipv6 - ip netns exec at_ns0 ping6 $PING_ARG fc80::200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_erspan() -{ - TYPE=erspan - DEV_NS=erspan00 - DEV=erspan11 - ret=0 - - check $TYPE - config_device - add_erspan_tunnel $1 - attach_bpf $DEV erspan_set_tunnel erspan_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6erspan() -{ - TYPE=ip6erspan - DEV_NS=ip6erspan00 - DEV=ip6erspan11 - ret=0 - - check $TYPE - config_device - add_ip6erspan_tunnel $1 - attach_bpf $DEV ip4ip6erspan_set_tunnel ip4ip6erspan_get_tunnel - ping6 $PING_ARG ::11 - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_geneve() -{ - TYPE=geneve - DEV_NS=geneve00 - DEV=geneve11 - ret=0 - - check $TYPE - config_device - add_geneve_tunnel - attach_bpf $DEV geneve_set_tunnel geneve_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6geneve() -{ - TYPE=geneve - DEV_NS=ip6geneve00 - DEV=ip6geneve11 - ret=0 - - check $TYPE - config_device - add_ip6geneve_tunnel - attach_bpf $DEV ip6geneve_set_tunnel ip6geneve_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: ip6$TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: ip6$TYPE"${NC} -} - -test_ipip() -{ - TYPE=ipip - DEV_NS=ipip00 - DEV=ipip11 - ret=0 - - check $TYPE - config_device - add_ipip_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ipip_set_tunnel ipip_get_tunnel - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ipip6() -{ - TYPE=ip6tnl - DEV_NS=ipip6tnl00 - DEV=ipip6tnl11 - ret=0 - - check $TYPE - config_device - add_ip6tnl_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ipip6_set_tunnel ipip6_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # ip4 over ip6 - ping $PING_ARG 10.1.1.100 - check_err $? - ip netns exec at_ns0 ping $PING_ARG 10.1.1.200 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: $TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: $TYPE"${NC} -} - -test_ip6ip6() -{ - TYPE=ip6tnl - DEV_NS=ip6ip6tnl00 - DEV=ip6ip6tnl11 - ret=0 - - check $TYPE - config_device - add_ip6tnl_tunnel - ip link set dev veth1 mtu 1500 - attach_bpf $DEV ip6ip6_set_tunnel ip6ip6_get_tunnel - # underlay - ping6 $PING_ARG ::11 - # ip6 over ip6 - ping6 $PING_ARG 1::11 - check_err $? - ip netns exec at_ns0 ping6 $PING_ARG 1::22 - check_err $? - cleanup - - if [ $ret -ne 0 ]; then - echo -e ${RED}"FAIL: ip6$TYPE"${NC} - return 1 - fi - echo -e ${GREEN}"PASS: ip6$TYPE"${NC} -} - -attach_bpf() -{ - DEV=$1 - SET=$2 - GET=$3 - mkdir -p ${BPF_PIN_TUNNEL_DIR} - bpftool prog loadall ${BPF_FILE} ${BPF_PIN_TUNNEL_DIR}/ - tc qdisc add dev $DEV clsact - tc filter add dev $DEV egress bpf da object-pinned ${BPF_PIN_TUNNEL_DIR}/$SET - tc filter add dev $DEV ingress bpf da object-pinned ${BPF_PIN_TUNNEL_DIR}/$GET -} - -cleanup() -{ - rm -rf ${BPF_PIN_TUNNEL_DIR} - - ip netns delete at_ns0 2> /dev/null - ip link del veth1 2> /dev/null - ip link del ipip11 2> /dev/null - ip link del ipip6tnl11 2> /dev/null - ip link del ip6ip6tnl11 2> /dev/null - ip link del gretap11 2> /dev/null - ip link del gre11 2> /dev/null - ip link del ip6gre11 2> /dev/null - ip link del ip6gretap11 2> /dev/null - ip link del geneve11 2> /dev/null - ip link del ip6geneve11 2> /dev/null - ip link del erspan11 2> /dev/null - ip link del ip6erspan11 2> /dev/null -} - -cleanup_exit() -{ - echo "CATCH SIGKILL or SIGINT, cleanup and exit" - cleanup - exit 0 -} - -check() -{ - ip link help 2>&1 | grep -q "\s$1\s" - if [ $? -ne 0 ];then - echo "SKIP $1: iproute2 not support" - cleanup - return 1 - fi -} - -enable_debug() -{ - echo 'file ip_gre.c +p' > /sys/kernel/debug/dynamic_debug/control - echo 'file ip6_gre.c +p' > /sys/kernel/debug/dynamic_debug/control - echo 'file geneve.c +p' > /sys/kernel/debug/dynamic_debug/control - echo 'file ipip.c +p' > /sys/kernel/debug/dynamic_debug/control -} - -check_err() -{ - if [ $ret -eq 0 ]; then - ret=$1 - fi -} - -bpf_tunnel_test() -{ - local errors=0 - - echo "Testing GRE tunnel..." - test_gre - errors=$(( $errors + $? )) - - echo "Testing GRE tunnel (without tunnel keys)..." - test_gre_no_tunnel_key - errors=$(( $errors + $? )) - - echo "Testing IP6GRE tunnel..." - test_ip6gre - errors=$(( $errors + $? )) - - echo "Testing IP6GRETAP tunnel..." - test_ip6gretap - errors=$(( $errors + $? )) - - echo "Testing ERSPAN tunnel..." - test_erspan v2 - errors=$(( $errors + $? )) - - echo "Testing IP6ERSPAN tunnel..." - test_ip6erspan v2 - errors=$(( $errors + $? )) - - echo "Testing GENEVE tunnel..." - test_geneve - errors=$(( $errors + $? )) - - echo "Testing IP6GENEVE tunnel..." - test_ip6geneve - errors=$(( $errors + $? )) - - echo "Testing IPIP tunnel..." - test_ipip - errors=$(( $errors + $? )) - - echo "Testing IPIP6 tunnel..." - test_ipip6 - errors=$(( $errors + $? )) - - echo "Testing IP6IP6 tunnel..." - test_ip6ip6 - errors=$(( $errors + $? )) - - return $errors -} - -trap cleanup 0 3 6 -trap cleanup_exit 2 9 - -cleanup -bpf_tunnel_test - -if [ $? -ne 0 ]; then - echo -e "$(basename $0): ${RED}FAIL${NC}" - exit 1 -fi -echo -e "$(basename $0): ${GREEN}PASS${NC}" -exit 0 diff --git a/tools/testing/selftests/bpf/veristat.c b/tools/testing/selftests/bpf/veristat.c index 175a03e6c5ef4..a18972ffdeb6f 100644 --- a/tools/testing/selftests/bpf/veristat.c +++ b/tools/testing/selftests/bpf/veristat.c @@ -268,10 +268,11 @@ static int append_filter(struct filter **filters, int *cnt, const char *str); static int append_filter_file(const char *path); static int append_var_preset(struct var_preset **presets, int *cnt, const char *expr); static int append_var_preset_file(const char *filename); +static int append_file(const char *path); +static int append_file_from_file(const char *path); static error_t parse_arg(int key, char *arg, struct argp_state *state) { - void *tmp; int err; switch (key) { @@ -381,14 +382,14 @@ static error_t parse_arg(int key, char *arg, struct argp_state *state) break; } case ARGP_KEY_ARG: - tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); - if (!tmp) - return -ENOMEM; - env.filenames = tmp; - env.filenames[env.filename_cnt] = strdup(arg); - if (!env.filenames[env.filename_cnt]) - return -ENOMEM; - env.filename_cnt++; + if (arg[0] == '@') + err = append_file_from_file(arg + 1); + else + err = append_file(arg); + if (err) { + fprintf(stderr, "Failed to collect BPF object files: %d\n", err); + return err; + } break; default: return ARGP_ERR_UNKNOWN; @@ -659,7 +660,7 @@ static int append_filter_file(const char *path) f = fopen(path, "r"); if (!f) { err = -errno; - fprintf(stderr, "Failed to open filters in '%s': %s\n", path, strerror(err)); + fprintf(stderr, "Failed to open filters in '%s': %s\n", path, strerror(-err)); return err; } @@ -689,6 +690,49 @@ static const struct stat_specs default_output_spec = { }, }; +static int append_file(const char *path) +{ + void *tmp; + + tmp = realloc(env.filenames, (env.filename_cnt + 1) * sizeof(*env.filenames)); + if (!tmp) + return -ENOMEM; + env.filenames = tmp; + env.filenames[env.filename_cnt] = strdup(path); + if (!env.filenames[env.filename_cnt]) + return -ENOMEM; + env.filename_cnt++; + return 0; +} + +static int append_file_from_file(const char *path) +{ + char buf[1024]; + int err = 0; + FILE *f; + + f = fopen(path, "r"); + if (!f) { + err = -errno; + fprintf(stderr, "Failed to open object files list in '%s': %s\n", + path, strerror(errno)); + return err; + } + + while (fscanf(f, " %1023[^\n]\n", buf) == 1) { + /* lines starting with # are comments, skip them */ + if (buf[0] == '\0' || buf[0] == '#') + continue; + err = append_file(buf); + if (err) + goto cleanup; + } + +cleanup: + fclose(f); + return err; +} + static const struct stat_specs default_csv_output_spec = { .spec_cnt = 14, .ids = { @@ -1190,13 +1234,13 @@ static void fixup_obj(struct bpf_object *obj, struct bpf_program *prog, const ch bpf_program__set_expected_attach_type(prog, attach_type); if (!env.quiet) { - printf("Using guessed program type '%s' for %s/%s...\n", + fprintf(stderr, "Using guessed program type '%s' for %s/%s...\n", libbpf_bpf_prog_type_str(prog_type), filename, prog_name); } } else { if (!env.quiet) { - printf("Failed to guess program type for freplace program with context type name '%s' for %s/%s. Consider using canonical type names to help veristat...\n", + fprintf(stderr, "Failed to guess program type for freplace program with context type name '%s' for %s/%s. Consider using canonical type names to help veristat...\n", ctx_name, filename, prog_name); } } @@ -1378,7 +1422,7 @@ static int append_var_preset_file(const char *filename) f = fopen(filename, "rt"); if (!f) { err = -errno; - fprintf(stderr, "Failed to open presets in '%s': %s\n", filename, strerror(err)); + fprintf(stderr, "Failed to open presets in '%s': %s\n", filename, strerror(-err)); return -EINVAL; }