diff --git a/.cirrus.yml b/.cirrus.yml index 23c0be84a508..cbdaf422eb54 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -401,6 +401,9 @@ bindings_task: server_log_artifacts: path: ./podman-server.log type: text/plain + cleanup_tracer_artifacts: + path: ./podman-cleanup-tracer.log + type: text/plain df_script: '$SCRIPT_BASE/logcollector.sh df' audit_log_script: '$SCRIPT_BASE/logcollector.sh audit' journal_script: '$SCRIPT_BASE/logcollector.sh journal' diff --git a/.gitignore b/.gitignore index d1813cc8fbd4..1b596b2b44fc 100644 --- a/.gitignore +++ b/.gitignore @@ -57,3 +57,4 @@ contrib/win-installer/shasums contrib/win-installer/*.wixobj contrib/win-installer/*.wixpdb contrib/win-installer/*.log +podman-cleanup-tracer.log* diff --git a/contrib/cirrus/runner.sh b/contrib/cirrus/runner.sh index ea6235eba47b..8d4dddd69d3d 100755 --- a/contrib/cirrus/runner.sh +++ b/contrib/cirrus/runner.sh @@ -458,6 +458,15 @@ msg "************************************************************" ((${SETUP_ENVIRONMENT:-0})) || \ die "Expecting setup_environment.sh to have completed successfully" +if [[ "$UID" -eq 0 ]] && ((CONTAINER==0)); then + # start ebpf cleanup tracer (#23487) + msg "start ebpf cleanup tracer" + # replace zero bytes to make the log more readable + bpftrace $GOSRC/hack/podman_cleanup_tracer.bt |& \ + tr '\0' ' ' >$GOSRC/podman-cleanup-tracer.log & + TRACER_PID=$! +fi + # shellcheck disable=SC2154 if [[ "$PRIV_NAME" == "rootless" ]] && [[ "$UID" -eq 0 ]]; then # Remove /var/lib/cni, it is not required for rootless cni. @@ -499,4 +508,9 @@ fi showrun $handler +if [[ -n "$TRACER_PID" ]]; then + # ignore any error here + kill "$TRACER_PID" || true +fi + showrun echo "finished" diff --git a/hack/podman_cleanup_tracer.bt b/hack/podman_cleanup_tracer.bt new file mode 100755 index 000000000000..42c46cde9a85 --- /dev/null +++ b/hack/podman_cleanup_tracer.bt @@ -0,0 +1,151 @@ +#!/usr/bin/env bpftrace + +// Traces podman and conmon commands for there exit status and all process that +// send signals to them. Also prints stderr for all podman container cleanup processes. +// The goal is to be able to trace all podman cleanup errors, while there is an +// option to log the errors to the syslog that only works of the process if the +// process got there and not if it was killed before. This program here will trace +// the exit status as well so we know even if it was killed by a signal. +// +// This is script uses https://github.com/bpftrace/bpftrace to compile this into +// kernel ebpf. +// +// Usage: sudo ./podman_cleanup_tracer.bt +// see below for output format + +BEGIN { + print("Output format is one of:"); + print("exec TIME PID PPID COMM ARGV(truncated)"); + print("cmd TIME PID PPID ARGV(full)"); + print("exit TIME PID PPID COMM EXIT_CODE EXIT_SIGNAL"); + print("kill TIME PID PPID COMM SIGNAL TPID RET"); + print("stderr TIME PID PPID COMM OUTPUT"); +} + +// Trace all exec calls to find all podman + conmon processes, it will also catch +// netavark and aardvark-dns as they have podman in the path as well but this is +// good so we can see any errors there as well. +tracepoint:syscalls:sys_enter_exec* +/ strcontains(str(args.argv[0]),"podman") || strcontains(str(args.argv[0]), "conmon") / +{ + + // create entry in pid map so we can check the pid later + @pids[pid] = 1; + + // Find the podman cleanup process spawned by conmon. + // I tried matching argv but there seems to be no way to iterate over it. + // In practise parent name conmon and argv0 podman should contain all the + // cleanup processes we care about. + if (comm == "conmon" && strcontains(str(args.argv[0]), "podman")) { + @cleanupPids[pid] = 1; + } + + printf("%-6s %s %-8d %-8d %-12s ", + "exec", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm + ); + + // Unfortunately this doesn't print the full argv0 as there is some limit, + // as such we use the /proc/<>/cmdline reading hack below on exec exit. + join(args->argv); +} + + +// Print the full cmdline +tracepoint:syscalls:sys_exit_exec* +/ @pids[pid] / +{ + printf("%-6s %s %-8d %-8d ", + "cmd", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid + ); + // This can fail to open the file it is done in user space and + // thus racy if the process exits quickly. + cat("/proc/%d/cmdline", pid); + print(""); +} + + +// Trace all exits for the pids we matched above, +// pid == tid is used to only match the main exit +// and not all thread exits which we do not care about. +tracepoint:sched:sched_process_exit +/ @pids[pid] && pid == tid / +{ + printf("%-6s %s %-8d %-8d %-12s %d %d\n", + "exit", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm, + curtask->exit_code >> 8, // actual exit code + curtask->exit_code & 0xFF // signal number if killed + ); + + // process is done remove pid from map + delete(@pids[pid]); + delete(@cleanupPids[pid]); +} + +// Trace all kill calls that target our pids. +// This will not catch signals send via pidfd_send_signal because +// I don't see a way to translate the pidfd to our target pids and +// I don't want to log all send signals on the system. +tracepoint:syscalls:sys_enter_kill +/ @pids[args.pid] / +{ + @tpid[tid] = args.pid; + @tsig[tid] = args.sig; +} + +tracepoint:syscalls:sys_exit_kill +/ @tpid[tid] / +{ + printf("%-6s %s %-8d %-8d %-12s %d %-8d %d\n", + "kill", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm, + @tsig[tid], + @tpid[tid], + args.ret + ); + delete(@tpid[tid]); + delete(@tsig[tid]); +} + + +// Print anything written on stderr for the podman container cleanup process. +tracepoint:syscalls:sys_enter_write +/ @cleanupPids[pid] && args.fd == 2 / +{ + printf("%-6s %s %-8d %-8d %-12s", + "stderr", + strftime("%H:%M:%S.%f", nsecs), + pid, + curtask->real_parent->tgid, + comm + ); + + // String size limit is is 64 by default, this includes the 0 byte and when we + // hit the string limit it also adds "..." when using 63 so we use 63 as len here. + // While upstream fixed these low string limits (https://github.com/bpftrace/bpftrace/issues/305) + // it is not yet in older distro version we use so we cannot use that yet. + // Thus manually print several times. + $len = 62; + $offset = 62; + printf("%s", str(args.buf, $len)); + + unroll(10) { + if ((int64)args.count > $offset ) { + printf("%s", str(args.buf + $offset, $len)); + } + $offset += $len + } +}