From 8dc33facd2e97c282856b7fcf8275b49327d07c1 Mon Sep 17 00:00:00 2001 From: Ron Federman Date: Sun, 13 Oct 2024 17:15:05 +0300 Subject: [PATCH] pass a pid ns to the ebpf program to pass pids back only if they are part of that ns --- Dockerfile | 3 +- cmd/main.go | 5 -- daemonset.yaml | 19 +------ detector.go | 18 +++---- internal/probe/bpf_arm64_bpfel.go | 7 ++- internal/probe/bpf_x86_bpfel.go | 7 ++- internal/probe/ebpf/detector.bpf.c | 83 ++++++++++++++++++++++++++---- internal/probe/probe.go | 52 +++++++++++++------ internal/probe/probe_test.go | 2 +- internal/proc/proc.go | 31 ++++++++--- internal/proc/proc_test.go | 10 ++++ 11 files changed, 167 insertions(+), 70 deletions(-) diff --git a/Dockerfile b/Dockerfile index bb16df9..93a189b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,8 +7,9 @@ RUN --mount=type=cache,target=/go/pkg \ go mod download && go mod verify COPY . . +ARG TARGETARCH RUN --mount=type=cache,target=/root/.cache/go-build \ --mount=type=cache,target=/go/pkg \ - make build + GOOS=linux GOARCH=$TARGETARCH make build CMD [ "/app/runtime-detector" ] diff --git a/cmd/main.go b/cmd/main.go index e71cdb5..6b76e2c 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -10,8 +10,6 @@ import ( detector "github.com/odigos-io/runtime-detector" ) -const envProcFS = "PROC_FS_PATH" - func newLogger() *slog.Logger { return slog.New(slog.NewJSONHandler(os.Stderr, &slog.HandlerOptions{ AddSource: true, @@ -26,9 +24,6 @@ func main() { detector.WithLogger(l), detector.WithEnvironments("NODE_OPTIONS", "PYTHONPATH", "NODE_VERSION", "PYTHON_VERSION", "JAVA_VERSION", "ODIGOS_POD_NAME", "ODIGOS_CONTAINER_NAME", "ODIGOS_WORKLOAD_NAMESPACE"), } - if p := os.Getenv(envProcFS); p != "" { - opts = append(opts, detector.WithProcFSPath(p)) - } details := make(chan *detector.Details) done := make(chan struct{}) diff --git a/daemonset.yaml b/daemonset.yaml index cbe6477..a023f53 100644 --- a/daemonset.yaml +++ b/daemonset.yaml @@ -16,8 +16,8 @@ spec: app: runtime-detect spec: containers: - - image: dev/runtime-detector:test - imagePullPolicy: IfNotPresent + - image: public.ecr.aws/y2v0v6s7/dev/runtime-detector:test + imagePullPolicy: Always name: runtime-detector resources: {} securityContext: @@ -27,17 +27,6 @@ spec: volumeMounts: - mountPath: /sys/kernel/debug name: kernel-debug - - mountPath: /procHost - name: host-proc - env: - # when deploying in a kind cluster, the /proc filesystem is not the same as the host's - # this environment variable is used to tell the runtime-detector container where to find the host's /proc filesystem - - name: PROC_FS_PATH - value: "/procHost" - # used as a dummy HTTP server to test the runtime-detector configuration - ports: - - containerPort: 8080 - protocol: TCP dnsPolicy: ClusterFirst hostPID: true restartPolicy: Always @@ -49,7 +38,3 @@ spec: hostPath: path: /sys/kernel/debug type: "" - - name: host-proc - hostPath: - path: /procHost - type: Directory diff --git a/detector.go b/detector.go index 8005a8c..d18c392 100644 --- a/detector.go +++ b/detector.go @@ -72,6 +72,10 @@ func NewDetector(ctx context.Context, output chan<- *Details, opts ...DetectorOp pids := make(chan int) + // the following steps are used to create the filters chain + // 1. ebpf probe generating events and doing basic filtering + // 2. duration filter to filter out short-lived processes + // 3. k8s filter to check if the process is running in a k8s pod k8s := k8sfilter.NewK8sFilter(c.logger, pids) durationFilter := duration.NewDurationFilter(c.logger, c.minDuration, k8s) p := probe.New(c.logger, durationFilter) @@ -130,6 +134,7 @@ func (d *Detector) Run(ctx context.Context) error { return err } + // read pid events from the filters chain and pass them to the client wg := sync.WaitGroup{} wg.Add(1) go func() { @@ -240,16 +245,9 @@ func WithMinDuration(d time.Duration) DetectorOption { }) } -// WithProcFSPath returns a [DetectorOption] that configures a [Detector] to use the specified path to the 'proc' filesystem, -// the default is /proc. In some cases, the 'proc' filesystem is mounted in a different location. -// For example when using Kind, the 'proc' filesystem is that of the container running kind. -func WithProcFSPath(p string) DetectorOption { - return fnOpt(func(_ context.Context, c detectorConfig) (detectorConfig, error) { - err := proc.SetProcFS(p) - return c, err - }) -} - +// WithEnvironments returns a [DetectorOption] that configures a [Detector] to include the specified environment +// variables in the output (in case they are set for the process). If no environment keys are provided, no environment +// variables will be included in the output. func WithEnvironments(envs ...string) DetectorOption { return fnOpt(func(_ context.Context, c detectorConfig) (detectorConfig, error) { envsMap := make(map[string]struct{}) diff --git a/internal/probe/bpf_arm64_bpfel.go b/internal/probe/bpf_arm64_bpfel.go index c4d5116..1eb3621 100644 --- a/internal/probe/bpf_arm64_bpfel.go +++ b/internal/probe/bpf_arm64_bpfel.go @@ -61,7 +61,8 @@ type bpfProgramSpecs struct { // // It can be passed ebpf.CollectionSpec.Assign. type bpfMapSpecs struct { - Events *ebpf.MapSpec `ebpf:"events"` + Events *ebpf.MapSpec `ebpf:"events"` + TrackedPidsToNsPids *ebpf.MapSpec `ebpf:"tracked_pids_to_ns_pids"` } // bpfObjects contains all objects after they have been loaded into the kernel. @@ -83,12 +84,14 @@ func (o *bpfObjects) Close() error { // // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. type bpfMaps struct { - Events *ebpf.Map `ebpf:"events"` + Events *ebpf.Map `ebpf:"events"` + TrackedPidsToNsPids *ebpf.Map `ebpf:"tracked_pids_to_ns_pids"` } func (m *bpfMaps) Close() error { return _BpfClose( m.Events, + m.TrackedPidsToNsPids, ) } diff --git a/internal/probe/bpf_x86_bpfel.go b/internal/probe/bpf_x86_bpfel.go index b33bbdb..b1374cc 100644 --- a/internal/probe/bpf_x86_bpfel.go +++ b/internal/probe/bpf_x86_bpfel.go @@ -61,7 +61,8 @@ type bpfProgramSpecs struct { // // It can be passed ebpf.CollectionSpec.Assign. type bpfMapSpecs struct { - Events *ebpf.MapSpec `ebpf:"events"` + Events *ebpf.MapSpec `ebpf:"events"` + TrackedPidsToNsPids *ebpf.MapSpec `ebpf:"tracked_pids_to_ns_pids"` } // bpfObjects contains all objects after they have been loaded into the kernel. @@ -83,12 +84,14 @@ func (o *bpfObjects) Close() error { // // It can be passed to loadBpfObjects or ebpf.CollectionSpec.LoadAndAssign. type bpfMaps struct { - Events *ebpf.Map `ebpf:"events"` + Events *ebpf.Map `ebpf:"events"` + TrackedPidsToNsPids *ebpf.Map `ebpf:"tracked_pids_to_ns_pids"` } func (m *bpfMaps) Close() error { return _BpfClose( m.Events, + m.TrackedPidsToNsPids, ) } diff --git a/internal/probe/ebpf/detector.bpf.c b/internal/probe/ebpf/detector.bpf.c index 18ec8a9..1a7ed3c 100644 --- a/internal/probe/ebpf/detector.bpf.c +++ b/internal/probe/ebpf/detector.bpf.c @@ -2,10 +2,29 @@ #include "bpf_helpers.h" #include "bpf_core_read.h" +char __license[] SEC("license") = "Dual MIT/GPL"; + +const char odigos_env_prefix[] = "ODIGOS_POD"; +#define ODIGOS_PREFIX_LEN (10) + +#define MAX_ENV_VARS (128) +#define MAX_NS_FOR_PID (8) + +// This max is only for processes we track. +// Those which are filtered out are not counted in this limit. +#define MAX_CONCURRENT_PIDS (16384) // 2^14 + struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); } events SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, u32); // the pid as return from bpf_get_current_pid_tgid() + __type(value, u32); // the pid as return from get_pid_for_configured_ns() + __uint(max_entries, MAX_CONCURRENT_PIDS); +} tracked_pids_to_ns_pids SEC(".maps"); + typedef enum { UNDEFINED = 0, PROCESS_EXEC = 1, @@ -17,12 +36,9 @@ typedef struct process_event { u32 pid; } process_event_t; -char __license[] SEC("license") = "Dual MIT/GPL"; - -const char odigos_env_prefix[] = "ODIGOS_POD"; -#define ODIGOS_PREFIX_LEN (10) - -#define MAX_ENV_VARS (128) +// This is the inode number of the PID namespace we are interested in. +// It is set by the userspace code. +volatile const u32 pid_ns_inode = 0; static __always_inline bool is_odigos_env_prefix(char *env) { // don't compare the null terminator @@ -34,8 +50,38 @@ static __always_inline bool is_odigos_env_prefix(char *env) { return true; } +static __always_inline u32 get_pid_for_configured_ns(struct task_struct *task) { + struct upid upid = {0}; + u32 inum = 0; + u32 selected_pid = 0; + unsigned int num_pids = BPF_CORE_READ(task, thread_pid, level); + + if (num_pids > MAX_NS_FOR_PID) { + bpf_printk("Number of PIDs is greater than supported: %d", num_pids); + num_pids = MAX_NS_FOR_PID; + } + + for (int i = 0; i < num_pids && i < MAX_NS_FOR_PID; i++) { + upid = BPF_CORE_READ(task, thread_pid, numbers[i]); + inum = BPF_CORE_READ(upid.ns, ns.inum); + if (inum == pid_ns_inode) { + selected_pid = upid.nr; + break; + } + } + + return selected_pid; +} + SEC("tracepoint/syscalls/sys_enter_execve") int tracepoint__syscalls__sys_enter_execve(struct syscall_trace_enter* ctx) { + /* + The format of the tracepoint args is: + args: + field:const char * filename; offset:16; size:8; signed:0; + field:const char *const * argv; offset:24; size:8; signed:0; + field:const char *const * envp; offset:32; size:8; signed:0; + */ const char **args = (const char **)(ctx->args[2]); const char *argp; // save space for a terminating null byte @@ -65,12 +111,24 @@ int tracepoint__syscalls__sys_enter_execve(struct syscall_trace_enter* ctx) { return 0; } + struct task_struct *task = (struct task_struct *)bpf_get_current_task(); + u32 selected_pid = get_pid_for_configured_ns(task); + if (selected_pid == 0) { + bpf_printk("Could not find PID for task: 0x%llx", bpf_get_current_pid_tgid()); + return 0; + } + u64 pid_tgid = bpf_get_current_pid_tgid(); - u32 tgid = pid_tgid >> 32; + u32 pid = (u32)(pid_tgid & 0xFFFFFFFF); + ret = bpf_map_update_elem(&tracked_pids_to_ns_pids, &pid, &selected_pid, BPF_ANY); + if (ret != 0) { + bpf_printk("Failed to update PID to NS PID map: %d", ret); + return 0; + } process_event_t event = { .type = PROCESS_EXEC, - .pid = tgid, + .pid = selected_pid, }; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); @@ -90,11 +148,18 @@ int tracepoint__sched__sched_process_exit(struct trace_event_raw_sched_process_e return 0; } + // look this pid in the map, avoid sending exit event for PIDs we didn't send exec event for. + u32 *selected_pid = bpf_map_lookup_elem(&tracked_pids_to_ns_pids, &pid); + if (selected_pid == NULL) { + return 0; + } + process_event_t event = { .type = PROCESS_EXIT, - .pid = tgid, + .pid = *selected_pid, }; bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(event)); + bpf_map_delete_elem(&tracked_pids_to_ns_pids, &pid); return 0; } \ No newline at end of file diff --git a/internal/probe/probe.go b/internal/probe/probe.go index ebff531..9bd632c 100644 --- a/internal/probe/probe.go +++ b/internal/probe/probe.go @@ -12,6 +12,7 @@ import ( "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/perf" "github.com/cilium/ebpf/rlimit" + "github.com/odigos-io/runtime-detector/internal/proc" filter "github.com/odigos-io/runtime-detector/internal/process_filter" ) @@ -19,7 +20,7 @@ import ( type Probe struct { logger *slog.Logger - bpfObjects *bpfObjects + c *ebpf.Collection links []link.Link reader *perf.Reader @@ -53,6 +54,10 @@ type processEvent struct { const ( PerfBufferDefaultSizeInPages = 128 + + eventsMapName = "events" + processExecProgramName = "tracepoint__syscalls__sys_enter_execve" + processExitProgramName = "tracepoint__sched__sched_process_exit" ) func New(logger *slog.Logger, f filter.ProcessesFilter) *Probe { @@ -63,7 +68,13 @@ func New(logger *slog.Logger, f filter.ProcessesFilter) *Probe { } func (p *Probe) LoadAndAttach() error { - if err := p.load(); err != nil { + // find the PID namespace inode + pidNS, err := proc.GetCurrentPIDNameSpaceIndoe() + if err != nil { + return fmt.Errorf("can't get current PID namespace inode: %w", err) + } + + if err := p.load(pidNS); err != nil { return fmt.Errorf("can't load probe: %w", err) } @@ -74,45 +85,54 @@ func (p *Probe) LoadAndAttach() error { return nil } -func (p *Probe) load() error { +func (p *Probe) load(ns uint32) error { // Allow the current process to lock memory for eBPF resources. if err := rlimit.RemoveMemlock(); err != nil { return err } - objs := &bpfObjects{} - // TODO: collect verifier logs when configured to. - if err := loadBpfObjects(objs, &ebpf.CollectionOptions{}); err != nil { + spec, err := loadBpf() + if err != nil { + return err + } + + err = spec.RewriteConstants(map[string]interface{}{ + "pid_ns_inode": ns, + }) + if err != nil { + return fmt.Errorf("can't rewrite constants: %w", err) + } + + c, err := ebpf.NewCollectionWithOptions(spec, ebpf.CollectionOptions{}) + if err != nil { var ve *ebpf.VerifierError if errors.As(err, &ve) { fmt.Printf("Verifier log: %-100v\n", ve) } - return err } - p.bpfObjects = objs - + p.c = c return nil } func (p *Probe) attach() error { - if p.bpfObjects == nil { - return fmt.Errorf("can't attach probe: bpf objects are not loaded") + if p.c == nil { + return errors.New("no eBPF collection loaded") } - reader, err := perf.NewReader(p.bpfObjects.Events, PerfBufferDefaultSizeInPages*os.Getpagesize()) + reader, err := perf.NewReader(p.c.Maps[eventsMapName], PerfBufferDefaultSizeInPages*os.Getpagesize()) if err != nil { return fmt.Errorf("can't create perf reader: %w", err) } p.reader = reader - l, err := link.Tracepoint("syscalls", "sys_enter_execve", p.bpfObjects.TracepointSyscallsSysEnterExecve, nil) + l, err := link.Tracepoint("syscalls", "sys_enter_execve", p.c.Programs[processExecProgramName], nil) if err != nil { return fmt.Errorf("can't attach probe sys_enter_execve: %w", err) } p.links = append(p.links, l) - l, err = link.Tracepoint("sched", "sched_process_exit", p.bpfObjects.TracepointSchedSchedProcessExit, nil) + l, err = link.Tracepoint("sched", "sched_process_exit", p.c.Programs[processExitProgramName], nil) if err != nil { return fmt.Errorf("can't attach probe sched_process_exit: %w", err) } @@ -130,8 +150,8 @@ func (p *Probe) close() error { } } - if p.bpfObjects != nil { - err = errors.Join(err, p.bpfObjects.Close()) + if p.c != nil { + p.c.Close() } if p.reader != nil { diff --git a/internal/probe/probe_test.go b/internal/probe/probe_test.go index abc0858..ef261eb 100644 --- a/internal/probe/probe_test.go +++ b/internal/probe/probe_test.go @@ -12,7 +12,7 @@ func TestLoad(t *testing.T) { logger: slog.Default(), } - err := p.load() + err := p.load(uint32(4026532561)) defer p.close() assert.NoError(t, err) } \ No newline at end of file diff --git a/internal/proc/proc.go b/internal/proc/proc.go index d9ecb6f..fa404b8 100644 --- a/internal/proc/proc.go +++ b/internal/proc/proc.go @@ -20,17 +20,34 @@ const ( odigosEnvVarKeyPrefix = "ODIGOS_POD" ) -func SetProcFS(path string) error { - _, err := os.Stat(path) +func procFile(pid int, filename string) string { + return fmt.Sprintf("%s/%d/%s", procFS, pid, filename) +} + +func GetCurrentPIDNameSpaceIndoe() (uint32, error) { + // look at the pid namespace of the root process + path := procFile(1, "ns/pid") + content, err := os.Readlink(path) if err != nil { - return fmt.Errorf("failed to set proc filesystem ti %s: %w", path, err) + return 0, fmt.Errorf("failed to read link %s: %w", path, err) } - procFS = path - return nil + + return extractNSInode(content) } -func procFile(pid int, filename string) string { - return fmt.Sprintf("%s/%d/%s", procFS, pid, filename) +func extractNSInode(content string) (uint32, error) { + parts := strings.Split(content, "[") + if len(parts) != 2 { + return 0, fmt.Errorf("unexpected content %s", content) + } + + inodeStr := strings.TrimRight(parts[1], "]") + inode, err := strconv.ParseUint(inodeStr, 10, 32) + if err != nil { + return 0, fmt.Errorf("failed to parse inode %s: %w", inodeStr, err) + } + + return uint32(inode), nil } // GetCmdline returns the command line of the process with the given PID. diff --git a/internal/proc/proc_test.go b/internal/proc/proc_test.go index 88260db..c51f11e 100644 --- a/internal/proc/proc_test.go +++ b/internal/proc/proc_test.go @@ -191,4 +191,14 @@ func TestParseEnvironments(t *testing.T) { compareEnvs(t, tt.expected, result) }) } +} + +func TestExtractNSInode(t *testing.T) { + inode, err := extractNSInode("pid:[4026531835]") + assert.NoError(t, err) + assert.Equal(t, uint32(4026531835), inode) + + inode, err = extractNSInode("pid:[12]") + assert.NoError(t, err) + assert.Equal(t, uint32(12), inode) } \ No newline at end of file