From 75f45366e9d8991f21888cd49e1ede08144fd116 Mon Sep 17 00:00:00 2001 From: Ron Federman Date: Mon, 20 May 2024 17:35:36 +0300 Subject: [PATCH 1/2] Add status conditions for ebpf SDKs describing whether they are ready or not --- cli/cmd/resources/odiglet.go | 13 +++ odiglet/cmd/main.go | 18 +-- odiglet/go.mod | 10 +- odiglet/go.sum | 12 +- odiglet/pkg/ebpf/director.go | 108 +++++++++++++++++- odiglet/pkg/ebpf/go.go | 3 +- odiglet/pkg/kube/instrumentation_ebpf/pods.go | 5 +- .../pkg/kube/instrumentation_ebpf/shared.go | 18 +-- odiglet/pkg/kube/runtime_details/shared.go | 15 +++ 9 files changed, 160 insertions(+), 42 deletions(-) diff --git a/cli/cmd/resources/odiglet.go b/cli/cmd/resources/odiglet.go index 01fe9c954..0da2fb2a5 100644 --- a/cli/cmd/resources/odiglet.go +++ b/cli/cmd/resources/odiglet.go @@ -169,6 +169,19 @@ func NewOdigletClusterRole(psp bool) *rbacv1.ClusterRole { "instrumentedapplications", }, }, + { + Verbs: []string{ + "get", + "patch", + "update", + }, + APIGroups: []string{ + "odigos.io", + }, + Resources: []string{ + "instrumentedapplications/status", + }, + }, { Verbs: []string{ "get", diff --git a/odiglet/cmd/main.go b/odiglet/cmd/main.go index f204d4ad0..a927fdacb 100644 --- a/odiglet/cmd/main.go +++ b/odiglet/cmd/main.go @@ -15,6 +15,7 @@ import ( "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/manager/signals" + "sigs.k8s.io/controller-runtime/pkg/client" ) func main() { @@ -41,11 +42,7 @@ func main() { os.Exit(-1) } - ebpfDirectors, err := initEbpf() - if err != nil { - log.Logger.Error(err, "Failed to init eBPF director") - os.Exit(-1) - } + ctx := signals.SetupSignalHandler() go startDeviceManager(clientset) @@ -55,13 +52,18 @@ func main() { os.Exit(-1) } + ebpfDirectors, err := initEbpf(ctx, mgr.GetClient()) + if err != nil { + log.Logger.Error(err, "Failed to init eBPF director") + os.Exit(-1) + } + err = kube.SetupWithManager(mgr, ebpfDirectors) if err != nil { log.Logger.Error(err, "Failed to setup controller-runtime manager") os.Exit(-1) } - ctx := signals.SetupSignalHandler() err = kube.StartManager(ctx, mgr) if err != nil { log.Logger.Error(err, "Failed to start controller-runtime manager") @@ -110,9 +112,9 @@ func startDeviceManager(clientset *kubernetes.Clientset) { manager.Run() } -func initEbpf() (ebpf.DirectorsMap, error) { +func initEbpf(ctx context.Context, client client.Client) (ebpf.DirectorsMap, error) { goInstrumentationFactory := ebpf.NewGoInstrumentationFactory() - goDirector := ebpf.NewEbpfDirector(common.GoProgrammingLanguage, goInstrumentationFactory) + goDirector := ebpf.NewEbpfDirector(ctx, client, common.GoProgrammingLanguage, goInstrumentationFactory) goDirectorKey := ebpf.DirectorKey{ Language: common.GoProgrammingLanguage, OtelSdk: common.OtelSdk{SdkType: common.EbpfOtelSdkType, SdkTier: common.CommunityOtelSdkTier}, diff --git a/odiglet/go.mod b/odiglet/go.mod index 91da823f5..415636bc9 100644 --- a/odiglet/go.mod +++ b/odiglet/go.mod @@ -13,13 +13,14 @@ require ( github.com/odigos-io/odigos/procdiscovery v0.0.0 github.com/odigos-io/opentelemetry-zap-bridge v0.0.5 github.com/otiai10/copy v1.14.0 - go.opentelemetry.io/auto v0.12.0-alpha.0.20240510155300-a8d8a98172ce + // TODO revert this change before merging + go.opentelemetry.io/auto v0.12.0-alpha go.opentelemetry.io/otel v1.26.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.26.0 go.uber.org/zap v1.27.0 google.golang.org/grpc v1.63.2 k8s.io/api v0.30.0 - k8s.io/apimachinery v0.30.0 + k8s.io/apimachinery v0.30.1 k8s.io/client-go v0.30.0 k8s.io/kubelet v0.30.0 sigs.k8s.io/controller-runtime v0.18.2 @@ -55,7 +56,7 @@ require ( github.com/json-iterator/go v1.1.12 // indirect github.com/mailru/easyjson v0.7.7 // indirect github.com/mattn/go-colorable v0.1.8 // indirect - github.com/mattn/go-isatty v0.0.12 // indirect + github.com/mattn/go-isatty v0.0.19 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect @@ -111,3 +112,6 @@ replace ( github.com/odigos-io/odigos/common => ../common github.com/odigos-io/odigos/procdiscovery => ../procdiscovery ) + +// TODO: remove this replace before merging +replace go.opentelemetry.io/auto => github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5 diff --git a/odiglet/go.sum b/odiglet/go.sum index 585842a07..b361d5320 100644 --- a/odiglet/go.sum +++ b/odiglet/go.sum @@ -28,6 +28,8 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= +github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5 h1:weT5NS3WNeOGQAE2WUEMIgqNqxU0hXjqsAzk9OoayEQ= +github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5/go.mod h1:l97HvRF4QQdOvb3LobVnUlmGb5eju7yfvccYWuJj84k= github.com/agoda-com/opentelemetry-logs-go v0.4.0 h1:XLPOlLHLOND2/VrL69TWSy6blCZnypV37t1MjfILksI= github.com/agoda-com/opentelemetry-logs-go v0.4.0/go.mod h1:CeDuVaK9yCWN+8UjOW8AciYJE0rl7K/mw4ejBntGYkc= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -222,8 +224,9 @@ github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0 github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= -github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= +github.com/mattn/go-isatty v0.0.19 h1:JITubQf0MOLdlGRuRq+jtsDlekdYPia9ZFsB8h/APPA= +github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.2-0.20181231171920-c182affec369/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/moby/term v0.0.0-20200312100748-672ec06f55cd/go.mod h1:DdlQx2hp0Ss5/fLikoLlEeIYiATotOjgB//nb973jeo= @@ -318,8 +321,6 @@ github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5t go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= -go.opentelemetry.io/auto v0.12.0-alpha.0.20240510155300-a8d8a98172ce h1:JngoTZelK6ssW1iA6FwJISuEHj81wwLLb2QjGlktvJs= -go.opentelemetry.io/auto v0.12.0-alpha.0.20240510155300-a8d8a98172ce/go.mod h1:l97HvRF4QQdOvb3LobVnUlmGb5eju7yfvccYWuJj84k= go.opentelemetry.io/contrib/exporters/autoexport v0.51.0 h1:imlL5MBzKu+NWhnJM62bHws6m+6LN8HMT3V9PcSTbaY= go.opentelemetry.io/contrib/exporters/autoexport v0.51.0/go.mod h1:gn1wFA1uVEKIXrM3DC7SN9ee83oJ0yALY/HbUfqMszo= go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs= @@ -466,6 +467,7 @@ golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= @@ -608,8 +610,8 @@ k8s.io/api v0.30.0/go.mod h1:OPlaYhoHs8EQ1ql0R/TsUgaRPhpKNxIMrKQfWUp8QSE= k8s.io/apiextensions-apiserver v0.30.0 h1:jcZFKMqnICJfRxTgnC4E+Hpcq8UEhT8B2lhBcQ+6uAs= k8s.io/apiextensions-apiserver v0.30.0/go.mod h1:N9ogQFGcrbWqAY9p2mUAL5mGxsLqwgtUce127VtRX5Y= k8s.io/apimachinery v0.19.2/go.mod h1:DnPGDnARWFvYa3pMHgSxtbZb7gpzzAZ1pTfaUNDVlmA= -k8s.io/apimachinery v0.30.0 h1:qxVPsyDM5XS96NIh9Oj6LavoVFYff/Pon9cZeDIkHHA= -k8s.io/apimachinery v0.30.0/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= +k8s.io/apimachinery v0.30.1 h1:ZQStsEfo4n65yAdlGTfP/uSHMQSoYzU/oeEbkmF7P2U= +k8s.io/apimachinery v0.30.1/go.mod h1:iexa2somDaxdnj7bha06bhb43Zpa6eWH8N8dbqVjTUc= k8s.io/client-go v0.19.2/go.mod h1:S5wPhCqyDNAlzM9CnEdgTGV4OqhsW3jGO1UM1epwfJA= k8s.io/client-go v0.30.0 h1:sB1AGGlhY/o7KCyCEQ0bPWzYDL0pwOZO4vAtTSh/gJQ= k8s.io/client-go v0.30.0/go.mod h1:g7li5O5256qe6TYdAMyX/otJqMhIiGgTapdLchhmOaY= diff --git a/odiglet/pkg/ebpf/director.go b/odiglet/pkg/ebpf/director.go index b31733ac5..a5474d025 100644 --- a/odiglet/pkg/ebpf/director.go +++ b/odiglet/pkg/ebpf/director.go @@ -4,9 +4,14 @@ import ( "context" "sync" + _ "github.com/odigos-io/odigos/api/odigos/v1alpha1" "github.com/odigos-io/odigos/common" + "github.com/odigos-io/odigos/common/k8s" + runtime_details "github.com/odigos-io/odigos/odiglet/pkg/kube/runtime_details" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/odigos-io/odigos/odiglet/pkg/log" "k8s.io/apimachinery/pkg/types" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) // This interface should be implemented by all ebpf sdks @@ -18,7 +23,7 @@ type OtelEbpfSdk interface { // users can use different eBPF otel SDKs by returning them from this function type InstrumentationFactory[T OtelEbpfSdk] interface { - CreateEbpfInstrumentation(ctx context.Context, pid int, serviceName string, podWorkload *common.PodWorkload, containerName string, podName string) (T, error) + CreateEbpfInstrumentation(ctx context.Context, pid int, serviceName string, podWorkload *common.PodWorkload, containerName string, podName string, loadedIndicator chan struct{}) (T, error) } // Director manages the instrumentation for a specific SDK in a specific language @@ -34,6 +39,23 @@ type podDetails struct { Pids []int } +type InstrumentationUnHealthyReason string + +const ( + FailedToLoad InstrumentationUnHealthyReason = "FailedToLoad" + FailedToInitialize InstrumentationUnHealthyReason = "FailedToInitialize" +) + +const ebpfSDKConditionRunning = "ebpfSDKRunning" + +type instrumentationStatus struct { + Workload common.PodWorkload + PodName types.NamespacedName + Healthy bool + Message string + Reason InstrumentationUnHealthyReason +} + type EbpfDirector[T OtelEbpfSdk] struct { mux sync.Mutex @@ -56,6 +78,16 @@ type EbpfDirector[T OtelEbpfSdk] struct { // this map can be used when we only have the workload, and need to find the pods to derive pids. workloadToPods map[common.PodWorkload]map[types.NamespacedName]struct{} + + // this channel is used to send the status of the instrumentation SDK after it is created and ran. + // the status is used to update the status conditions for the instrumentedApplication CR. + // The status can be either a failure to initialize the SDK, or a failure to load the eBPF probes or a success which + // means the eBPF probes were loaded successfully. + // TODO: this channel should probably be buffered, so we don't block the instrumentation goroutine? + instrumentationStatusChan chan instrumentationStatus + + // k8s client used to update status conditions for the instrumentedApplication CR + client client.Client } type DirectorKey struct { @@ -65,14 +97,62 @@ type DirectorKey struct { type DirectorsMap map[DirectorKey]Director -func NewEbpfDirector[T OtelEbpfSdk](language common.ProgrammingLanguage, instrumentationFactory InstrumentationFactory[T]) *EbpfDirector[T] { - return &EbpfDirector[T]{ +func NewEbpfDirector[T OtelEbpfSdk](ctx context.Context, client client.Client, language common.ProgrammingLanguage, instrumentationFactory InstrumentationFactory[T]) *EbpfDirector[T] { + director := &EbpfDirector[T]{ language: language, instrumentationFactory: instrumentationFactory, pidsToInstrumentation: make(map[int]T), pidsAttemptedInstrumentation: make(map[int]struct{}), podsToDetails: make(map[types.NamespacedName]podDetails), workloadToPods: make(map[common.PodWorkload]map[types.NamespacedName]struct{}), + instrumentationStatusChan: make(chan instrumentationStatus), + client: client, + } + + go director.observeInstrumentations(ctx) + + return director +} + +func (d *EbpfDirector[T]) observeInstrumentations(ctx context.Context) { + for { + select { + case <-ctx.Done(): + return + case status, more := <-d.instrumentationStatusChan: + if !more { + return + } + + if d.client == nil { + log.Logger.V(0).Info("Client is nil, cannot update status conditions", "workload", status.Workload) + continue + } + + runtimeDetails, err := runtime_details.GetRuntimeDetails(ctx, d.client, &status.Workload) + if err != nil { + log.Logger.Error(err, "error getting runtime details", "workload", status.Workload) + continue + } + + // write the status to the CR. Since we are writing the status to the instrumentedApplication CR, + // we might overwrite the status of another pod which corresponds to the same workload. + // this can cause the status to not represent the full state of the workload, in case some of the pods are healthy and some are not for the same workload. + if !status.Healthy { + log.Logger.Error(nil, "eBPF instrumentation unhealthy", "reason", status.Reason, "message", status.Reason, "workload", status.Workload) + msg := "Failed to load eBPF probes to pod: " + status.PodName.String() + ".message: " + status.Message + err := k8s.UpdateStatusConditions(ctx, d.client, runtimeDetails, &runtimeDetails.Status.Conditions, metav1.ConditionFalse, ebpfSDKConditionRunning, string(status.Reason), msg) + if err != nil { + log.Logger.Error(err, "error updating status conditions", "workload", status.Workload) + } + } else { + log.Logger.V(0).Info("eBPF instrumentation healthy", "workload", status.Workload) + err := k8s.UpdateStatusConditions(ctx, d.client, runtimeDetails, &runtimeDetails.Status.Conditions, metav1.ConditionTrue, ebpfSDKConditionRunning, "Success", "Successfully loaded eBPF probes to pod: " + status.PodName.String()) + if err != nil { + log.Logger.Error(err, "error updating status conditions", "workload", status.Workload) + } + } + } } } @@ -103,10 +183,25 @@ func (d *EbpfDirector[T]) Instrument(ctx context.Context, pid int, pod types.Nam } d.workloadToPods[*podWorkload][pod] = struct{}{} + loadedIndicator := make(chan struct{}) + loadedCtx, loadedObserverCancel := context.WithCancel(ctx) + go func() { + select { + case <-loadedCtx.Done(): + // TODO: remove this print before merging + log.Logger.V(0).Info("eBPF instrumentation loaded observer cancelled", "workload", podWorkload, "pod", pod) + return + case <-loadedIndicator: + d.instrumentationStatusChan <- instrumentationStatus{Healthy: true, Workload: *podWorkload, PodName: pod} + } + }() + go func() { - inst, err := d.instrumentationFactory.CreateEbpfInstrumentation(ctx, pid, appName, podWorkload, containerName, pod.Name) + // once the instrumentation finished running (either by error or successful exit), we can cancel the 'loaded' observer for this instrumentation + defer loadedObserverCancel() + inst, err := d.instrumentationFactory.CreateEbpfInstrumentation(ctx, pid, appName, podWorkload, containerName, pod.Name, loadedIndicator) if err != nil { - log.Logger.Error(err, "instrumentation setup failed", "workload", podWorkload, "pod", pod) + d.instrumentationStatusChan <- instrumentationStatus{Healthy: false, Message: err.Error(), Workload: *podWorkload, Reason: FailedToInitialize, PodName: pod} return } @@ -129,7 +224,7 @@ func (d *EbpfDirector[T]) Instrument(ctx context.Context, pid int, pod types.Nam log.Logger.V(0).Info("Running ebpf instrumentation", "workload", podWorkload, "pod", pod, "language", d.language) if err := inst.Run(context.Background()); err != nil { - log.Logger.Error(err, "instrumentation crashed after running") + d.instrumentationStatusChan <- instrumentationStatus{Healthy: false, Message: err.Error(), Workload: *podWorkload, Reason: FailedToLoad, PodName: pod} } }() @@ -180,6 +275,7 @@ func (d *EbpfDirector[T]) Cleanup(pod types.NamespacedName) { func (d *EbpfDirector[T]) Shutdown() { log.Logger.V(0).Info("Shutting down instrumentation director") + close(d.instrumentationStatusChan) for details := range d.podsToDetails { d.Cleanup(details) } diff --git a/odiglet/pkg/ebpf/go.go b/odiglet/pkg/ebpf/go.go index 6a8ce19f0..c95050e00 100644 --- a/odiglet/pkg/ebpf/go.go +++ b/odiglet/pkg/ebpf/go.go @@ -24,7 +24,7 @@ func NewGoInstrumentationFactory() InstrumentationFactory[*GoOtelEbpfSdk] { return &GoInstrumentationFactory{} } -func (g *GoInstrumentationFactory) CreateEbpfInstrumentation(ctx context.Context, pid int, serviceName string, podWorkload *common.PodWorkload, containerName string, podName string) (*GoOtelEbpfSdk, error) { +func (g *GoInstrumentationFactory) CreateEbpfInstrumentation(ctx context.Context, pid int, serviceName string, podWorkload *common.PodWorkload, containerName string, podName string, loadedIndicator chan struct{}) (*GoOtelEbpfSdk, error) { defaultExporter, err := otlptracegrpc.New( ctx, otlptracegrpc.WithInsecure(), @@ -42,6 +42,7 @@ func (g *GoInstrumentationFactory) CreateEbpfInstrumentation(ctx context.Context auto.WithServiceName(serviceName), auto.WithTraceExporter(defaultExporter), auto.WithGlobal(), + auto.WithLoadedIndicator(loadedIndicator), ) if err != nil { log.Logger.Error(err, "instrumentation setup failed") diff --git a/odiglet/pkg/kube/instrumentation_ebpf/pods.go b/odiglet/pkg/kube/instrumentation_ebpf/pods.go index be6412101..96bc9c663 100644 --- a/odiglet/pkg/kube/instrumentation_ebpf/pods.go +++ b/odiglet/pkg/kube/instrumentation_ebpf/pods.go @@ -6,6 +6,7 @@ import ( "github.com/odigos-io/odigos/common" "github.com/odigos-io/odigos/odiglet/pkg/ebpf" + runtime_details "github.com/odigos-io/odigos/odiglet/pkg/kube/runtime_details" kubeutils "github.com/odigos-io/odigos/odiglet/pkg/kube/utils" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -72,7 +73,7 @@ func (p *PodsReconciler) Reconcile(ctx context.Context, request ctrl.Request) (c } func (p *PodsReconciler) instrumentWithEbpf(ctx context.Context, pod *corev1.Pod, podWorkload *common.PodWorkload) (error, bool) { - runtimeDetails, err := getRuntimeDetails(ctx, p.Client, podWorkload) + runtimeDetails, err := runtime_details.GetRuntimeDetails(ctx, p.Client, podWorkload) if err != nil { if apierrors.IsNotFound(err) { // Probably shutdown in progress, cleanup will be done as soon as the pod object is deleted @@ -81,7 +82,7 @@ func (p *PodsReconciler) instrumentWithEbpf(ctx context.Context, pod *corev1.Pod return err, false } - return instrumentPodWithEbpf(ctx, pod, p.Directors, runtimeDetails, podWorkload) + return instrumentPodWithEbpf(ctx, p.Client, pod, p.Directors, runtimeDetails, podWorkload) } func (p *PodsReconciler) getPodWorkloadObject(ctx context.Context, pod *corev1.Pod) (*common.PodWorkload, error) { diff --git a/odiglet/pkg/kube/instrumentation_ebpf/shared.go b/odiglet/pkg/kube/instrumentation_ebpf/shared.go index ba997b189..4d705109f 100644 --- a/odiglet/pkg/kube/instrumentation_ebpf/shared.go +++ b/odiglet/pkg/kube/instrumentation_ebpf/shared.go @@ -7,7 +7,6 @@ import ( odigosv1 "github.com/odigos-io/odigos/api/odigos/v1alpha1" "github.com/odigos-io/odigos/common" - "github.com/odigos-io/odigos/common/utils" "github.com/odigos-io/odigos/odiglet/pkg/ebpf" "github.com/odigos-io/odigos/odiglet/pkg/process" corev1 "k8s.io/api/core/v1" @@ -25,7 +24,7 @@ func cleanupEbpf(directors ebpf.DirectorsMap, name types.NamespacedName) { } } -func instrumentPodWithEbpf(ctx context.Context, pod *corev1.Pod, directors ebpf.DirectorsMap, runtimeDetails *odigosv1.InstrumentedApplication, podWorkload *common.PodWorkload) (error, bool) { +func instrumentPodWithEbpf(ctx context.Context, client client.Client, pod *corev1.Pod, directors ebpf.DirectorsMap, runtimeDetails *odigosv1.InstrumentedApplication, podWorkload *common.PodWorkload) (error, bool) { logger := log.FromContext(ctx) podUid := string(pod.UID) instrumentedEbpf := false @@ -96,18 +95,3 @@ func podContainerDeviceName(container v1.Container) *string { return nil } - -func getRuntimeDetails(ctx context.Context, kubeClient client.Client, podWorkload *common.PodWorkload) (*odigosv1.InstrumentedApplication, error) { - instrumentedApplicationName := utils.GetRuntimeObjectName(podWorkload.Name, podWorkload.Kind) - - var runtimeDetails odigosv1.InstrumentedApplication - err := kubeClient.Get(ctx, client.ObjectKey{ - Namespace: podWorkload.Namespace, - Name: instrumentedApplicationName, - }, &runtimeDetails) - if err != nil { - return nil, err - } - - return &runtimeDetails, nil -} diff --git a/odiglet/pkg/kube/runtime_details/shared.go b/odiglet/pkg/kube/runtime_details/shared.go index 20dcbfdca..b53f54e98 100644 --- a/odiglet/pkg/kube/runtime_details/shared.go +++ b/odiglet/pkg/kube/runtime_details/shared.go @@ -141,3 +141,18 @@ func persistRuntimeResults(ctx context.Context, results []odigosv1.RuntimeDetail } return nil } + +func GetRuntimeDetails(ctx context.Context, kubeClient client.Client, podWorkload *common.PodWorkload) (*odigosv1.InstrumentedApplication, error) { + instrumentedApplicationName := utils.GetRuntimeObjectName(podWorkload.Name, podWorkload.Kind) + + var runtimeDetails odigosv1.InstrumentedApplication + err := kubeClient.Get(ctx, client.ObjectKey{ + Namespace: podWorkload.Namespace, + Name: instrumentedApplicationName, + }, &runtimeDetails) + if err != nil { + return nil, err + } + + return &runtimeDetails, nil +} From 3edba9003fba3767b74024f98d6d676793eade1c Mon Sep 17 00:00:00 2001 From: Ron Federman Date: Fri, 24 May 2024 17:55:22 +0300 Subject: [PATCH 2/2] Minor improvements and cleanup --- odiglet/go.mod | 6 +-- odiglet/go.sum | 4 +- odiglet/pkg/ebpf/director.go | 42 +++++++++---------- odiglet/pkg/kube/instrumentation_ebpf/pods.go | 2 +- .../pkg/kube/instrumentation_ebpf/shared.go | 3 +- 5 files changed, 25 insertions(+), 32 deletions(-) diff --git a/odiglet/go.mod b/odiglet/go.mod index df1c2577a..4ef0a3f26 100644 --- a/odiglet/go.mod +++ b/odiglet/go.mod @@ -14,8 +14,7 @@ require ( github.com/odigos-io/odigos/procdiscovery v0.0.0 github.com/odigos-io/opentelemetry-zap-bridge v0.0.5 github.com/otiai10/copy v1.14.0 - // TODO revert this change before merging - go.opentelemetry.io/auto v0.12.0-alpha + go.opentelemetry.io/auto v0.12.0-alpha.0.20240523062926-f9ad92d875aa go.opentelemetry.io/otel v1.26.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.26.0 go.uber.org/zap v1.27.0 @@ -114,6 +113,3 @@ replace ( github.com/odigos-io/odigos/k8sutils => ../k8sutils github.com/odigos-io/odigos/procdiscovery => ../procdiscovery ) - -// TODO: remove this replace before merging -replace go.opentelemetry.io/auto => github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5 diff --git a/odiglet/go.sum b/odiglet/go.sum index f36e1ec1b..ac8a6cd0c 100644 --- a/odiglet/go.sum +++ b/odiglet/go.sum @@ -28,8 +28,6 @@ github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ= github.com/PuerkitoBio/purell v1.0.0/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0= github.com/PuerkitoBio/urlesc v0.0.0-20160726150825-5bd2802263f2/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE= -github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5 h1:weT5NS3WNeOGQAE2WUEMIgqNqxU0hXjqsAzk9OoayEQ= -github.com/RonFed/opentelemetry-go-instrumentation_fork v0.0.0-20240520102826-7e7c5a0992b5/go.mod h1:l97HvRF4QQdOvb3LobVnUlmGb5eju7yfvccYWuJj84k= github.com/agoda-com/opentelemetry-logs-go v0.4.0 h1:XLPOlLHLOND2/VrL69TWSy6blCZnypV37t1MjfILksI= github.com/agoda-com/opentelemetry-logs-go v0.4.0/go.mod h1:CeDuVaK9yCWN+8UjOW8AciYJE0rl7K/mw4ejBntGYkc= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= @@ -321,6 +319,8 @@ github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5t go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= +go.opentelemetry.io/auto v0.12.0-alpha.0.20240523062926-f9ad92d875aa h1:EOH/Hcqgu2p/33uTHIu7RYTh4hTMdTTzfLEnwpOSqcs= +go.opentelemetry.io/auto v0.12.0-alpha.0.20240523062926-f9ad92d875aa/go.mod h1:l97HvRF4QQdOvb3LobVnUlmGb5eju7yfvccYWuJj84k= go.opentelemetry.io/contrib/exporters/autoexport v0.51.0 h1:imlL5MBzKu+NWhnJM62bHws6m+6LN8HMT3V9PcSTbaY= go.opentelemetry.io/contrib/exporters/autoexport v0.51.0/go.mod h1:gn1wFA1uVEKIXrM3DC7SN9ee83oJ0yALY/HbUfqMszo= go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs= diff --git a/odiglet/pkg/ebpf/director.go b/odiglet/pkg/ebpf/director.go index cae80abb1..74d4eea63 100644 --- a/odiglet/pkg/ebpf/director.go +++ b/odiglet/pkg/ebpf/director.go @@ -39,11 +39,12 @@ type podDetails struct { Pids []int } -type InstrumentationUnHealthyReason string +type InstrumentationStatusReason string const ( - FailedToLoad InstrumentationUnHealthyReason = "FailedToLoad" - FailedToInitialize InstrumentationUnHealthyReason = "FailedToInitialize" + FailedToLoad InstrumentationStatusReason = "FailedToLoad" + FailedToInitialize InstrumentationStatusReason = "FailedToInitialize" + LoadedSuccessfully InstrumentationStatusReason = "LoadedSuccessfully" ) const ebpfSDKConditionRunning = "ebpfSDKRunning" @@ -51,9 +52,9 @@ const ebpfSDKConditionRunning = "ebpfSDKRunning" type instrumentationStatus struct { Workload common.PodWorkload PodName types.NamespacedName - Healthy bool - Message string - Reason InstrumentationUnHealthyReason + Healthy bool + Message string + Reason InstrumentationStatusReason } type EbpfDirector[T OtelEbpfSdk] struct { @@ -135,22 +136,21 @@ func (d *EbpfDirector[T]) observeInstrumentations(ctx context.Context) { continue } + condStatus := metav1.ConditionTrue + if !status.Healthy { + condStatus = metav1.ConditionFalse + } + + if !status.Healthy { + log.Logger.Error(nil, "eBPF instrumentation unhealthy", "reason", status.Reason, "message", status.Message, "workload", status.Workload) + } + // write the status to the CR. Since we are writing the status to the instrumentedApplication CR, // we might overwrite the status of another pod which corresponds to the same workload. // this can cause the status to not represent the full state of the workload, in case some of the pods are healthy and some are not for the same workload. - if !status.Healthy { - log.Logger.Error(nil, "eBPF instrumentation unhealthy", "reason", status.Reason, "message", status.Reason, "workload", status.Workload) - msg := "Failed to load eBPF probes to pod: " + status.PodName.String() + ".message: " + status.Message - err := conditions.UpdateStatusConditions(ctx, d.client, runtimeDetails, &runtimeDetails.Status.Conditions, metav1.ConditionFalse, ebpfSDKConditionRunning, string(status.Reason), msg) - if err != nil { - log.Logger.Error(err, "error updating status conditions", "workload", status.Workload) - } - } else { - log.Logger.V(0).Info("eBPF instrumentation healthy", "workload", status.Workload) - err := conditions.UpdateStatusConditions(ctx, d.client, runtimeDetails, &runtimeDetails.Status.Conditions, metav1.ConditionTrue, ebpfSDKConditionRunning, "Success", "Successfully loaded eBPF probes to pod: " + status.PodName.String()) - if err != nil { - log.Logger.Error(err, "error updating status conditions", "workload", status.Workload) - } + err = conditions.UpdateStatusConditions(ctx, d.client, runtimeDetails, &runtimeDetails.Status.Conditions, condStatus, ebpfSDKConditionRunning, string(status.Reason), status.Message) + if err != nil { + log.Logger.Error(err, "error updating status conditions", "workload", status.Workload) } } } @@ -188,11 +188,9 @@ func (d *EbpfDirector[T]) Instrument(ctx context.Context, pid int, pod types.Nam go func() { select { case <-loadedCtx.Done(): - // TODO: remove this print before merging - log.Logger.V(0).Info("eBPF instrumentation loaded observer cancelled", "workload", podWorkload, "pod", pod) return case <-loadedIndicator: - d.instrumentationStatusChan <- instrumentationStatus{Healthy: true, Workload: *podWorkload, PodName: pod} + d.instrumentationStatusChan <- instrumentationStatus{Healthy: true, Message: "Successfully loaded eBPF probes to pod: " + pod.String(), Workload: *podWorkload, PodName: pod, Reason: LoadedSuccessfully} } }() diff --git a/odiglet/pkg/kube/instrumentation_ebpf/pods.go b/odiglet/pkg/kube/instrumentation_ebpf/pods.go index 96bc9c663..2d7a2948d 100644 --- a/odiglet/pkg/kube/instrumentation_ebpf/pods.go +++ b/odiglet/pkg/kube/instrumentation_ebpf/pods.go @@ -82,7 +82,7 @@ func (p *PodsReconciler) instrumentWithEbpf(ctx context.Context, pod *corev1.Pod return err, false } - return instrumentPodWithEbpf(ctx, p.Client, pod, p.Directors, runtimeDetails, podWorkload) + return instrumentPodWithEbpf(ctx, pod, p.Directors, runtimeDetails, podWorkload) } func (p *PodsReconciler) getPodWorkloadObject(ctx context.Context, pod *corev1.Pod) (*common.PodWorkload, error) { diff --git a/odiglet/pkg/kube/instrumentation_ebpf/shared.go b/odiglet/pkg/kube/instrumentation_ebpf/shared.go index 4d705109f..b869a4672 100644 --- a/odiglet/pkg/kube/instrumentation_ebpf/shared.go +++ b/odiglet/pkg/kube/instrumentation_ebpf/shared.go @@ -12,7 +12,6 @@ import ( corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" - "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -24,7 +23,7 @@ func cleanupEbpf(directors ebpf.DirectorsMap, name types.NamespacedName) { } } -func instrumentPodWithEbpf(ctx context.Context, client client.Client, pod *corev1.Pod, directors ebpf.DirectorsMap, runtimeDetails *odigosv1.InstrumentedApplication, podWorkload *common.PodWorkload) (error, bool) { +func instrumentPodWithEbpf(ctx context.Context, pod *corev1.Pod, directors ebpf.DirectorsMap, runtimeDetails *odigosv1.InstrumentedApplication, podWorkload *common.PodWorkload) (error, bool) { logger := log.FromContext(ctx) podUid := string(pod.UID) instrumentedEbpf := false