From a1eb1aa23822cfb8626f10f93479107d49bd33fa Mon Sep 17 00:00:00 2001 From: Azoam Date: Thu, 29 Sep 2022 12:48:06 -0400 Subject: [PATCH 01/10] pushing all recent changes to branch for new laptop --- dogfood/Makefile | 13 +++++--- .../client/chart/templates/deployment.yaml | 12 +++++++ dogfood/client/chart/templates/volume.yaml | 21 +++++++++++++ .../client/chart/templates/volumeclaim.yaml | 17 ++++++++++ dogfood/client/chart/values.yaml | 4 +-- dogfood/client/dogfood_client.go | 31 ++++++++++++++----- 6 files changed, 85 insertions(+), 13 deletions(-) create mode 100644 dogfood/client/chart/templates/volume.yaml create mode 100644 dogfood/client/chart/templates/volumeclaim.yaml diff --git a/dogfood/Makefile b/dogfood/Makefile index 49c4dbc16..18f566c10 100644 --- a/dogfood/Makefile +++ b/dogfood/Makefile @@ -20,15 +20,19 @@ minikube-ssh-host: dogfood-go-client: GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o bin/built_go_client ./client -minikube-build-dogfood-client: dogfood-go-client +dogfood-docker-client: docker build -t ${CLIENT_IMAGE} -f client/Dockerfile ./bin/ + +minikube-build-dogfood-client: dogfood-go-client dogfood-docker-client minikube image load --daemon=false --overwrite=true ${CLIENT_IMAGE} dogfood-go-server: GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o bin/built_go_server ./server -minikube-build-dogfood-server: dogfood-go-server +dogfood-docker-server: docker build -t ${SERVER_IMAGE} -f server/Dockerfile ./bin/ + +minikube-build-dogfood-server: dogfood-go-server dogfood-docker-server minikube image load --daemon=false --overwrite=true ${SERVER_IMAGE} # INSTALL @@ -41,7 +45,8 @@ install: uninstall: helm template ./client/chart | kubectl delete -f - helm template ./server/chart | kubectl delete -f - - kubectl delete -f ../examples/namespace.yaml + +reinstall: uninstall install restart-client: kubectl -n chaos-demo rollout restart deployment chaos-dogfood-client @@ -49,4 +54,4 @@ restart-client: restart-server: kubectl -n chaos-demo rollout restart deployment chaos-dogfood-server -restart: restart-client restart-server \ No newline at end of file +restart: restart-client restart-server diff --git a/dogfood/client/chart/templates/deployment.yaml b/dogfood/client/chart/templates/deployment.yaml index 5b56d952d..37b91c23d 100644 --- a/dogfood/client/chart/templates/deployment.yaml +++ b/dogfood/client/chart/templates/deployment.yaml @@ -24,6 +24,10 @@ spec: {{- end }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} + volumes: + - name: data + persistentVolumeClaim: + claimName: dogfood-client containers: - name: {{ .Chart.Name }} securityContext: @@ -39,3 +43,11 @@ spec: protocol: TCP resources: {{- toYaml .Values.resources | nindent 12 }} + - name: io + image: ubuntu:focal + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["/bin/bash"] + args: ["-c", "while true; do dd if=/dev/zero of=/mnt/data/iodump bs=20M count=30 oflag=direct; sleep 1; done"] + volumeMounts: + - mountPath: /mnt/data + name: data diff --git a/dogfood/client/chart/templates/volume.yaml b/dogfood/client/chart/templates/volume.yaml new file mode 100644 index 000000000..a254af044 --- /dev/null +++ b/dogfood/client/chart/templates/volume.yaml @@ -0,0 +1,21 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2021 Datadog, Inc. 
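# This hostPath PersistentVolume backs the dogfood-client PersistentVolumeClaim
# (storageClassName "manual", 3Gi requested) defined in volumeclaim.yaml. The client
# deployment mounts that claim at /mnt/data, where the io sidecar's dd loop and the
# client's /mnt/data/logging writes generate the disk activity exercised by disk
# disruptions.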
+ +apiVersion: v1 +kind: PersistentVolume +metadata: + name: dogfood-client + namespace: chaos-demo + labels: + type: local +spec: + storageClassName: manual + capacity: + storage: 10Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "/data" +--- \ No newline at end of file diff --git a/dogfood/client/chart/templates/volumeclaim.yaml b/dogfood/client/chart/templates/volumeclaim.yaml new file mode 100644 index 000000000..4ec64b4ba --- /dev/null +++ b/dogfood/client/chart/templates/volumeclaim.yaml @@ -0,0 +1,17 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2021 Datadog, Inc. + +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: dogfood-client + namespace: chaos-demo +spec: + storageClassName: manual + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 3Gi \ No newline at end of file diff --git a/dogfood/client/chart/values.yaml b/dogfood/client/chart/values.yaml index 384ab6230..d43d434f0 100644 --- a/dogfood/client/chart/values.yaml +++ b/dogfood/client/chart/values.yaml @@ -12,10 +12,10 @@ image: imagePullSecrets: [] fullname: "chaos-dogfood-client" -namespace: "chaos-demo" +namespace: "chaos-engineering" server: - hostname: "chaos-dogfood-server.chaos-demo.svc.cluster.local" + hostname: "chaos-dogfood-server.chaos-engineering.svc.cluster.local" port: 50051 client: diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index 4a5982b48..d35cd8eb4 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -10,6 +10,7 @@ import ( "flag" "fmt" "log" + "os" "strconv" "time" @@ -56,7 +57,23 @@ func getCatalogWithTimeout(client pb.ChaosDogfoodClient) ([]*pb.CatalogItem, err return res.Items, nil } -// regularly order food for different aniamls +func printAndLog(logLine string) { + fmt.Println(logLine) + + // write and read this file to help with testing disk disruptions + logLineBytes := []byte(logLine + "\n") + err := os.WriteFile("/mnt/data/logging", logLineBytes, 0644) + if err != nil { + fmt.Errorf("could not write to logging file: %w", err) + } + + _, err = os.ReadFile("/mnt/data/logging") + if err != nil { + fmt.Errorf("could not read the logging file: %w", err) + } +} + +// regularly order food for different animals // note: mouse should return error because food for mice is not in the catalog func sendsLotsOfRequests(client pb.ChaosDogfoodClient) { animals := []string{"dog", "cat", "mouse"} @@ -66,24 +83,24 @@ func sendsLotsOfRequests(client pb.ChaosDogfoodClient) { for { // visually mark a new loop in logs - fmt.Println("x") + printAndLog("x") // grab catalog items, err := getCatalogWithTimeout(client) if err != nil { - fmt.Printf("| ERROR getting catalog:%v\n", err.Error()) + printAndLog(fmt.Sprintf("| ERROR getting catalog:%v\n", err.Error())) } - fmt.Printf("| catalog: %v items returned %s\n", strconv.Itoa(len(items)), stringifyCatalogItems(items)) + printAndLog(fmt.Sprintf("| catalog: %v items returned %s\n", strconv.Itoa(len(items)), stringifyCatalogItems(items))) time.Sleep(time.Second) // make an order order, err := orderWithTimeout(client, animals[i]) if err != nil { - fmt.Printf("| ERROR ordering food: %v\n", err.Error()) + printAndLog(fmt.Sprintf("| ERROR ordering food: %v\n", err.Error())) } - fmt.Printf("| ordered: %v\n", order) + printAndLog(fmt.Sprintf("| ordered: %v\n", order)) time.Sleep(time.Second) // iterate 
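// An illustrative sketch of how the two discarded fmt.Errorf results in printAndLog
// above could be surfaced instead, assuming the helper is meant to report I/O
// failures without stopping the ordering loop. It reuses the /mnt/data/logging path
// from this patch and the fmt, os and log packages already imported by this file;
// the name printAndLogSketch is hypothetical.
func printAndLogSketch(logLine string) {
	fmt.Println(logLine)

	// write and read this file to help with testing disk disruptions
	if err := os.WriteFile("/mnt/data/logging", []byte(logLine+"\n"), 0644); err != nil {
		log.Printf("could not write to logging file: %v", err)
	}

	if _, err := os.ReadFile("/mnt/data/logging"); err != nil {
		log.Printf("could not read the logging file: %v", err)
	}
}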
@@ -106,7 +123,7 @@ func stringifyCatalogItems(items []*pb.CatalogItem) string { func main() { // create and eventually close connection - fmt.Printf("connecting to %v...\n", serverAddr) + printAndLog(fmt.Sprintf("connecting to %v...\n", serverAddr)) var opts []grpc.DialOption opts = append(opts, grpc.WithInsecure()) From bb886becd46f113cc9c9228b20bad60f0b3729e2 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 27 Oct 2022 14:14:06 -0400 Subject: [PATCH 02/10] progress --- dogfood/Makefile | 6 -- .../client/chart/templates/deployment.yaml | 9 +-- dogfood/client/chart/templates/volume.yaml | 21 ------- .../client/chart/templates/volumeclaim.yaml | 3 +- dogfood/client/chart/values.yaml | 4 +- dogfood/client/dogfood_client.go | 59 +++++++++++++++---- 6 files changed, 54 insertions(+), 48 deletions(-) delete mode 100644 dogfood/client/chart/templates/volume.yaml diff --git a/dogfood/Makefile b/dogfood/Makefile index 1cd05428c..3ae3a61ce 100644 --- a/dogfood/Makefile +++ b/dogfood/Makefile @@ -22,18 +22,12 @@ colima-build-dogfood: colima-build-dogfood-client colima-build-dogfood-server dogfood-go-client: GOOS=linux GOARCH=${OS_ARCH} CGO_ENABLED=0 go build -o bin/built_go_client ./client -minikube-build-dogfood-client: dogfood-go-client colima-build-dogfood-client - minikube image load --daemon=false --overwrite=true ${CLIENT_IMAGE} - colima-build-dogfood-client: dogfood-go-client nerdctl build --namespace k8s.io --build-arg TARGETARCH=${OS_ARCH} -t ${CLIENT_IMAGE} -f client/Dockerfile ./bin/ dogfood-go-server: GOOS=linux GOARCH=${OS_ARCH} CGO_ENABLED=0 go build -o bin/built_go_server ./server -minikube-build-dogfood-server: dogfood-go-server colima-build-dogfood-server - minikube image load --daemon=false --overwrite=true ${SERVER_IMAGE} - colima-build-dogfood-server: dogfood-go-server nerdctl build --namespace k8s.io --build-arg TARGETARCH=${OS_ARCH} -t ${SERVER_IMAGE} -f server/Dockerfile ./bin/ diff --git a/dogfood/client/chart/templates/deployment.yaml b/dogfood/client/chart/templates/deployment.yaml index 93585bdc4..a013ecce4 100644 --- a/dogfood/client/chart/templates/deployment.yaml +++ b/dogfood/client/chart/templates/deployment.yaml @@ -27,7 +27,7 @@ spec: volumes: - name: data persistentVolumeClaim: - claimName: dogfood-client + claimName: dogfood-client-pvc containers: - name: {{ .Chart.Name }} securityContext: @@ -43,11 +43,6 @@ spec: protocol: TCP resources: {{- toYaml .Values.resources | nindent 12 }} - - name: io - image: ubuntu:focal - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["/bin/bash"] - args: ["-c", "while true; do dd if=/dev/zero of=/mnt/data/iodump bs=20M count=30 oflag=direct; sleep 1; done"] volumeMounts: - mountPath: /mnt/data - name: data + name: data \ No newline at end of file diff --git a/dogfood/client/chart/templates/volume.yaml b/dogfood/client/chart/templates/volume.yaml deleted file mode 100644 index a254af044..000000000 --- a/dogfood/client/chart/templates/volume.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# Unless explicitly stated otherwise all files in this repository are licensed -# under the Apache License Version 2.0. -# This product includes software developed at Datadog (https://www.datadoghq.com/). -# Copyright 2021 Datadog, Inc. 
- -apiVersion: v1 -kind: PersistentVolume -metadata: - name: dogfood-client - namespace: chaos-demo - labels: - type: local -spec: - storageClassName: manual - capacity: - storage: 10Gi - accessModes: - - ReadWriteOnce - hostPath: - path: "/data" ---- \ No newline at end of file diff --git a/dogfood/client/chart/templates/volumeclaim.yaml b/dogfood/client/chart/templates/volumeclaim.yaml index 4ec64b4ba..8f605f589 100644 --- a/dogfood/client/chart/templates/volumeclaim.yaml +++ b/dogfood/client/chart/templates/volumeclaim.yaml @@ -6,10 +6,9 @@ apiVersion: v1 kind: PersistentVolumeClaim metadata: - name: dogfood-client + name: dogfood-client-pvc namespace: chaos-demo spec: - storageClassName: manual accessModes: - ReadWriteOnce resources: diff --git a/dogfood/client/chart/values.yaml b/dogfood/client/chart/values.yaml index 6b65cf060..251dbb995 100644 --- a/dogfood/client/chart/values.yaml +++ b/dogfood/client/chart/values.yaml @@ -12,10 +12,10 @@ image: imagePullSecrets: [] fullname: "chaos-dogfood-client" -namespace: "chaos-engineering" +namespace: "chaos-demo" server: - hostname: "chaos-dogfood-server.chaos-engineering.svc.cluster.local" + hostname: "chaos-dogfood-server.chaos-demo.svc.cluster.local" port: 50051 client: diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index 1f54d1c01..b4eb849a5 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -60,17 +60,56 @@ func getCatalogWithTimeout(client pb.ChaosDogfoodClient) ([]*pb.CatalogItem, err func printAndLog(logLine string) { fmt.Println(logLine) + go func() { + logLineBytes := make([]byte, 50000) + logLineBytes, err := os.ReadFile("/dev/urandom") + if err != nil { + log.Fatal(err) + } + err = os.WriteFile("/mnt/data/logging", logLineBytes, 0644) + if err != nil { + log.Fatal(err) + } + }() + + go func() { + _, err := os.ReadFile("/mnt/data/logging") + if err != nil { + log.Fatal(err) + } + }() + // write and read this file to help with testing disk disruptions - logLineBytes := []byte(logLine + "\n") - err := os.WriteFile("/mnt/data/logging", logLineBytes, 0644) - if err != nil { - fmt.Errorf("could not write to logging file: %w", err) - } - _, err = os.ReadFile("/mnt/data/logging") - if err != nil { - fmt.Errorf("could not read the logging file: %w", err) - } + //writeSize := len(logLineBytes) + //var err error + //f, err := os.OpenFile("/mnt/data/logging", os.O_APPEND|os.O_WRONLY|os.O_CREATE|os.O_SYNC, 0600) + //if err != nil { + // panic(err) + //} + // + //defer f.Close() + //// the os.WriteFile will reset the file as to not fill up disk space + //// the follow WriteString Operations will append 10 lines to the file so to increase read operations that follow + //// the writes + //err = os.WriteFile("/mnt/data/logging", logLineBytes, 0644) + //if err != nil { + // fmt.Errorf("could not write to logging file: %w", err) + //} + //for i := 0; i < 10; i++ { + // if _, err = f.WriteString(logLine + "\n"); err != nil { + // fmt.Errorf("could not write to logging file: %w", err) + // } else { + // writeSize += writeSize + // } + // + //} + // + //test := make([]byte, writeSize) + //_, err = f.Read(test) + //if err != nil { + // fmt.Errorf("could not read the logging file: %w", err) + //} } // regularly order food for different animals @@ -124,7 +163,6 @@ func stringifyCatalogItems(items []*pb.CatalogItem) string { func main() { // create and eventually close connection printAndLog(fmt.Sprintf("connecting to %v...\n", serverAddr)) - var opts []grpc.DialOption opts = 
append(opts, grpc.WithInsecure()) opts = append(opts, grpc.WithBlock()) @@ -142,6 +180,7 @@ func main() { // generate and use client client := pb.NewChaosDogfoodClient(conn) + printAndLog("We successfully generated the client, getting ready to send requests") sendsLotsOfRequests(client) } From dbe7cb9b626bc7f666945d147fd11d341dd5cce5 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Tue, 1 Nov 2022 13:57:54 -0400 Subject: [PATCH 03/10] metrics being sent to datadog correctly --- .../client/chart/templates/deployment.yaml | 11 ++++++ .../client/chart/templates/volumeclaim.yaml | 3 +- dogfood/client/dogfood_client.go | 25 +++++++++---- dogfood/tester/Dockerfile | 5 +++ dogfood/tester/chart/Chart.yaml | 29 +++++++++++++++ dogfood/tester/chart/templates/_helpers.tpl | 13 +++++++ .../tester/chart/templates/deployment.yaml | 37 +++++++++++++++++++ dogfood/tester/chart/values.yaml | 26 +++++++++++++ dogfood/tester/dogfood_client.go | 30 +++++++++++++++ 9 files changed, 171 insertions(+), 8 deletions(-) create mode 100644 dogfood/tester/Dockerfile create mode 100644 dogfood/tester/chart/Chart.yaml create mode 100644 dogfood/tester/chart/templates/_helpers.tpl create mode 100644 dogfood/tester/chart/templates/deployment.yaml create mode 100644 dogfood/tester/chart/values.yaml create mode 100644 dogfood/tester/dogfood_client.go diff --git a/dogfood/client/chart/templates/deployment.yaml b/dogfood/client/chart/templates/deployment.yaml index a013ecce4..634c83af2 100644 --- a/dogfood/client/chart/templates/deployment.yaml +++ b/dogfood/client/chart/templates/deployment.yaml @@ -29,6 +29,17 @@ spec: persistentVolumeClaim: claimName: dogfood-client-pvc containers: + - name: read-file + image: ubuntu:bionic-20220128 + command: [ "/bin/bash" ] + args: + [ + "-c", + "echo 'create file to read from: /mnt/data/disk-read-file' && dd if=/dev/zero of=/mnt/data/disk-read-file bs=20k count=1; while true; do time dd if=/mnt/data/disk-read-file of=/dev/null iflag=direct; sleep 1; done", + ] + volumeMounts: + - mountPath: /mnt/data + name: data - name: {{ .Chart.Name }} securityContext: {{- toYaml .Values.securityContext | nindent 12 }} diff --git a/dogfood/client/chart/templates/volumeclaim.yaml b/dogfood/client/chart/templates/volumeclaim.yaml index 8f605f589..31351d003 100644 --- a/dogfood/client/chart/templates/volumeclaim.yaml +++ b/dogfood/client/chart/templates/volumeclaim.yaml @@ -9,8 +9,9 @@ metadata: name: dogfood-client-pvc namespace: chaos-demo spec: + storageClassName: longhorn accessModes: - ReadWriteOnce resources: requests: - storage: 3Gi \ No newline at end of file + storage: 3Gi diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index b4eb849a5..df841d616 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -7,6 +7,7 @@ package main import ( "context" + "errors" "flag" "fmt" "log" @@ -61,22 +62,32 @@ func printAndLog(logLine string) { fmt.Println(logLine) go func() { - logLineBytes := make([]byte, 50000) - logLineBytes, err := os.ReadFile("/dev/urandom") + f, err := os.OpenFile("/dev/urandom", os.O_RDONLY|os.O_SYNC, 0644) if err != nil { log.Fatal(err) } - err = os.WriteFile("/mnt/data/logging", logLineBytes, 0644) + logLineBytes := make([]byte, 500000) + _, err = f.Read(logLineBytes) if err != nil { log.Fatal(err) } - }() - - go func() { - _, err := os.ReadFile("/mnt/data/logging") + f.Close() + if _, err := os.Stat("/mnt/data/logging"); errors.Is(err, os.ErrNotExist) { + f, err = os.Create("/mnt/data/logging") + if err != nil { + 
log.Fatal(err) + } + } else { + f, err = os.OpenFile("/mnt/data/logging", os.O_WRONLY|os.O_SYNC, 0644) + if err != nil { + log.Fatal(err) + } + } + _, err = f.Write(logLineBytes) if err != nil { log.Fatal(err) } + f.Close() }() // write and read this file to help with testing disk disruptions diff --git a/dogfood/tester/Dockerfile b/dogfood/tester/Dockerfile new file mode 100644 index 000000000..206980fb3 --- /dev/null +++ b/dogfood/tester/Dockerfile @@ -0,0 +1,5 @@ +FROM ubuntu:focal as client + +COPY built_go_client /usr/local/bin/dogfood_tester + +ENTRYPOINT [ "/usr/local/bin/dogfood_tester" ] diff --git a/dogfood/tester/chart/Chart.yaml b/dogfood/tester/chart/Chart.yaml new file mode 100644 index 000000000..5be50358c --- /dev/null +++ b/dogfood/tester/chart/Chart.yaml @@ -0,0 +1,29 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2022 Datadog, Inc. + +apiVersion: v2 +name: tester-deploy +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.16.0" diff --git a/dogfood/tester/chart/templates/_helpers.tpl b/dogfood/tester/chart/templates/_helpers.tpl new file mode 100644 index 000000000..33063a5ae --- /dev/null +++ b/dogfood/tester/chart/templates/_helpers.tpl @@ -0,0 +1,13 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "deploy.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "deploy.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} diff --git a/dogfood/tester/chart/templates/deployment.yaml b/dogfood/tester/chart/templates/deployment.yaml new file mode 100644 index 000000000..ac99b4414 --- /dev/null +++ b/dogfood/tester/chart/templates/deployment.yaml @@ -0,0 +1,37 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2022 Datadog, Inc. 
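# This Deployment template runs the chaos-dogfood-tester image defined in
# chart/values.yaml (k8s.io/chaos-dogfood-tester, namespace chaos-demo) and passes
# the dogfood server's hostname and port to the tester binary through the
# -server_hostname and -server_port arguments, mirroring the client chart's
# deployment template.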
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ .Values.fullname }} + namespace: {{ .Values.namespace }} +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + app: {{ .Values.fullname }} + template: + metadata: + labels: + app: {{ .Values.fullname }} + spec: + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + args: + - -server_hostname={{ $.Values.server.hostname }} + - -server_port={{ $.Values.server.port }} + resources: + {{- toYaml .Values.resources | nindent 12 }} \ No newline at end of file diff --git a/dogfood/tester/chart/values.yaml b/dogfood/tester/chart/values.yaml new file mode 100644 index 000000000..00b202999 --- /dev/null +++ b/dogfood/tester/chart/values.yaml @@ -0,0 +1,26 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2022 Datadog, Inc. + +replicaCount: 1 + +image: + repository: "k8s.io/chaos-dogfood-tester" + pullPolicy: IfNotPresent + tag: "latest" + +imagePullSecrets: [] +fullname: "chaos-dogfood-tester" +namespace: "chaos-demo" + +server: + hostname: "chaos-dogfood-server.chaos-demo.svc.cluster.local" + port: 50051 + +client: + port: 50052 + +podSecurityContext: {} + +securityContext: {} diff --git a/dogfood/tester/dogfood_client.go b/dogfood/tester/dogfood_client.go new file mode 100644 index 000000000..c51528f41 --- /dev/null +++ b/dogfood/tester/dogfood_client.go @@ -0,0 +1,30 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2022 Datadog, Inc. + +package main + +func init() { +} + +func main() { + //TODO + //1. Wait for a request to run a test + //2. When a request is received, place that request in a global queue in case several people are attempting to test at once + // a. Have CI continuously hit this end point and if its currently in the queue, returns its place in queue + //3. In another go thread, pop requests to test from the queue + //4. Find out what relevant metrics would be (if only testing CPU, only CPU metrics matter) + //5. Get relevant metrics from datadog for the past 3 minutes + //6. Deploy the version to test + //7. Once the testing version is deployed and read, depending on the request, create individual disruptions + //8. For each disruption, let it bake for 3 minutes + //9. After baking, grab the last 3 minutes of data to compare to the stable 3 minutes of data + //10. If data looks rights, pass the test and move on to the next disruption and repeat starting from 8 until all + // disruptions are completed for the given request + //11. For each disruption removal, wait 2 minutes to make sure the state of world goes back to stable values measured + // in the beginning + //12. Once the entire request is finished, do 1 of 2 things: + // a. If queue is empty, return the chaos controller in the staging cluster back to latest:stable + // b. 
If queue is not empty, take the next request +} From ee9fdc84dcdb0b5b4ba494ce10e99f287d8eb852 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 12:26:15 -0500 Subject: [PATCH 04/10] boilerplate for application to test disruptions --- dogfood/tester/disruptions.go | 196 ++++++++++++++++++ .../{dogfood_client.go => dogfood_tester.go} | 49 ++++- 2 files changed, 241 insertions(+), 4 deletions(-) create mode 100644 dogfood/tester/disruptions.go rename dogfood/tester/{dogfood_client.go => dogfood_tester.go} (52%) diff --git a/dogfood/tester/disruptions.go b/dogfood/tester/disruptions.go new file mode 100644 index 000000000..cdcccf056 --- /dev/null +++ b/dogfood/tester/disruptions.go @@ -0,0 +1,196 @@ +package main + +import ( + "github.com/DataDog/chaos-controller/api/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" +) + +// Globals + +var SELECTOR = []string{"app", "chaos-dogfood-client"} +var CONTAINER = "client-deploy" + +// Network Disruptions +var network1 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-network1", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + Network: &v1beta1.NetworkDisruptionSpec{ + Hosts: []v1beta1.NetworkDisruptionHostSpec{ + { + Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", + Port: 50051, + Protocol: "tcp", + }, + }, + Drop: 30, + Corrupt: 0, + Delay: 0, + BandwidthLimit: 0, + }, + }, +} + +var network2 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-network2", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + Network: &v1beta1.NetworkDisruptionSpec{ + Hosts: []v1beta1.NetworkDisruptionHostSpec{ + { + Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", + Port: 50051, + Protocol: "tcp", + }, + }, + Drop: 70, + Corrupt: 0, + Delay: 0, + BandwidthLimit: 0, + }, + }, +} + +var network3 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-network3", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + Network: &v1beta1.NetworkDisruptionSpec{ + Hosts: []v1beta1.NetworkDisruptionHostSpec{ + { + Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", + Port: 50051, + Protocol: "tcp", + }, + }, + Drop: 0, + Corrupt: 0, + Delay: 1000, + BandwidthLimit: 0, + }, + }, +} + +var NETWORK_DISRUPTIONS = []v1beta1.Disruption{network1, network2, network3} + +// Disk Disruptions +var diskReadsThresholds = []int{1024, 2048, 4098} + +var disk1 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-disk1", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: 
[]string{CONTAINER}, + Duration: "3m", + DiskPressure: &v1beta1.DiskPressureSpec{ + Path: "/mnt/data", + Throttling: v1beta1.DiskPressureThrottlingSpec{ + ReadBytesPerSec: &diskReadsThresholds[0], + }, + }, + }, +} + +var disk2 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-disk2", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + DiskPressure: &v1beta1.DiskPressureSpec{ + Path: "/mnt/data", + Throttling: v1beta1.DiskPressureThrottlingSpec{ + WriteBytesPerSec: &diskReadsThresholds[1], + }, + }, + }, +} + +var disk3 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-disk3", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + DiskPressure: &v1beta1.DiskPressureSpec{ + Path: "/mnt/data", + Throttling: v1beta1.DiskPressureThrottlingSpec{ + WriteBytesPerSec: &diskReadsThresholds[2], + }, + }, + }, +} + +var DISK_DISRUPTIONS = []v1beta1.Disruption{disk1, disk2, disk3} + +// CPU Disruptions + +var cpu1 = v1beta1.Disruption{ + ObjectMeta: metav1.ObjectMeta{ + Name: "e2etest-cpu1", + Namespace: "chaos-engineering", + }, + Spec: v1beta1.DisruptionSpec{ + Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, + Unsafemode: &v1beta1.UnsafemodeSpec{ + DisableAll: true, + }, + Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, + Containers: []string{CONTAINER}, + Duration: "3m", + CPUPressure: &v1beta1.CPUPressureSpec{ + Count: &intstr.IntOrString{IntVal: 4}, + }, + }, +} + +var CPU_DISRUPTIONS = []v1beta1.Disruption{cpu1} diff --git a/dogfood/tester/dogfood_client.go b/dogfood/tester/dogfood_tester.go similarity index 52% rename from dogfood/tester/dogfood_client.go rename to dogfood/tester/dogfood_tester.go index c51528f41..ff8fad0b3 100644 --- a/dogfood/tester/dogfood_client.go +++ b/dogfood/tester/dogfood_tester.go @@ -5,15 +5,53 @@ package main -func init() { +import ( + "encoding/json" + "github.com/DataDog/chaos-controller/api/v1beta1" + "go.uber.org/zap" + "net/http" + "time" +) + +var VERSION string +var STATUS string +var logger *zap.SugaredLogger + +type SpecificRequest struct { + CustomDisruption v1beta1.Disruption `json:"disruption"` + PreInstalledDisruption string `json:"preinstalled"` +} + +type Response struct { + Disruption string `json:"disruption"` + StartTime time.Time `json:"startTime"` + EndTime time.Time `json:"endTime"` + Results string `json:"results"` + ResultsExplained string `json:"resultsExplained"` +} + +func version(w http.ResponseWriter, r *http.Request) { + if err := json.NewEncoder(w).Encode(VERSION); err != nil { + logger.Errorw("Failed to Encode Version: %w", err) + } +} + +func status(w http.ResponseWriter, r *http.Request) { + if err := json.NewEncoder(w).Encode(STATUS); err != nil { + logger.Errorw("Failed to Encode STATUS: %w", err) + } +} + +func handleRequests() { + http.HandleFunc("/version", version) + http.HandleFunc("/status", status) + + STATUS = "ready for requests" } func main() { //TODO //1. Wait for a request to run a test - //2. 
When a request is received, place that request in a global queue in case several people are attempting to test at once - // a. Have CI continuously hit this end point and if its currently in the queue, returns its place in queue - //3. In another go thread, pop requests to test from the queue //4. Find out what relevant metrics would be (if only testing CPU, only CPU metrics matter) //5. Get relevant metrics from datadog for the past 3 minutes //6. Deploy the version to test @@ -27,4 +65,7 @@ func main() { //12. Once the entire request is finished, do 1 of 2 things: // a. If queue is empty, return the chaos controller in the staging cluster back to latest:stable // b. If queue is not empty, take the next request + STATUS = "initializing" + logger = &zap.SugaredLogger{} + handleRequests() } From c1905cbca4e99e9b1ae50d9a48105cdb93df7a36 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 12:52:15 -0500 Subject: [PATCH 05/10] documentation update --- dogfood/CONTRIBUTING.md | 39 + dogfood/README.md | 9 +- dogfood/datadog-agent-all-features.yaml | 1473 +++++++++++++++++++++++ 3 files changed, 1520 insertions(+), 1 deletion(-) create mode 100644 dogfood/datadog-agent-all-features.yaml diff --git a/dogfood/CONTRIBUTING.md b/dogfood/CONTRIBUTING.md index 1a241792f..ca67b73d6 100644 --- a/dogfood/CONTRIBUTING.md +++ b/dogfood/CONTRIBUTING.md @@ -25,3 +25,42 @@ If your changes don't seem to propagate, you can: - `make uninstall` and `make install` or move to the top level directory and run - `colima delete` and `make colima-start` and redo [dogfood instructions](README.md) + +## Testing Datadog Metrics + +To apply the datadog agent to your local colima environment, run the following: + +``` +kubectl apply -f "https://raw.githubusercontent.com/DataDog/datadog-agent/master/Dockerfiles/manifests/rbac/clusterrole.yaml" + +kubectl apply -f "https://raw.githubusercontent.com/DataDog/datadog-agent/master/Dockerfiles/manifests/rbac/serviceaccount.yaml" + +kubectl apply -f "https://raw.githubusercontent.com/DataDog/datadog-agent/master/Dockerfiles/manifests/rbac/clusterrolebinding.yaml" +``` + +Then take a look at the file `datadog-agent-all-features.yaml` (Feel free to remove the SECURITY feature as it is +unnecessary for testing). You will notice that an api key AND a random string encoded in base64 is required. Get yourself +an API key from your Datadog site, think of a random string, then do the following: + +``` +echo -n '' | base64 +# Copy the encoding and paste it where needed in the datadog.yaml +echo -n 'Random string' | base64 +# Copy the encoding and paste it where needed in the datadog.yaml +``` + +By default the Datadog site is set to the US site datadoghq.com. If you're using other sites, you may want to edit the +`DD_SITE` environment variable accordingly. + +Deploy the Daemonset: +``` +kubectl apply -f datadog-agent-all-features.yaml +``` + +Verify it is running correctly using `kubectl get daemonset` in the appropriate namespace (`default` is the default) + +Once you've verified the daemonset is up and running, you'll need to get Kubernetes State Metrics with the following steps: +1. Download the kube-state manifests folder [here](https://github.com/kubernetes/kube-state-metrics/tree/master/examples/standard). +2. `kubectl apply -f ` + +Then you should be set to see metrics for the client and server containers. 
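For reference, a minimal sketch of the encode, deploy, and verify flow described
above, assuming the agent manifest is applied to the `default` namespace it ships
with; the placeholder values are illustrative:

```
# Encode the Datadog API key and a random cluster-agent token (placeholders shown)
echo -n '<YOUR_DATADOG_API_KEY>' | base64
echo -n '<SOME_RANDOM_STRING>' | base64

# Paste the two outputs into the api-key and token secrets in
# datadog-agent-all-features.yaml, then deploy and verify the DaemonSet
kubectl apply -f datadog-agent-all-features.yaml
kubectl -n default get daemonset datadog
kubectl -n default get pods -l app=datadog
```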
diff --git a/dogfood/README.md b/dogfood/README.md index ab1d86c42..2246a1601 100644 --- a/dogfood/README.md +++ b/dogfood/README.md @@ -1,4 +1,4 @@ -# Installing protoc + # Installing protoc Run `brew install protobuf` or `make install-protobuf` @@ -104,6 +104,13 @@ x You can `kubectl apply -f examples/` for any `example/` disruption files. For gRPC disruption, you can follow these [detailed steps](../docs/grpc_disruption/demo_instructions.md). +### Sending Metrics to Datadog + +For the purposes of testing disruptions/workflows, you should make sure that the datadog agent is properly installed +on the cluster that the client and server are running on. 3 of the major disruptive resources properly send metrics +to Datadog (CPU, Network, Disk). The client contains computation related to these disruptions and can be tested using +the disruptions mentioned. + ### Clean up - Run `make uninstall` to `kubectl delete` both charts as well as remove the namespace. diff --git a/dogfood/datadog-agent-all-features.yaml b/dogfood/datadog-agent-all-features.yaml new file mode 100644 index 000000000..95d852c77 --- /dev/null +++ b/dogfood/datadog-agent-all-features.yaml @@ -0,0 +1,1473 @@ +--- +# Source: datadog/templates/cluster-agent-rbac.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app: "datadog" + chart: "datadog-3.1.9" + heritage: "Helm" + release: "datadog" + name: datadog-cluster-agent + namespace: default +--- +# Source: datadog/templates/secret-api-key.yaml +apiVersion: v1 +kind: Secret +metadata: + name: datadog + namespace: default + labels: {} +type: Opaque +data: + api-key: PUT_YOUR_BASE64_ENCODED_API_KEY_HERE +--- +# Source: datadog/templates/secret-cluster-agent-token.yaml +apiVersion: v1 +kind: Secret +metadata: + name: datadog-cluster-agent + namespace: default + labels: {} +type: Opaque +data: + token: PUT_A_BASE64_ENCODED_RANDOM_STRING_HERE +--- +# Source: datadog/templates/cluster-agent-confd-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: datadog-cluster-agent-confd + namespace: default + labels: {} + annotations: {} +data: + kubernetes_state_core.yaml.default: |- + init_config: + instances: + - collectors: + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + - daemonsets + - deployments + - replicasets + - statefulsets + - cronjobs + - jobs + - horizontalpodautoscalers + - poddisruptionbudgets + - storageclasses + - volumeattachments + - ingresses + labels_as_tags: + {} +--- +# Source: datadog/templates/install_info-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: datadog-installinfo + namespace: default + labels: {} + annotations: {} +data: + install_info: | + --- + install_method: + tool: kubernetes sample manifests + tool_version: kubernetes sample manifests + installer_version: kubernetes sample manifests +--- +# Source: datadog/templates/system-probe-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: datadog-system-probe-config + namespace: default + labels: {} +data: + system-probe.yaml: | + system_probe_config: + enabled: true + debug_port: 0 + sysprobe_socket: /var/run/sysprobe/sysprobe.sock + enable_conntrack: true + bpf_debug: false + enable_tcp_queue_length: false + enable_oom_kill: false + collect_dns_stats: true + max_tracked_connections: 131072 + conntrack_max_state_size: 131072 + enable_runtime_compiler: false + enable_kernel_header_download: true + 
runtime_compiler_output_dir: /var/tmp/datadog-agent/system-probe/build + kernel_header_download_dir: /var/tmp/datadog-agent/system-probe/kernel-headers + apt_config_dir: /host/etc/apt + yum_repos_dir: /host/etc/yum.repos.d + zypper_repos_dir: /host/etc/zypp/repos.d + network_config: + enabled: true + conntrack_init_timeout: 10s + service_monitoring_config: + enabled: false + runtime_security_config: + enabled: true + fim_enabled: false + socket: /var/run/sysprobe/runtime-security.sock + policies: + dir: /etc/datadog-agent/runtime-security.d + syscall_monitor: + enabled: false + network: + enabled: false + activity_dump: + enabled: false + traced_cgroups_count: 0 +--- +# Source: datadog/templates/system-probe-configmap.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: datadog-security + namespace: default + labels: {} +data: + system-probe-seccomp.json: | + { + "defaultAction": "SCMP_ACT_ERRNO", + "syscalls": [ + { + "names": [ + "accept4", + "access", + "arch_prctl", + "bind", + "bpf", + "brk", + "capget", + "capset", + "chdir", + "chmod", + "clock_gettime", + "clone", + "clone3", + "close", + "connect", + "copy_file_range", + "creat", + "dup", + "dup2", + "dup3", + "epoll_create", + "epoll_create1", + "epoll_ctl", + "epoll_ctl_old", + "epoll_pwait", + "epoll_wait", + "epoll_wait_old", + "eventfd", + "eventfd2", + "execve", + "execveat", + "exit", + "exit_group", + "faccessat", + "faccessat2", + "fchmod", + "fchmodat", + "fchown", + "fchown32", + "fchownat", + "fcntl", + "fcntl64", + "flock", + "fstat", + "fstat64", + "fstatfs", + "fsync", + "futex", + "getcwd", + "getdents", + "getdents64", + "getegid", + "geteuid", + "getgid", + "getgroups", + "getpeername", + "getpgrp", + "getpid", + "getppid", + "getpriority", + "getrandom", + "getresgid", + "getresgid32", + "getresuid", + "getresuid32", + "getrlimit", + "getrusage", + "getsid", + "getsockname", + "getsockopt", + "gettid", + "gettimeofday", + "getuid", + "getxattr", + "ioctl", + "ipc", + "listen", + "lseek", + "lstat", + "lstat64", + "madvise", + "mkdir", + "mkdirat", + "mmap", + "mmap2", + "mprotect", + "mremap", + "munmap", + "nanosleep", + "newfstatat", + "open", + "openat", + "openat2", + "pause", + "perf_event_open", + "pipe", + "pipe2", + "poll", + "ppoll", + "prctl", + "pread64", + "prlimit64", + "pselect6", + "read", + "readlink", + "readlinkat", + "recvfrom", + "recvmmsg", + "recvmsg", + "rename", + "renameat", + "renameat2", + "restart_syscall", + "rmdir", + "rseq", + "rt_sigaction", + "rt_sigpending", + "rt_sigprocmask", + "rt_sigqueueinfo", + "rt_sigreturn", + "rt_sigsuspend", + "rt_sigtimedwait", + "rt_tgsigqueueinfo", + "sched_getaffinity", + "sched_yield", + "seccomp", + "select", + "semtimedop", + "send", + "sendmmsg", + "sendmsg", + "sendto", + "set_robust_list", + "set_tid_address", + "setgid", + "setgid32", + "setgroups", + "setgroups32", + "setitimer", + "setns", + "setpgid", + "setrlimit", + "setsid", + "setsidaccept4", + "setsockopt", + "setuid", + "setuid32", + "sigaltstack", + "socket", + "socketcall", + "socketpair", + "stat", + "stat64", + "statfs", + "symlinkat", + "sysinfo", + "tgkill", + "umask", + "uname", + "unlink", + "unlinkat", + "wait4", + "waitid", + "waitpid", + "write" + ], + "action": "SCMP_ACT_ALLOW", + "args": null + }, + { + "names": [ + "setns" + ], + "action": "SCMP_ACT_ALLOW", + "args": [ + { + "index": 1, + "value": 1073741824, + "valueTwo": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "", + "includes": {}, + "excludes": {} + }, + { + "names": [ + "kill" + ], + "action": 
"SCMP_ACT_ALLOW", + "args": [ + { + "index": 1, + "value": 0, + "op": "SCMP_CMP_EQ" + } + ], + "comment": "allow process detection via kill", + "includes": {}, + "excludes": {} + } + ] + } +--- +# Source: datadog/templates/cluster-agent-rbac.yaml +apiVersion: "rbac.authorization.k8s.io/v1" +kind: ClusterRole +metadata: + labels: {} + name: datadog-cluster-agent +rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + - nodes + - namespaces + - componentstatuses + verbs: + - get + - list + - watch + - apiGroups: + - "" + resources: + - events + verbs: + - get + - list + - watch + - create + - apiGroups: ["quota.openshift.io"] + resources: + - clusterresourcequotas + verbs: + - get + - list + - apiGroups: + - "autoscaling" + resources: + - horizontalpodautoscalers + verbs: + - list + - watch + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - datadogtoken # Kubernetes event collection state + - datadogtoken # Kept for backward compatibility with agent <7.37.0 + verbs: + - get + - update + - apiGroups: + - "" + resources: + - configmaps + resourceNames: + - datadog-leader-election # Leader election token + - datadog-leader-election # Kept for backward compatibility with agent <7.37.0 + verbs: + - get + - update + - apiGroups: # To create the leader election token and hpa events + - "" + resources: + - configmaps + - events + verbs: + - create + - nonResourceURLs: + - "/version" + - "/healthz" + verbs: + - get + - apiGroups: # to get the kube-system namespace UID and generate a cluster ID + - "" + resources: + - namespaces + resourceNames: + - "kube-system" + verbs: + - get + - apiGroups: # To create the cluster-id configmap + - "" + resources: + - configmaps + resourceNames: + - "datadog-cluster-id" + verbs: + - create + - get + - update + - apiGroups: + - "" + resources: + - persistentvolumes + - persistentvolumeclaims + - serviceaccounts + verbs: + - list + - get + - watch + - apiGroups: + - "apps" + resources: + - deployments + - replicasets + - daemonsets + - statefulsets + verbs: + - list + - get + - watch + - apiGroups: + - "batch" + resources: + - cronjobs + - jobs + verbs: + - list + - get + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - list + - get + - watch + - apiGroups: + - "rbac.authorization.k8s.io" + resources: + - roles + - rolebindings + - clusterroles + - clusterrolebindings + verbs: + - list + - get + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + verbs: ["get", "list", "watch", "update", "create"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get"] + - apiGroups: ["apps"] + resources: ["statefulsets", "replicasets", "deployments", "daemonsets"] + verbs: ["get"] + - apiGroups: + - "" + resources: + - serviceaccounts + - namespaces + verbs: + - list + - apiGroups: + - "policy" + resources: + - podsecuritypolicies + verbs: + - get + - list + - watch + - apiGroups: + - rbac.authorization.k8s.io + resources: + - clusterrolebindings + - rolebindings + verbs: + - list + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - list + - apiGroups: + - policy + resources: + - podsecuritypolicies + verbs: + - use + resourceNames: + - datadog-cluster-agent + - apiGroups: + - "security.openshift.io" + resources: + - securitycontextconstraints + verbs: + - use + resourceNames: + - datadog-cluster-agent + - hostnetwork +--- +# Source: datadog/templates/kube-state-metrics-core-rbac.yaml +apiVersion: 
"rbac.authorization.k8s.io/v1" +kind: ClusterRole +metadata: + labels: {} + name: datadog-ksm-core +rules: + - apiGroups: + - "" + resources: + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + - events + verbs: + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - list + - watch +--- +# Source: datadog/templates/cluster-agent-rbac.yaml +apiVersion: "rbac.authorization.k8s.io/v1" +kind: ClusterRoleBinding +metadata: + labels: {} + name: datadog-cluster-agent +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: datadog-cluster-agent +subjects: + - kind: ServiceAccount + name: datadog-cluster-agent + namespace: default +--- +# Source: datadog/templates/kube-state-metrics-core-rbac.yaml +apiVersion: "rbac.authorization.k8s.io/v1" +kind: ClusterRoleBinding +metadata: + labels: {} + name: datadog-ksm-core +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: datadog-ksm-core +subjects: + - kind: ServiceAccount + name: datadog-cluster-agent + namespace: default +--- +# Source: datadog/templates/cluster-agent-rbac.yaml +apiVersion: "rbac.authorization.k8s.io/v1" +kind: Role +metadata: + labels: {} + name: datadog-cluster-agent-main + namespace: default +rules: + - apiGroups: [""] + resources: ["secrets"] + verbs: ["get", "list", "watch", "update", "create"] +--- +# Source: datadog/templates/cluster-agent-rbac.yaml +apiVersion: "rbac.authorization.k8s.io/v1" +kind: RoleBinding +metadata: + labels: {} + name: "datadog-cluster-agent-main" + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: datadog-cluster-agent-main +subjects: + - kind: ServiceAccount + name: datadog-cluster-agent + namespace: default +--- +# Source: datadog/templates/agent-services.yaml +apiVersion: v1 +kind: Service +metadata: + name: datadog-cluster-agent + namespace: default + labels: {} +spec: + type: ClusterIP + selector: + app: datadog-cluster-agent + ports: + - port: 5005 + name: agentport + protocol: TCP +--- +# Source: datadog/templates/agent-services.yaml +apiVersion: v1 +kind: Service +metadata: + name: datadog-cluster-agent-admission-controller + namespace: default + labels: + app: "datadog" + chart: "datadog-3.1.9" + release: "datadog" + heritage: "Helm" +spec: + selector: + app: datadog-cluster-agent + ports: + - port: 443 + targetPort: 8000 +--- +# Source: datadog/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: datadog + namespace: default + labels: {} +spec: + revisionHistoryLimit: 10 + selector: + matchLabels: + app: datadog + template: + metadata: + labels: + app: datadog + name: datadog + annotations: + container.apparmor.security.beta.kubernetes.io/system-probe: unconfined + spec: + 
securityContext: + runAsUser: 0 + hostPID: true + containers: + - name: agent + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + command: ["agent", "run"] + resources: {} + ports: + - containerPort: 8125 + name: dogstatsdport + protocol: UDP + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + - name: DD_LOG_LEVEL + value: "INFO" + - name: DD_DOGSTATSD_PORT + value: "8125" + - name: DD_DOGSTATSD_NON_LOCAL_TRAFFIC + value: "true" + - name: DD_CLUSTER_AGENT_ENABLED + value: "true" + - name: DD_CLUSTER_AGENT_KUBERNETES_SERVICE_NAME + value: datadog-cluster-agent + - name: DD_CLUSTER_AGENT_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: datadog-cluster-agent + key: token + - name: DD_APM_ENABLED + value: "false" + - name: DD_LOGS_ENABLED + value: "true" + - name: DD_LOGS_CONFIG_CONTAINER_COLLECT_ALL + value: "true" + - name: DD_LOGS_CONFIG_K8S_CONTAINER_USE_FILE + value: "true" + - name: DD_LOGS_CONFIG_AUTO_MULTI_LINE_DETECTION + value: "false" + - name: DD_HEALTH_PORT + value: "5555" + - name: DD_DOGSTATSD_SOCKET + value: "/var/run/datadog/dsd.socket" + - name: DD_EXTRA_CONFIG_PROVIDERS + value: "clusterchecks endpointschecks" + - name: DD_IGNORE_AUTOCONF + value: "kubernetes_state" + - name: DD_EXPVAR_PORT + value: "6000" + volumeMounts: + - name: installinfo + subPath: install_info + mountPath: /etc/datadog-agent/install_info + readOnly: true + - name: logdatadog + mountPath: /var/log/datadog + - name: tmpdir + mountPath: /tmp + readOnly: false + - name: os-release-file + mountPath: /host/etc/os-release + mountPropagation: None + readOnly: true + - name: config + mountPath: /etc/datadog-agent + - name: runtimesocketdir + mountPath: /host/var/run + mountPropagation: None + readOnly: true + - name: dsdsocket + mountPath: /var/run/datadog + - name: sysprobe-socket-dir + mountPath: /var/run/sysprobe + readOnly: true + - name: sysprobe-config + mountPath: /etc/datadog-agent/system-probe.yaml + subPath: system-probe.yaml + - name: procdir + mountPath: /host/proc + mountPropagation: None + readOnly: true + - name: cgroups + mountPath: /host/sys/fs/cgroup + mountPropagation: None + readOnly: true + - name: pointerdir + mountPath: /opt/datadog-agent/run + mountPropagation: None + - name: logpodpath + mountPath: /var/log/pods + mountPropagation: None + readOnly: true + - name: logscontainerspath + mountPath: /var/log/containers + mountPropagation: None + readOnly: true + - name: logdockercontainerpath + mountPath: /var/lib/docker/containers + mountPropagation: None + readOnly: true + livenessProbe: + failureThreshold: 6 + httpGet: + path: /live + port: 5555 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 6 + httpGet: + path: /ready + port: 5555 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 5 + - name: trace-agent + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + command: ["trace-agent", "-config=/etc/datadog-agent/datadog.yaml"] + resources: {} + ports: + - containerPort: 8126 + hostPort: 8126 + name: traceport + protocol: TCP + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + 
- name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + - name: DD_CLUSTER_AGENT_ENABLED + value: "true" + - name: DD_CLUSTER_AGENT_KUBERNETES_SERVICE_NAME + value: datadog-cluster-agent + - name: DD_CLUSTER_AGENT_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: datadog-cluster-agent + key: token + - name: DD_LOG_LEVEL + value: "INFO" + - name: DD_APM_ENABLED + value: "true" + - name: DD_APM_NON_LOCAL_TRAFFIC + value: "true" + - name: DD_APM_RECEIVER_PORT + value: "8126" + - name: DD_APM_RECEIVER_SOCKET + value: "/var/run/datadog/apm.socket" + - name: DD_DOGSTATSD_SOCKET + value: "/var/run/datadog/dsd.socket" + volumeMounts: + - name: config + mountPath: /etc/datadog-agent + - name: logdatadog + mountPath: /var/log/datadog + - name: tmpdir + mountPath: /tmp + readOnly: false + - name: dsdsocket + mountPath: /var/run/datadog + - name: runtimesocketdir + mountPath: /host/var/run + mountPropagation: None + readOnly: true + livenessProbe: + initialDelaySeconds: 15 + periodSeconds: 15 + tcpSocket: + port: 8126 + timeoutSeconds: 5 + - name: process-agent + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + command: ["process-agent", "--cfgpath=/etc/datadog-agent/datadog.yaml"] + resources: {} + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + - name: DD_CLUSTER_AGENT_ENABLED + value: "true" + - name: DD_CLUSTER_AGENT_KUBERNETES_SERVICE_NAME + value: datadog-cluster-agent + - name: DD_CLUSTER_AGENT_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: datadog-cluster-agent + key: token + - name: DD_PROCESS_AGENT_ENABLED + value: "true" + - name: DD_PROCESS_AGENT_DISCOVERY_ENABLED + value: "false" + - name: DD_LOG_LEVEL + value: "INFO" + - name: DD_SYSTEM_PROBE_ENABLED + value: "true" + - name: DD_SYSTEM_PROBE_NETWORK_ENABLED + value: "true" + - name: DD_DOGSTATSD_SOCKET + value: "/var/run/datadog/dsd.socket" + - name: DD_ORCHESTRATOR_EXPLORER_ENABLED + value: "true" + volumeMounts: + - name: config + mountPath: /etc/datadog-agent + - name: logdatadog + mountPath: /var/log/datadog + - name: tmpdir + mountPath: /tmp + readOnly: false + - name: os-release-file + mountPath: /host/etc/os-release + mountPropagation: None + readOnly: true + - name: runtimesocketdir + mountPath: /host/var/run + mountPropagation: None + readOnly: true + - name: cgroups + mountPath: /host/sys/fs/cgroup + mountPropagation: None + readOnly: true + - name: passwd + mountPath: /etc/passwd + readOnly: true + - name: procdir + mountPath: /host/proc + mountPropagation: None + readOnly: true + - name: dsdsocket + mountPath: /var/run/datadog + readOnly: true + - name: sysprobe-socket-dir + mountPath: /var/run/sysprobe + readOnly: true + - name: sysprobe-config + mountPath: /etc/datadog-agent/system-probe.yaml + subPath: system-probe.yaml + - name: system-probe + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + securityContext: + capabilities: + add: + - SYS_ADMIN + - SYS_RESOURCE + - SYS_PTRACE + - NET_ADMIN + - NET_BROADCAST + - NET_RAW + - IPC_LOCK + - CHOWN + privileged: false + seccompProfile: + type: Localhost + localhostProfile: system-probe + command: 
["/opt/datadog-agent/embedded/bin/system-probe", "--config=/etc/datadog-agent/system-probe.yaml"] + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + - name: DD_LOG_LEVEL + value: "INFO" + resources: {} + volumeMounts: + - name: logdatadog + mountPath: /var/log/datadog + - name: tmpdir + mountPath: /tmp + readOnly: false + - name: debugfs + mountPath: /sys/kernel/debug + mountPropagation: None + - name: config + mountPath: /etc/datadog-agent + - name: sysprobe-config + mountPath: /etc/datadog-agent/system-probe.yaml + subPath: system-probe.yaml + - name: sysprobe-socket-dir + mountPath: /var/run/sysprobe + - name: procdir + mountPath: /host/proc + mountPropagation: None + readOnly: true + - name: os-release-file + mountPath: /host/etc/os-release + mountPropagation: None + readOnly: true + - name: etc-redhat-release + mountPath: /host/etc/redhat-release + mountPropagation: None + readOnly: true + - name: etc-fedora-release + mountPath: /host/etc/fedora-release + mountPropagation: None + readOnly: true + - name: etc-lsb-release + mountPath: /host/etc/lsb-release + mountPropagation: None + readOnly: true + - name: security-agent + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + securityContext: + capabilities: + add: ["AUDIT_CONTROL", "AUDIT_READ"] + command: ["security-agent", "start", "-c=/etc/datadog-agent/datadog.yaml"] + resources: {} + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + - name: DD_LOG_LEVEL + value: "INFO" + - name: DD_COMPLIANCE_CONFIG_ENABLED + value: "true" + - name: DD_COMPLIANCE_CONFIG_CHECK_INTERVAL + value: "20m" + - name: HOST_ROOT + value: /host/root + - name: DD_CLUSTER_AGENT_ENABLED + value: "true" + - name: DD_CLUSTER_AGENT_KUBERNETES_SERVICE_NAME + value: datadog-cluster-agent + - name: DD_CLUSTER_AGENT_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: datadog-cluster-agent + key: token + - name: DD_RUNTIME_SECURITY_CONFIG_ENABLED + value: "true" + - name: DD_RUNTIME_SECURITY_CONFIG_POLICIES_DIR + value: "/etc/datadog-agent/runtime-security.d" + - name: DD_RUNTIME_SECURITY_CONFIG_SOCKET + value: /var/run/sysprobe/runtime-security.sock + - name: DD_DOGSTATSD_SOCKET + value: "/var/run/datadog/dsd.socket" + volumeMounts: + - name: config + mountPath: /etc/datadog-agent + - name: logdatadog + mountPath: /var/log/datadog + - name: tmpdir + mountPath: /tmp + readOnly: false + - name: dsdsocket + mountPath: /var/run/datadog + readOnly: true + - name: os-release-file + mountPath: /host/etc/os-release + mountPropagation: None + readOnly: true + - name: runtimesocketdir + mountPath: /host/var/run + mountPropagation: None + readOnly: true + - name: cgroups + mountPath: /host/sys/fs/cgroup + readOnly: true + - name: passwd + mountPath: /etc/passwd + readOnly: true + - name: group + mountPath: /etc/group + readOnly: true + - name: hostroot + mountPath: /host/root + readOnly: true + - name: procdir + mountPath: /host/proc + readOnly: true + - name: sysprobe-socket-dir + mountPath: /var/run/sysprobe + readOnly: true + 
- name: sysprobe-config + mountPath: /etc/datadog-agent/system-probe.yaml + subPath: system-probe.yaml + initContainers: + - name: init-volume + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + command: ["bash", "-c"] + args: + - cp -r /etc/datadog-agent /opt + volumeMounts: + - name: config + mountPath: /opt/datadog-agent + resources: {} + - name: init-config + image: "gcr.io/datadoghq/agent:7.39.1" + imagePullPolicy: IfNotPresent + command: ["bash", "-c"] + args: + - for script in $(find /etc/cont-init.d/ -type f -name '*.sh' | sort) ; do bash $script ; done + volumeMounts: + - name: logdatadog + mountPath: /var/log/datadog + - name: config + mountPath: /etc/datadog-agent + - name: procdir + mountPath: /host/proc + mountPropagation: None + readOnly: true + - name: runtimesocketdir + mountPath: /host/var/run + mountPropagation: None + readOnly: true + - name: sysprobe-config + mountPath: /etc/datadog-agent/system-probe.yaml + subPath: system-probe.yaml + env: + # Needs to be removed when Agent N-2 is built with Golang 1.17 + - name: GODEBUG + value: x509ignoreCN=0 + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + - name: DD_KUBERNETES_KUBELET_HOST + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: KUBERNETES + value: "yes" + resources: {} + - name: seccomp-setup + image: "gcr.io/datadoghq/agent:7.39.1" + command: + - cp + - /etc/config/system-probe-seccomp.json + - /host/var/lib/kubelet/seccomp/system-probe + volumeMounts: + - name: datadog-agent-security + mountPath: /etc/config + - name: seccomp-root + mountPath: /host/var/lib/kubelet/seccomp + mountPropagation: None + resources: {} + volumes: + - name: installinfo + configMap: + name: datadog-installinfo + - name: config + emptyDir: {} + - name: logdatadog + emptyDir: {} + - name: tmpdir + emptyDir: {} + - hostPath: + path: /proc + name: procdir + - hostPath: + path: /sys/fs/cgroup + name: cgroups + - hostPath: + path: /etc/os-release + name: os-release-file + - hostPath: + path: /etc/redhat-release + name: etc-redhat-release + - hostPath: + path: /etc/fedora-release + name: etc-fedora-release + - hostPath: + path: /etc/lsb-release + name: etc-lsb-release + - hostPath: + path: /var/run/datadog/ + type: DirectoryOrCreate + name: dsdsocket + - hostPath: + path: /var/run/datadog/ + type: DirectoryOrCreate + name: apmsocket + - name: s6-run + emptyDir: {} + - name: sysprobe-config + configMap: + name: datadog-system-probe-config + - name: datadog-agent-security + configMap: + name: datadog-security + - hostPath: + path: /var/lib/kubelet/seccomp + name: seccomp-root + - hostPath: + path: /sys/kernel/debug + name: debugfs + - name: sysprobe-socket-dir + emptyDir: {} + - hostPath: + path: /etc/passwd + name: passwd + - hostPath: + path: / + name: hostroot + - hostPath: + path: /etc/group + name: group + - hostPath: + path: /var/lib/datadog-agent/logs + name: pointerdir + - hostPath: + path: /var/log/pods + name: logpodpath + - hostPath: + path: /var/log/containers + name: logscontainerspath + - hostPath: + path: /var/lib/docker/containers + name: logdockercontainerpath + - hostPath: + path: /var/run + name: runtimesocketdir + tolerations: + affinity: {} + serviceAccountName: "datadog-agent" + nodeSelector: + kubernetes.io/os: linux + updateStrategy: + rollingUpdate: + maxUnavailable: 10% + type: RollingUpdate +--- +# Source: datadog/templates/cluster-agent-deployment.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: datadog-cluster-agent + namespace: default + 
labels: {} +spec: + replicas: 1 + revisionHistoryLimit: 10 + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + type: RollingUpdate + selector: + matchLabels: + app: datadog-cluster-agent + template: + metadata: + labels: + app: datadog-cluster-agent + name: datadog-cluster-agent + annotations: {} + spec: + serviceAccountName: datadog-cluster-agent + containers: + - name: cluster-agent + image: "gcr.io/datadoghq/cluster-agent:7.39.1" + imagePullPolicy: IfNotPresent + resources: {} + ports: + - containerPort: 5005 + name: agentport + protocol: TCP + - containerPort: 5000 + name: agentmetrics + protocol: TCP + env: + - name: DD_HEALTH_PORT + value: "5556" + - name: DD_API_KEY + valueFrom: + secretKeyRef: + name: "datadog" + key: api-key + optional: true + - name: DD_ADMISSION_CONTROLLER_ENABLED + value: "true" + - name: DD_ADMISSION_CONTROLLER_MUTATE_UNLABELLED + value: "false" + - name: DD_ADMISSION_CONTROLLER_SERVICE_NAME + value: datadog-cluster-agent-admission-controller + - name: DD_ADMISSION_CONTROLLER_FAILURE_POLICY + value: "Ignore" + - name: DD_CLUSTER_CHECKS_ENABLED + value: "true" + - name: DD_EXTRA_CONFIG_PROVIDERS + value: "kube_endpoints kube_services" + - name: DD_EXTRA_LISTENERS + value: "kube_endpoints kube_services" + - name: DD_LOG_LEVEL + value: "INFO" + - name: DD_LEADER_ELECTION + value: "true" + - name: DD_LEADER_LEASE_NAME + value: datadog-leader-election + - name: DD_CLUSTER_AGENT_TOKEN_NAME + value: datadogtoken + - name: DD_COLLECT_KUBERNETES_EVENTS + value: "true" + - name: DD_CLUSTER_AGENT_KUBERNETES_SERVICE_NAME + value: datadog-cluster-agent + - name: DD_CLUSTER_AGENT_AUTH_TOKEN + valueFrom: + secretKeyRef: + name: datadog-cluster-agent + key: token + - name: DD_KUBE_RESOURCES_NAMESPACE + value: default + - name: DD_ORCHESTRATOR_EXPLORER_ENABLED + value: "true" + - name: DD_ORCHESTRATOR_EXPLORER_CONTAINER_SCRUBBING_ENABLED + value: "true" + - name: DD_COMPLIANCE_CONFIG_ENABLED + value: "true" + - name: DD_COMPLIANCE_CONFIG_CHECK_INTERVAL + value: "20m" + livenessProbe: + failureThreshold: 6 + httpGet: + path: /live + port: 5556 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 5 + readinessProbe: + failureThreshold: 6 + httpGet: + path: /ready + port: 5556 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 15 + successThreshold: 1 + timeoutSeconds: 5 + volumeMounts: + - name: installinfo + subPath: install_info + mountPath: /etc/datadog-agent/install_info + readOnly: true + - name: confd + mountPath: /conf.d + readOnly: true + volumes: + - name: installinfo + configMap: + name: datadog-installinfo + - name: confd + configMap: + name: datadog-cluster-agent-confd + items: + - key: kubernetes_state_core.yaml.default + path: kubernetes_state_core.yaml.default + affinity: + # Prefer scheduling the cluster agents on different nodes + # to guarantee that the standby instance can immediately take the lead from a leader running of a faulty node. 
+ podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + podAffinityTerm: + labelSelector: + matchLabels: + app: datadog-cluster-agent + topologyKey: kubernetes.io/hostname + nodeSelector: + kubernetes.io/os: linux From 4a692d9f7716eba34f067d68a8f723209877dc04 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 12:55:51 -0500 Subject: [PATCH 06/10] remove old comments --- dogfood/client/dogfood_client.go | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index df841d616..1c5411481 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -89,38 +89,6 @@ func printAndLog(logLine string) { } f.Close() }() - - // write and read this file to help with testing disk disruptions - - //writeSize := len(logLineBytes) - //var err error - //f, err := os.OpenFile("/mnt/data/logging", os.O_APPEND|os.O_WRONLY|os.O_CREATE|os.O_SYNC, 0600) - //if err != nil { - // panic(err) - //} - // - //defer f.Close() - //// the os.WriteFile will reset the file as to not fill up disk space - //// the follow WriteString Operations will append 10 lines to the file so to increase read operations that follow - //// the writes - //err = os.WriteFile("/mnt/data/logging", logLineBytes, 0644) - //if err != nil { - // fmt.Errorf("could not write to logging file: %w", err) - //} - //for i := 0; i < 10; i++ { - // if _, err = f.WriteString(logLine + "\n"); err != nil { - // fmt.Errorf("could not write to logging file: %w", err) - // } else { - // writeSize += writeSize - // } - // - //} - // - //test := make([]byte, writeSize) - //_, err = f.Read(test) - //if err != nil { - // fmt.Errorf("could not read the logging file: %w", err) - //} } // regularly order food for different animals From 230f316fb1db1ec45867dce9af1459b7ec1fdb58 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 14:14:58 -0500 Subject: [PATCH 07/10] fixing linter and license ci issues --- dogfood/client/chart/templates/volumeclaim.yaml | 2 +- dogfood/client/dogfood_client.go | 12 ++++++++++-- dogfood/datadog-agent-all-features.yaml | 4 ++++ dogfood/tester/disruptions.go | 5 +++++ dogfood/tester/dogfood_tester.go | 5 +++-- 5 files changed, 23 insertions(+), 5 deletions(-) diff --git a/dogfood/client/chart/templates/volumeclaim.yaml b/dogfood/client/chart/templates/volumeclaim.yaml index 31351d003..77341e8c8 100644 --- a/dogfood/client/chart/templates/volumeclaim.yaml +++ b/dogfood/client/chart/templates/volumeclaim.yaml @@ -1,7 +1,7 @@ # Unless explicitly stated otherwise all files in this repository are licensed # under the Apache License Version 2.0. # This product includes software developed at Datadog (https://www.datadoghq.com/). -# Copyright 2021 Datadog, Inc. +# Copyright 2022 Datadog, Inc. 
apiVersion: v1 kind: PersistentVolumeClaim diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index 1c5411481..52974801a 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -66,12 +66,16 @@ func printAndLog(logLine string) { if err != nil { log.Fatal(err) } + logLineBytes := make([]byte, 500000) + _, err = f.Read(logLineBytes) if err != nil { log.Fatal(err) } - f.Close() + if err := f.Close(); err != nil { + log.Fatal(err) + } if _, err := os.Stat("/mnt/data/logging"); errors.Is(err, os.ErrNotExist) { f, err = os.Create("/mnt/data/logging") if err != nil { @@ -83,11 +87,14 @@ func printAndLog(logLine string) { log.Fatal(err) } } + _, err = f.Write(logLineBytes) if err != nil { log.Fatal(err) } - f.Close() + if err = f.Close(); err != nil { + log.Fatal(err) + } }() } @@ -159,6 +166,7 @@ func main() { // generate and use client client := pb.NewChaosDogfoodClient(conn) + printAndLog("We successfully generated the client, getting ready to send requests") sendsLotsOfRequests(client) diff --git a/dogfood/datadog-agent-all-features.yaml b/dogfood/datadog-agent-all-features.yaml index 95d852c77..73027287d 100644 --- a/dogfood/datadog-agent-all-features.yaml +++ b/dogfood/datadog-agent-all-features.yaml @@ -1,3 +1,7 @@ +# Unless explicitly stated otherwise all files in this repository are licensed +# under the Apache License Version 2.0. +# This product includes software developed at Datadog (https://www.datadoghq.com/). +# Copyright 2022 Datadog, Inc. --- # Source: datadog/templates/cluster-agent-rbac.yaml apiVersion: v1 diff --git a/dogfood/tester/disruptions.go b/dogfood/tester/disruptions.go index cdcccf056..be465107f 100644 --- a/dogfood/tester/disruptions.go +++ b/dogfood/tester/disruptions.go @@ -1,3 +1,8 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. +// This product includes software developed at Datadog (https://www.datadoghq.com/). +// Copyright 2022 Datadog, Inc. 
+ package main import ( diff --git a/dogfood/tester/dogfood_tester.go b/dogfood/tester/dogfood_tester.go index ff8fad0b3..258b2e058 100644 --- a/dogfood/tester/dogfood_tester.go +++ b/dogfood/tester/dogfood_tester.go @@ -7,10 +7,11 @@ package main import ( "encoding/json" - "github.com/DataDog/chaos-controller/api/v1beta1" - "go.uber.org/zap" "net/http" "time" + + "github.com/DataDog/chaos-controller/api/v1beta1" + "go.uber.org/zap" ) var VERSION string From fbe198e853c93f5b1084f9374d06d314e14bf4d3 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 14:33:02 -0500 Subject: [PATCH 08/10] fixing linting issues --- dogfood/client/dogfood_client.go | 11 ++++++++--- dogfood/tester/dogfood_tester.go | 17 +++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index 52974801a..80c1c02db 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -73,10 +73,12 @@ func printAndLog(logLine string) { if err != nil { log.Fatal(err) } - if err := f.Close(); err != nil { + + if err = f.Close(); err != nil { log.Fatal(err) } - if _, err := os.Stat("/mnt/data/logging"); errors.Is(err, os.ErrNotExist) { + + if _, err = os.Stat("/mnt/data/logging"); errors.Is(err, os.ErrNotExist) { f, err = os.Create("/mnt/data/logging") if err != nil { log.Fatal(err) @@ -92,6 +94,7 @@ func printAndLog(logLine string) { if err != nil { log.Fatal(err) } + if err = f.Close(); err != nil { log.Fatal(err) } @@ -149,7 +152,9 @@ func stringifyCatalogItems(items []*pb.CatalogItem) string { func main() { // create and eventually close connection printAndLog(fmt.Sprintf("connecting to %v...\n", serverAddr)) + var opts []grpc.DialOption + opts = append(opts, grpc.WithInsecure()) opts = append(opts, grpc.WithBlock()) @@ -166,7 +171,7 @@ func main() { // generate and use client client := pb.NewChaosDogfoodClient(conn) - + printAndLog("We successfully generated the client, getting ready to send requests") sendsLotsOfRequests(client) diff --git a/dogfood/tester/dogfood_tester.go b/dogfood/tester/dogfood_tester.go index 258b2e058..98d860d86 100644 --- a/dogfood/tester/dogfood_tester.go +++ b/dogfood/tester/dogfood_tester.go @@ -51,22 +51,19 @@ func handleRequests() { } func main() { - //TODO + // TODO //1. Wait for a request to run a test //4. Find out what relevant metrics would be (if only testing CPU, only CPU metrics matter) - //5. Get relevant metrics from datadog for the past 3 minutes - //6. Deploy the version to test - //7. Once the testing version is deployed and read, depending on the request, create individual disruptions - //8. For each disruption, let it bake for 3 minutes - //9. After baking, grab the last 3 minutes of data to compare to the stable 3 minutes of data - //10. If data looks rights, pass the test and move on to the next disruption and repeat starting from 8 until all + //5. Get relevant metrics from datadog for the past 3 minutes, stable metrics + //6. Depending on the request, create individual disruptions + //7. For each disruption, let it bake for 3 minutes + //8. After baking, grab the last 3 minutes of data to compare to the stable 3 minutes of data + //9. If data looks rights, pass the test and move on to the next disruption and repeat starting from 8 until all // disruptions are completed for the given request //11. For each disruption removal, wait 2 minutes to make sure the state of world goes back to stable values measured // in the beginning - //12. 
Once the entire request is finished, do 1 of 2 things: - // a. If queue is empty, return the chaos controller in the staging cluster back to latest:stable - // b. If queue is not empty, take the next request STATUS = "initializing" logger = &zap.SugaredLogger{} + handleRequests() } From 2441e91f59737dfc170861eef5551cafe7e80348 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Thu, 10 Nov 2022 14:59:03 -0500 Subject: [PATCH 09/10] fixing linting issues --- dogfood/client/dogfood_client.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dogfood/client/dogfood_client.go b/dogfood/client/dogfood_client.go index 80c1c02db..2e8f5decc 100644 --- a/dogfood/client/dogfood_client.go +++ b/dogfood/client/dogfood_client.go @@ -154,7 +154,7 @@ func main() { printAndLog(fmt.Sprintf("connecting to %v...\n", serverAddr)) var opts []grpc.DialOption - + opts = append(opts, grpc.WithInsecure()) opts = append(opts, grpc.WithBlock()) From f6654de4f9cc8a653af9be786f9166946c0e9650 Mon Sep 17 00:00:00 2001 From: Sam Azouzi Date: Tue, 15 Nov 2022 11:08:37 -0500 Subject: [PATCH 10/10] use ubuntu:jammy, use var/consts where applicable, update readme --- dogfood/README.md | 6 +- dogfood/client/Dockerfile | 2 +- dogfood/server/Dockerfile | 2 +- dogfood/tester/Dockerfile | 2 +- dogfood/tester/disruptions.go | 132 +++++++++++++++------------------- 5 files changed, 65 insertions(+), 79 deletions(-) diff --git a/dogfood/README.md b/dogfood/README.md index 2246a1601..fef5b097e 100644 --- a/dogfood/README.md +++ b/dogfood/README.md @@ -1,4 +1,4 @@ - # Installing protoc +# Installing protoc Run `brew install protobuf` or `make install-protobuf` @@ -107,8 +107,8 @@ For gRPC disruption, you can follow these [detailed steps](../docs/grpc_disrupti ### Sending Metrics to Datadog For the purposes of testing disruptions/workflows, you should make sure that the datadog agent is properly installed -on the cluster that the client and server are running on. 3 of the major disruptive resources properly send metrics -to Datadog (CPU, Network, Disk). The client contains computation related to these disruptions and can be tested using +on the cluster that the client and server are running on. The Datadog Agent should be posting metrics related to CPU, +Network, and Disk which are all necessary to test the related disruptions. The client contains computation related to these disruptions and can be tested using the disruptions mentioned. 
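The bake-and-compare flow that the tester TODO describes earlier in this series (measure a stable window, apply one disruption, let it bake for 3 minutes, compare against the baseline, remove it, then wait 2 minutes for recovery) looks roughly like the sketch below. None of this is code from the repository: `fetchMetricFn`, `apply`, and `remove` are assumed placeholders for the eventual Datadog query and disruption create/delete helpers, and the example query string is only one plausible metric for the `app: chaos-dogfood-client` pods.

```go
package main

import (
	"fmt"
	"time"
)

// fetchMetricFn is an assumed helper that returns the average value of one
// relevant Datadog metric (CPU, network or disk) over the given window.
type fetchMetricFn func(query string, window time.Duration) (float64, error)

// bakeAndCompare runs a single disruption through the flow sketched in the
// tester TODO: baseline window -> apply -> bake -> compare -> remove -> recovery wait.
func bakeAndCompare(fetch fetchMetricFn, query string, apply, remove func() error) error {
	const (
		bakeWindow   = 3 * time.Minute // stable and disrupted measurement windows
		recoveryWait = 2 * time.Minute // time allowed to return to stable values
	)

	baseline, err := fetch(query, bakeWindow)
	if err != nil {
		return fmt.Errorf("fetching baseline metrics: %w", err)
	}

	if err := apply(); err != nil {
		return fmt.Errorf("applying disruption: %w", err)
	}

	time.Sleep(bakeWindow) // let the disruption bake

	disrupted, err := fetch(query, bakeWindow)
	if err != nil {
		return fmt.Errorf("fetching disrupted metrics: %w", err)
	}

	if err := remove(); err != nil {
		return fmt.Errorf("removing disruption: %w", err)
	}

	time.Sleep(recoveryWait) // give the workload time to settle before the next disruption

	// The real pass/fail logic would be metric-specific; a visible move away
	// from the baseline is used here only as a stand-in.
	if disrupted == baseline {
		return fmt.Errorf("metric %q did not move during the disruption", query)
	}

	return nil
}

func main() {
	// Stub wiring so the sketch compiles; a real run would query the Datadog API,
	// create and delete Disruption resources, and take several minutes per check.
	fetch := func(query string, window time.Duration) (float64, error) { return 1.0, nil }
	noop := func() error { return nil }

	if err := bakeAndCompare(fetch, "avg:system.cpu.user{app:chaos-dogfood-client}", noop, noop); err != nil {
		fmt.Println("disruption check failed:", err)
	}
}
```

The 3 minute bake and 2 minute recovery values mirror the windows called out in the tester TODO; the queries, pass criteria, and the way disruptions are created and deleted are deliberately left open.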
### Clean up diff --git a/dogfood/client/Dockerfile b/dogfood/client/Dockerfile index ed36a32f3..3ad815f5a 100644 --- a/dogfood/client/Dockerfile +++ b/dogfood/client/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:focal as client +FROM ubuntu:jammy as client COPY built_go_client /usr/local/bin/dogfood_client diff --git a/dogfood/server/Dockerfile b/dogfood/server/Dockerfile index 0ed91258b..b0900ca46 100644 --- a/dogfood/server/Dockerfile +++ b/dogfood/server/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:focal as client +FROM ubuntu:jammy as client COPY built_go_server /usr/local/bin/dogfood_server diff --git a/dogfood/tester/Dockerfile b/dogfood/tester/Dockerfile index 206980fb3..516443c1e 100644 --- a/dogfood/tester/Dockerfile +++ b/dogfood/tester/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:focal as client +FROM ubuntu:jammy as client COPY built_go_client /usr/local/bin/dogfood_tester diff --git a/dogfood/tester/disruptions.go b/dogfood/tester/disruptions.go index be465107f..2ff179cb9 100644 --- a/dogfood/tester/disruptions.go +++ b/dogfood/tester/disruptions.go @@ -6,6 +6,7 @@ package main import ( + "fmt" "github.com/DataDog/chaos-controller/api/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" @@ -14,30 +15,39 @@ import ( // Globals var SELECTOR = []string{"app", "chaos-dogfood-client"} -var CONTAINER = "client-deploy" + +const CONTAINER = "client-deploy" +const NAMESPACE = "chaos-engineering" +const NAME_PREFIX = "e2etest-" +const DURATION v1beta1.DisruptionDuration = "3m" +const DISK_PRESSURE_PATH = "/mnt/data" + +var COUNT = &intstr.IntOrString{Type: intstr.Int, IntVal: 1} +var UNSAFEMODE = &v1beta1.UnsafemodeSpec{ + DisableAll: true, +} +var NETWORK_HOST_SPEC = []v1beta1.NetworkDisruptionHostSpec{ + { + Host: fmt.Sprintf("chaos-dogfood-server.%s.svc.cluster.local", NAMESPACE), + Port: 50051, + Protocol: "tcp", + }, +} // Network Disruptions var network1 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-network1", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "network-drop30"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, Network: &v1beta1.NetworkDisruptionSpec{ - Hosts: []v1beta1.NetworkDisruptionHostSpec{ - { - Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", - Port: 50051, - Protocol: "tcp", - }, - }, + Hosts: NETWORK_HOST_SPEC, Drop: 30, Corrupt: 0, Delay: 0, @@ -48,25 +58,17 @@ var network1 = v1beta1.Disruption{ var network2 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-network2", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "network-drop70"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, Network: &v1beta1.NetworkDisruptionSpec{ - Hosts: []v1beta1.NetworkDisruptionHostSpec{ - { - Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", - Port: 50051, - Protocol: "tcp", - }, - }, + Hosts: NETWORK_HOST_SPEC, Drop: 70, Corrupt: 0, Delay: 0, @@ -77,25 +79,17 @@ 
var network2 = v1beta1.Disruption{ var network3 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-network3", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "network-delay1000"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, Network: &v1beta1.NetworkDisruptionSpec{ - Hosts: []v1beta1.NetworkDisruptionHostSpec{ - { - Host: "chaos-dogfood-server.chaos-demo.svc.cluster.local", - Port: 50051, - Protocol: "tcp", - }, - }, + Hosts: NETWORK_HOST_SPEC, Drop: 0, Corrupt: 0, Delay: 1000, @@ -111,19 +105,17 @@ var diskReadsThresholds = []int{1024, 2048, 4098} var disk1 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-disk1", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "disk-read1024"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, DiskPressure: &v1beta1.DiskPressureSpec{ - Path: "/mnt/data", + Path: DISK_PRESSURE_PATH, Throttling: v1beta1.DiskPressureThrottlingSpec{ ReadBytesPerSec: &diskReadsThresholds[0], }, @@ -133,19 +125,17 @@ var disk1 = v1beta1.Disruption{ var disk2 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-disk2", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "disk-write2048"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, DiskPressure: &v1beta1.DiskPressureSpec{ - Path: "/mnt/data", + Path: DISK_PRESSURE_PATH, Throttling: v1beta1.DiskPressureThrottlingSpec{ WriteBytesPerSec: &diskReadsThresholds[1], }, @@ -155,17 +145,15 @@ var disk2 = v1beta1.Disruption{ var disk3 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-disk3", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "disk-write4098"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, DiskPressure: &v1beta1.DiskPressureSpec{ Path: "/mnt/data", Throttling: v1beta1.DiskPressureThrottlingSpec{ @@ -181,17 +169,15 @@ var DISK_DISRUPTIONS = []v1beta1.Disruption{disk1, disk2, disk3} var cpu1 = v1beta1.Disruption{ ObjectMeta: metav1.ObjectMeta{ - Name: "e2etest-cpu1", - Namespace: "chaos-engineering", + Name: fmt.Sprint(NAME_PREFIX, "cpu-cores4"), + Namespace: NAMESPACE, }, Spec: v1beta1.DisruptionSpec{ - Count: &intstr.IntOrString{Type: intstr.Int, IntVal: 1}, - Unsafemode: &v1beta1.UnsafemodeSpec{ - DisableAll: true, - }, + Count: COUNT, + Unsafemode: UNSAFEMODE, Selector: 
map[string]string{SELECTOR[0]: SELECTOR[1]}, Containers: []string{CONTAINER}, - Duration: "3m", + Duration: DURATION, CPUPressure: &v1beta1.CPUPressureSpec{ Count: &intstr.IntOrString{IntVal: 4}, },