diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl
index 5db3682c2..cc4f058c5 100644
--- a/helm/templates/_helpers.tpl
+++ b/helm/templates/_helpers.tpl
@@ -12,6 +12,13 @@ Name for cloudwatch-agent
 {{- default "cloudwatch-agent" .Values.agent.name }}
 {{- end }}
 
+{{/*
+Name for dcgm-exporter
+*/}}
+{{- define "dcgm-exporter.name" -}}
+{{- default "dcgm-exporter" .Values.dcgmExporter.name }}
+{{- end }}
+
 {{/*
 Get the current recommended cloudwatch agent image for a region
 */}}
@@ -48,6 +55,22 @@ Get the current recommended fluent-bit image for a region
 {{- printf "%s/%s:%s" $imageDomain .Values.containerLogs.fluentBit.image.repository .Values.containerLogs.fluentBit.image.tag -}}
 {{- end -}}
 
+{{/*
+Get the dcgm-exporter image reference.
+Prefixes the regional (or "public") repositoryDomainMap entry when one is configured;
+otherwise uses repository:tag unchanged.
+*/}}
+{{- define "dcgm-exporter.image" -}}
+{{- $image := .Values.dcgmExporter.image -}}
+{{- $domainMap := get $image "repositoryDomainMap" | default dict -}}
+{{- $imageDomain := get $domainMap (default "" .Values.region) | default (get $domainMap "public") -}}
+{{- if $imageDomain -}}
+{{- printf "%s/%s:%s" $imageDomain $image.repository $image.tag -}}
+{{- else -}}
+{{- printf "%s:%s" $image.repository $image.tag -}}
+{{- end -}}
+{{- end -}}
+
 {{/*
 Common labels
 */}}
diff --git a/helm/templates/cloudwatch-agent-clusterrole.yaml b/helm/templates/cloudwatch-agent-clusterrole.yaml
index 5ff75d9c2..66ca69b42 100644
--- a/helm/templates/cloudwatch-agent-clusterrole.yaml
+++ b/helm/templates/cloudwatch-agent-clusterrole.yaml
@@ -7,11 +7,8 @@ metadata:
   name: {{ template "cloudwatch-agent.name" . }}-role
 rules:
 - apiGroups: [ "" ]
-  resources: [ "pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints" ]
+  resources: [ "pods", "pods/logs", "nodes", "nodes/proxy", "namespaces", "endpoints", "services" ]
   verbs: [ "list", "watch", "get" ]
-- apiGroups: [ "" ]
-  resources: [ "services" ]
-  verbs: [ "list", "watch" ]
 - apiGroups: [ "apps" ]
   resources: [ "replicasets", "daemonsets", "deployments", "statefulsets" ]
   verbs: [ "list", "watch", "get" ]
@@ -23,7 +20,11 @@ rules:
   verbs: [ "create", "get" ]
 - apiGroups: [ "" ]
   resources: [ "configmaps" ]
-  verbs: [ "update" ]
+  verbs: [ "update", "get" ]
+# Ingress is served from networking.k8s.io; "extensions" is kept for older clusters.
+- apiGroups: [ "extensions", "networking.k8s.io" ]
+  resources: [ "ingresses" ]
+  verbs: [ "list", "watch", "get" ]
 - nonResourceURLs: [ "/metrics" ]
   verbs: [ "get", "list", "watch" ]
 {{- end }}
diff --git a/helm/templates/dcgm-exporter-configmap.yaml b/helm/templates/dcgm-exporter-configmap.yaml
new file mode 100644
index 000000000..3357c92e3
--- /dev/null
+++ b/helm/templates/dcgm-exporter-configmap.yaml
@@ -0,0 +1,88 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: dcgm-exporter-config-map
+  namespace: {{ .Release.Namespace }}
+data:
+  dcp-metrics-included.csv: |
+    # Format
+    # If line starts with a '#' it is considered a comment
+    # DCGM FIELD, Prometheus metric type, help message
+
+    # Clocks
+    DCGM_FI_DEV_SM_CLOCK, gauge, SM clock frequency (in MHz).
+    DCGM_FI_DEV_MEM_CLOCK, gauge, Memory clock frequency (in MHz).
+
+    # Temperature
+    DCGM_FI_DEV_MEMORY_TEMP, gauge, Memory temperature (in C).
+    DCGM_FI_DEV_GPU_TEMP, gauge, GPU temperature (in C).
+
+    # Power
+    DCGM_FI_DEV_POWER_USAGE, gauge, Power draw (in W).
+    DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION, counter, Total energy consumption since boot (in mJ).
+
+    # PCIE
+    # DCGM_FI_DEV_PCIE_TX_THROUGHPUT, counter, Total number of bytes transmitted through PCIe TX (in KB) via NVML.
+    # DCGM_FI_DEV_PCIE_RX_THROUGHPUT, counter, Total number of bytes received through PCIe RX (in KB) via NVML.
+    DCGM_FI_DEV_PCIE_REPLAY_COUNTER, counter, Total number of PCIe retries.
+
+    # Utilization (the sample period varies depending on the product)
+    DCGM_FI_DEV_GPU_UTIL, gauge, GPU utilization (in %).
+    DCGM_FI_DEV_MEM_COPY_UTIL, gauge, Memory utilization (in %).
+    DCGM_FI_DEV_ENC_UTIL, gauge, Encoder utilization (in %).
+    DCGM_FI_DEV_DEC_UTIL , gauge, Decoder utilization (in %).
+
+    # Errors and violations
+    DCGM_FI_DEV_XID_ERRORS, gauge, Value of the last XID error encountered.
+    # DCGM_FI_DEV_POWER_VIOLATION, counter, Throttling duration due to power constraints (in us).
+    # DCGM_FI_DEV_THERMAL_VIOLATION, counter, Throttling duration due to thermal constraints (in us).
+    # DCGM_FI_DEV_SYNC_BOOST_VIOLATION, counter, Throttling duration due to sync-boost constraints (in us).
+    # DCGM_FI_DEV_BOARD_LIMIT_VIOLATION, counter, Throttling duration due to board limit constraints (in us).
+    # DCGM_FI_DEV_LOW_UTIL_VIOLATION, counter, Throttling duration due to low utilization (in us).
+    # DCGM_FI_DEV_RELIABILITY_VIOLATION, counter, Throttling duration due to reliability constraints (in us).
+
+    # Memory usage
+    DCGM_FI_DEV_FB_FREE, gauge, Framebuffer memory free (in MiB).
+    DCGM_FI_DEV_FB_USED, gauge, Framebuffer memory used (in MiB).
+
+    # ECC
+    # DCGM_FI_DEV_ECC_SBE_VOL_TOTAL, counter, Total number of single-bit volatile ECC errors.
+    # DCGM_FI_DEV_ECC_DBE_VOL_TOTAL, counter, Total number of double-bit volatile ECC errors.
+    # DCGM_FI_DEV_ECC_SBE_AGG_TOTAL, counter, Total number of single-bit persistent ECC errors.
+    # DCGM_FI_DEV_ECC_DBE_AGG_TOTAL, counter, Total number of double-bit persistent ECC errors.
+
+    # Retired pages
+    # DCGM_FI_DEV_RETIRED_SBE, counter, Total number of retired pages due to single-bit errors.
+    # DCGM_FI_DEV_RETIRED_DBE, counter, Total number of retired pages due to double-bit errors.
+    # DCGM_FI_DEV_RETIRED_PENDING, counter, Total number of pages pending retirement.
+
+    # NVLink
+    # DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL, counter, Total number of NVLink flow-control CRC errors.
+    # DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL, counter, Total number of NVLink data CRC errors.
+    # DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL, counter, Total number of NVLink retries.
+    # DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL, counter, Total number of NVLink recovery errors.
+    DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL, counter, Total number of NVLink bandwidth counters for all lanes.
+    # DCGM_FI_DEV_NVLINK_BANDWIDTH_L0, counter, The number of bytes of active NVLink rx or tx data including both header and payload.
+
+    # VGPU License status
+    DCGM_FI_DEV_VGPU_LICENSE_STATUS, gauge, vGPU License status
+
+    # Remapped rows
+    DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for uncorrectable errors
+    DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS, counter, Number of remapped rows for correctable errors
+    DCGM_FI_DEV_ROW_REMAP_FAILURE, gauge, Whether remapping of rows has failed
+
+    # DCP metrics
+    DCGM_FI_PROF_GR_ENGINE_ACTIVE, gauge, Ratio of time the graphics engine is active (in %).
+    # DCGM_FI_PROF_SM_ACTIVE, gauge, The ratio of cycles an SM has at least 1 warp assigned (in %).
+    # DCGM_FI_PROF_SM_OCCUPANCY, gauge, The ratio of number of warps resident on an SM (in %).
+    DCGM_FI_PROF_PIPE_TENSOR_ACTIVE, gauge, Ratio of cycles the tensor (HMMA) pipe is active (in %).
+    DCGM_FI_PROF_DRAM_ACTIVE, gauge, Ratio of cycles the device memory interface is active sending or receiving data (in %).
+    # DCGM_FI_PROF_PIPE_FP64_ACTIVE, gauge, Ratio of cycles the fp64 pipes are active (in %).
+    # DCGM_FI_PROF_PIPE_FP32_ACTIVE, gauge, Ratio of cycles the fp32 pipes are active (in %).
+    # DCGM_FI_PROF_PIPE_FP16_ACTIVE, gauge, Ratio of cycles the fp16 pipes are active (in %).
+    DCGM_FI_PROF_PCIE_TX_BYTES, counter, The number of bytes of active pcie tx data including both header and payload.
+    DCGM_FI_PROF_PCIE_RX_BYTES, counter, The number of bytes of active pcie rx data including both header and payload.
+  web-config.yaml: |
+    basic_auth_users:
+      cwagent: $2a$12$QMxvDp/Pfw7q4oaWzqyXxOgVEJMfiwrOk7Ezdf8SPquzYXhzr9NJi
diff --git a/helm/templates/dcgm-exporter-daemonset.yaml b/helm/templates/dcgm-exporter-daemonset.yaml
new file mode 100644
index 000000000..2faf551db
--- /dev/null
+++ b/helm/templates/dcgm-exporter-daemonset.yaml
@@ -0,0 +1,72 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: dcgm-exporter
+  namespace: {{ .Release.Namespace }}
+  labels:
+    k8s-app: dcgm-exporter
+    version: v1
+spec:
+  selector:
+    matchLabels:
+      k8s-app: dcgm-exporter
+  template:
+    metadata:
+      labels:
+        k8s-app: dcgm-exporter
+        version: v1
+    spec:
+      priorityClassName: system-node-critical
+      serviceAccountName: {{ template "cloudwatch-agent.serviceAccountName" . }}
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: {{ .Values.gpuNodeLabelKey }}
+                operator: In
+                values: {{ .Values.gpuInstances | toYaml | nindent 16 }}
+      containers:
+      - name: dcgm-exporter
+        securityContext:
+          privileged: true
+        image: {{ include "dcgm-exporter.image" . | quote }}
+        # Requests/limits are set on the container so they apply to dcgm-exporter itself.
+        resources:
+          requests:
+            cpu: 250m
+            memory: 128Mi
+          limits:
+            cpu: 500m
+            memory: 250Mi
+        args:
+        {{- range $.Values.dcgmExporter.arguments }}
+        - {{ . }}
+        {{- end }}
+        env:
+        - name: "DCGM_EXPORTER_KUBERNETES"
+          value: "true"
+        - name: "DCGM_EXPORTER_LISTEN"
+          value: "{{ .Values.dcgmExporter.service.address }}"
+        - name: "DCGM_EXPORTER_COLLECTORS"
+          value: "/etc/dcgm-exporter/dcp-metrics-included.csv"
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        ports:
+        - name: "metrics"
+          containerPort: {{ .Values.dcgmExporter.service.port }}
+        volumeMounts:
+        - name: "pod-gpu-resources"
+          readOnly: true
+          mountPath: "/var/lib/kubelet/pod-resources"
+        - name: "dcgm-config"
+          mountPath: /etc/dcgm-exporter/
+      volumes:
+      - name: "pod-gpu-resources"
+        hostPath:
+          path: /var/lib/kubelet/pod-resources
+      - name: "dcgm-config"
+        configMap:
+          name: dcgm-exporter-config-map
diff --git a/helm/templates/dcgm-exporter-service.yaml b/helm/templates/dcgm-exporter-service.yaml
new file mode 100644
index 000000000..bc1f8b5e4
--- /dev/null
+++ b/helm/templates/dcgm-exporter-service.yaml
@@ -0,0 +1,22 @@
+{{- if .Values.dcgmExporter.service.enable }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "dcgm-exporter.name" . }}-service
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "amazon-cloudwatch-observability.labels" . | nindent 4 }}
+    k8s-app: {{ include "dcgm-exporter.name" . }}-service
+  annotations:
+    prometheus.io/scrape: "true"
+spec:
+  type: {{ .Values.dcgmExporter.service.type }}
+  ports:
+  - name: "metrics"
+    port: {{ .Values.dcgmExporter.service.port }}
+    targetPort: {{ .Values.dcgmExporter.service.port }}
+    protocol: TCP
+  selector:
+    k8s-app: dcgm-exporter
+  internalTrafficPolicy: Local
+{{- end }}
diff --git a/helm/values.yaml b/helm/values.yaml
index 0ddcdb09c..22086815b 100644
--- a/helm/values.yaml
+++ b/helm/values.yaml
@@ -17,6 +17,10 @@ clusterName: EKS_CLUSTER_NAME
 ## Provide the Region (optional when installing via EKS add-on)
 region: AWS_REGION_NAME
 
+gpuNodeLabelKey: eks.amazonaws.com/nodegroup
+## NVIDIA GPU instance types
+gpuInstances: [ g3, g3s, g4dn, g5, g5g, p2, p3, p3dn, p4d, p5 ]
+
 containerLogs:
   enabled: true
   fluentBit:
@@ -150,4 +154,32 @@ agent:
           "app_signals": { }
         }
       }
-    }
\ No newline at end of file
+    }
+  enableGpuConfig:
+    {
+      "logs": {
+        "metrics_collected": {
+          "kubernetes": {
+            "gpu_metrics": true
+          },
+        }
+      }
+    }
+
+dcgmExporter:
+  name:
+  image:
+    repository: nvcr.io/nvidia/k8s/dcgm-exporter
+    tag: 3.3.3-3.3.1-ubuntu22.04
+  # arguments: ["--web-config-file=/etc/dcgm-exporter/web-config.yaml"]
+  service:
+    enable: true
+    type: ClusterIP
+    port: 9400
+    address: ":9400"
+  securityContext:
+    runAsNonRoot: false
+    runAsUser: 0
+    capabilities:
+      add: ["SYS_ADMIN"]
+  kubeletPath: "/var/lib/kubelet/pod-resources"