diff --git a/cedana-otel/README.md b/cedana-otel/README.md
new file mode 100644
index 0000000..121e242
--- /dev/null
+++ b/cedana-otel/README.md
@@ -0,0 +1,29 @@
+## OpenTelemetry Collector chart values for Cedana
+
+This Helm chart installs the [OpenTelemetry Collector](https://github.com/open-telemetry/opentelemetry-collector)
+in a Kubernetes cluster.
+
+## Prerequisites
+
+- Kubernetes 1.24+
+- Helm 3.9+
+
+## Installing the Chart
+
+Add the OpenTelemetry Helm repository:
+
+```console
+helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts
+```
+
+To install the chart with the release name cedana-opentelemetry-collector, run the following command:
+
+```console
+helm install cedana-opentelemetry-collector open-telemetry/opentelemetry-collector --values values.yaml -n cedanacontroller-system
+```
+
+To upgrade the installed chart with new changes, run the following command:
+
+```console
+helm upgrade -i cedana-opentelemetry-collector open-telemetry/opentelemetry-collector --values values.yaml -n cedanacontroller-system
+```
diff --git a/cedana-otel/values.yaml b/cedana-otel/values.yaml
new file mode 100644
index 0000000..a78b892
--- /dev/null
+++ b/cedana-otel/values.yaml
@@ -0,0 +1,601 @@
+# Default values for opentelemetry-collector.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+nameOverride: "cedana-otel-contrib"
+fullnameOverride: ""
+
+# Valid values are "daemonset", "deployment", and "statefulset".
+mode: "daemonset"
+
+# Specify which namespace should be used to deploy the resources into
+namespaceOverride: ""
+
+# Handles basic configuration of components that
+# also require k8s modifications to work correctly.
+# .Values.config can be used to modify/add to a preset
+# component configuration, but CANNOT be used to remove
+# preset configuration. If you require removal of any
+# sections of a preset configuration, you cannot use
+# the preset. Instead, configure the component manually in
+# .Values.config and use the other fields supplied in the
+# values.yaml to configure k8s as necessary.
+presets:
+  # Configures the collector to collect logs.
+  # Adds the filelog receiver to the logs pipeline
+  # and adds the necessary volumes and volume mounts.
+  # Best used with mode = daemonset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#filelog-receiver for details on the receiver.
+  logsCollection:
+    enabled: false
+    includeCollectorLogs: false
+    # Enabling this writes checkpoints in the /var/lib/otelcol/ host directory.
+    # Note this changes the collector's user to root, so that it can write to the host directory.
+    storeCheckpoints: false
+    # The maximum byte size of the recombined field.
+    # Once the size exceeds the limit, all received entries of the source will be combined and flushed.
+    maxRecombineLogSize: 102400
+  # Configures the collector to collect host metrics.
+  # Adds the hostmetrics receiver to the metrics pipeline
+  # and adds the necessary volumes and volume mounts.
+  # Best used with mode = daemonset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#host-metrics-receiver for details on the receiver.
+  hostMetrics:
+    enabled: false
+  # Configures the Kubernetes Processor to add Kubernetes metadata.
+  # Adds the k8sattributes processor to all the pipelines
+  # and adds the necessary rules to the ClusterRole.
+  # Best used with mode = daemonset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-attributes-processor for details on the processor.
+  kubernetesAttributes:
+    enabled: false
+    # When enabled, the processor will extract all labels for an associated pod and add them as resource attributes.
+    # The label's exact name will be the key.
+    extractAllPodLabels: false
+    # When enabled, the processor will extract all annotations for an associated pod and add them as resource attributes.
+    # The annotation's exact name will be the key.
+    extractAllPodAnnotations: false
+  # Configures the collector to collect node, pod, and container metrics from the API server on a kubelet.
+  # Adds the kubeletstats receiver to the metrics pipeline
+  # and adds the necessary rules to the ClusterRole.
+  # Best used with mode = daemonset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubeletstats-receiver for details on the receiver.
+  kubeletMetrics:
+    enabled: false
+  # Configures the collector to collect kubernetes events.
+  # Adds the k8sobject receiver to the logs pipeline
+  # and collects kubernetes events by default.
+  # Best used with mode = deployment or statefulset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-objects-receiver for details on the receiver.
+  kubernetesEvents:
+    enabled: false
+  # Configures the Kubernetes Cluster Receiver to collect cluster-level metrics.
+  # Adds the k8s_cluster receiver to the metrics pipeline
+  # and adds the necessary rules to the ClusterRole.
+  # Best used with mode = deployment or statefulset.
+  # See https://opentelemetry.io/docs/kubernetes/collector/components/#kubernetes-cluster-receiver for details on the receiver.
+  clusterMetrics:
+    enabled: false
+
+configMap:
+  # Specifies whether a configMap should be created (true by default)
+  create: true
+  # Specifies an existing ConfigMap to be mounted to the pod
+  # The ConfigMap MUST include the collector configuration via a key named 'relay' or the collector will not start.
+  existingName: ""
+  # Specifies the relative path to a custom ConfigMap template file. This option SHOULD be used when bundling a custom
+  # ConfigMap template, as it enables pod restart via a template checksum annotation.
+  # existingPath: ""
+
+# Base collector configuration.
+# Supports templating. To escape existing instances of {{ }}, use {{` `}}.
+# For example, {{ .Chart.Name }} becomes {{` {{ .Chart.Name }} `}}.
+config:
+  receivers:
+    otlp:
+      protocols:
+        grpc:
+          endpoint: 0.0.0.0:7777
+    filelog/daemonlogs:
+      include:
+        - "/var/log/cedana-daemon.log"
+        - "/host/var/log/cedana-daemon.log"
+      start_at: end
+    filelog/gpulogs:
+      include:
+        - "/tmp/cedana-gpucontroller.log"
+        - "/host/tmp/cedana-gpucontroller.log"
+      start_at: end
+    prometheus:
+      config:
+        global:
+          scrape_interval: 60s
+        scrape_configs:
+          - job_name: otel-collector-binary
+            static_configs:
+              - targets:
+                # - localhost:8888
+  processors:
+    batch:
+      # faster batch for benchmarking
+      send_batch_size: 500
+      timeout: 1s
+    # Ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/processor/resourcedetectionprocessor/README.md
+    resourcedetection:
+      detectors: [env, system] # Before system detector, include ec2 for AWS, gcp for GCP and azure for Azure.
+      # Using the OTEL_RESOURCE_ATTRIBUTES env var, the env detector adds custom labels.
+ timeout: 2s + system: + hostname_sources: [os] # alternatively, use [dns,os] for setting FQDN as host.name and os as fallback + extensions: + health_check: {} + zpages: {} + exporters: + otlp: + endpoint: "ingest.us.signoz.cloud:443" + tls: + insecure: false + headers: + "signoz-access-token": "your-signoz-access-token" # We are working on making this an env variable + service: + telemetry: + metrics: + address: 0.0.0.0:8888 + extensions: [health_check, zpages] + pipelines: + metrics: + receivers: [otlp] + processors: [batch] + exporters: [otlp] + metrics/internal: + receivers: [prometheus] + processors: [batch] + exporters: [otlp] + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp] + logs: + receivers: [otlp, filelog/daemonlogs, filelog/gpulogs] + processors: [batch] + exporters: [otlp] + +# Components configured by presets will be injected in the same way they are for `config`. +alternateConfig: {} + +image: + # If you want to use the core image `otel/opentelemetry-collector`, you also need to change `command.name` value to `otelcol`. + repository: "otel/opentelemetry-collector-k8s" + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + # When digest is set to a non-empty value, images will be pulled by digest (regardless of tag value). + digest: "" +imagePullSecrets: [] + +# OpenTelemetry Collector executable +command: + name: "" + extraArgs: [] + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +clusterRole: + # Specifies whether a clusterRole should be created + # Some presets also trigger the creation of a cluster role and cluster role binding. + # If using one of those presets, this field is no-op. + create: false + # Annotations to add to the clusterRole + # Can be used in combination with presets that create a cluster role. + annotations: {} + # The name of the clusterRole to use. + # If not set a name is generated using the fullname template + # Can be used in combination with presets that create a cluster role. + name: "" + # A set of rules as documented here : https://kubernetes.io/docs/reference/access-authn-authz/rbac/ + # Can be used in combination with presets that create a cluster role to add additional rules. + rules: [] + # - apiGroups: + # - '' + # resources: + # - 'pods' + # - 'nodes' + # verbs: + # - 'get' + # - 'list' + # - 'watch' + + clusterRoleBinding: + # Annotations to add to the clusterRoleBinding + # Can be used in combination with presets that create a cluster role binding. + annotations: {} + # The name of the clusterRoleBinding to use. + # If not set a name is generated using the fullname template + # Can be used in combination with presets that create a cluster role binding. + name: "" + +podSecurityContext: {} +securityContext: {} + +nodeSelector: {} +tolerations: [] +affinity: {} +topologySpreadConstraints: [] + +# Allows for pod scheduler prioritisation +priorityClassName: "" + +extraEnvs: [] +extraEnvsFrom: [] +# This also supports template content, which will eventually be converted to yaml. +extraVolumes: [] + +# This also supports template content, which will eventually be converted to yaml. 
+extraVolumeMounts: []
+
+# Configuration for ports
+# nodePort is also allowed
+ports:
+  otlp:
+    enabled: true
+    containerPort: 4317
+    servicePort: 4317
+    hostPort: 4317
+    protocol: TCP
+    # nodePort: 30317
+    appProtocol: grpc
+  otlp-http:
+    enabled: true
+    containerPort: 4318
+    servicePort: 4318
+    hostPort: 4318
+    protocol: TCP
+  jaeger-compact:
+    enabled: true
+    containerPort: 6831
+    servicePort: 6831
+    hostPort: 6831
+    protocol: UDP
+  jaeger-thrift:
+    enabled: true
+    containerPort: 14268
+    servicePort: 14268
+    hostPort: 14268
+    protocol: TCP
+  jaeger-grpc:
+    enabled: true
+    containerPort: 14250
+    servicePort: 14250
+    hostPort: 14250
+    protocol: TCP
+  zipkin:
+    enabled: true
+    containerPort: 9411
+    servicePort: 9411
+    hostPort: 9411
+    protocol: TCP
+  metrics:
+    # The metrics port is disabled by default. However, you need to enable the port
+    # in order to use the ServiceMonitor (serviceMonitor.enabled) or PodMonitor (podMonitor.enabled).
+    enabled: false
+    containerPort: 8888
+    servicePort: 8888
+    protocol: TCP
+
+# When enabled, the chart will set the GOMEMLIMIT env var to 80% of the configured resources.limits.memory.
+# If no resources.limits.memory are defined then enabling does nothing.
+# It is HIGHLY recommended to enable this setting and set a value for resources.limits.memory.
+useGOMEMLIMIT: true
+
+# Resource limits & requests.
+# It is HIGHLY recommended to set resource limits.
+resources: {}
+# resources:
+#   limits:
+#     cpu: 250m
+#     memory: 512Mi
+
+podAnnotations: {}
+
+podLabels: {}
+
+# Common labels to add to all otel-collector resources. Evaluated as a template.
+additionalLabels: {}
+# app.kubernetes.io/part-of: my-app
+
+# Host networking requested for this pod. Use the host's network namespace.
+hostNetwork: true
+
+# Adding entries to Pod /etc/hosts with HostAliases
+# https://kubernetes.io/docs/tasks/network/customize-hosts-file-for-pods/
+hostAliases: []
+  # - ip: "1.2.3.4"
+  #   hostnames:
+  #     - "my.host.com"
+
+# Pod DNS policy: ClusterFirst, ClusterFirstWithHostNet, Default, or None
+dnsPolicy: ""
+
+# Custom DNS config. Required when DNS policy is None.
+dnsConfig: {}
+
+# only used with deployment mode
+replicaCount: 1
+
+# only used with deployment mode
+revisionHistoryLimit: 10
+
+annotations: {}
+
+# List of extra sidecars to add.
+# This also supports template content, which will eventually be converted to yaml.
+extraContainers: []
+# extraContainers:
+#   - name: test
+#     command:
+#       - cp
+#     args:
+#       - /bin/sleep
+#       - /test/sleep
+#     image: busybox:latest
+#     volumeMounts:
+#       - name: test
+#         mountPath: /test
+
+# List of init container specs, e.g. for copying a binary to be executed as a lifecycle hook.
+# This also supports template content, which will eventually be converted to yaml.
+# Another usage of init containers is e.g. initializing filesystem permissions to the OTLP Collector user `10001` in case you are using persistence and the volume is producing a permission denied error for the OTLP Collector container.
+initContainers: [] +# initContainers: +# - name: test +# image: busybox:latest +# command: +# - cp +# args: +# - /bin/sleep +# - /test/sleep +# volumeMounts: +# - name: test +# mountPath: /test +# - name: init-fs +# image: busybox:latest +# command: +# - sh +# - '-c' +# - 'chown -R 10001: /var/lib/storage/otc' # use the path given as per `extensions.file_storage.directory` & `extraVolumeMounts[x].mountPath` +# volumeMounts: +# - name: opentelemetry-collector-data # use the name of the volume used for persistence +# mountPath: /var/lib/storage/otc # use the path given as per `extensions.file_storage.directory` & `extraVolumeMounts[x].mountPath` + +# Pod lifecycle policies. +lifecycleHooks: {} +# lifecycleHooks: +# preStop: +# exec: +# command: +# - /test/sleep +# - "5" + +# liveness probe configuration +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +## +livenessProbe: + # Number of seconds after the container has started before startup, liveness or readiness probes are initiated. + # initialDelaySeconds: 1 + # How often in seconds to perform the probe. + # periodSeconds: 10 + # Number of seconds after which the probe times out. + # timeoutSeconds: 1 + # Minimum consecutive failures for the probe to be considered failed after having succeeded. + # failureThreshold: 1 + # Duration in seconds the pod needs to terminate gracefully upon probe failure. + # terminationGracePeriodSeconds: 10 + httpGet: + port: 13133 + path: / + +# readiness probe configuration +# Ref: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ +## +readinessProbe: + # Number of seconds after the container has started before startup, liveness or readiness probes are initiated. + # initialDelaySeconds: 1 + # How often (in seconds) to perform the probe. + # periodSeconds: 10 + # Number of seconds after which the probe times out. + # timeoutSeconds: 1 + # Minimum consecutive successes for the probe to be considered successful after having failed. + # successThreshold: 1 + # Minimum consecutive failures for the probe to be considered failed after having succeeded. + # failureThreshold: 1 + httpGet: + port: 13133 + path: / + +service: + # Enable the creation of a Service. + # By default, it's enabled on mode != daemonset. + # However, to enable it on mode = daemonset, its creation must be explicitly enabled + # enabled: true + + type: ClusterIP + # type: LoadBalancer + # loadBalancerIP: 1.2.3.4 + # loadBalancerSourceRanges: [] + + # By default, Service of type 'LoadBalancer' will be created setting 'externalTrafficPolicy: Cluster' + # unless other value is explicitly set. + # Possible values are Cluster or Local (https://kubernetes.io/docs/tasks/access-application-cluster/create-external-load-balancer/#preserving-the-client-source-ip) + # externalTrafficPolicy: Cluster + + annotations: {} + + # By default, Service will be created setting 'internalTrafficPolicy: Local' on mode = daemonset + # unless other value is explicitly set. 
+ # Setting 'internalTrafficPolicy: Cluster' on a daemonset is not recommended + # internalTrafficPolicy: Cluster + +ingress: + enabled: false + # annotations: {} + # ingressClassName: nginx + # hosts: + # - host: collector.example.com + # paths: + # - path: / + # pathType: Prefix + # port: 4318 + # tls: + # - secretName: collector-tls + # hosts: + # - collector.example.com + + # Additional ingresses - only created if ingress.enabled is true + # Useful for when differently annotated ingress services are required + # Each additional ingress needs key "name" set to something unique + additionalIngresses: [] + # - name: cloudwatch + # ingressClassName: nginx + # annotations: {} + # hosts: + # - host: collector.example.com + # paths: + # - path: / + # pathType: Prefix + # port: 4318 + # tls: + # - secretName: collector-tls + # hosts: + # - collector.example.com + +podMonitor: + # The pod monitor by default scrapes the metrics port. + # The metrics port needs to be enabled as well. + enabled: false + metricsEndpoints: + - port: metrics + # interval: 15s + + # additional labels for the PodMonitor + extraLabels: {} + # release: kube-prometheus-stack + +serviceMonitor: + # The service monitor by default scrapes the metrics port. + # The metrics port needs to be enabled as well. + enabled: false + metricsEndpoints: + - port: metrics + # interval: 15s + + # additional labels for the ServiceMonitor + extraLabels: {} + # release: kube-prometheus-stack + # Used to set relabeling and metricRelabeling configs on the ServiceMonitor + # https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config + relabelings: [] + metricRelabelings: [] + +# PodDisruptionBudget is used only if deployment enabled +podDisruptionBudget: + enabled: false +# minAvailable: 2 +# maxUnavailable: 1 + +# autoscaling is used only if mode is "deployment" or "statefulset" +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 10 + behavior: {} + targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +rollout: + rollingUpdate: {} + # When 'mode: daemonset', maxSurge cannot be used when hostPort is set for any of the ports + # maxSurge: 25% + # maxUnavailable: 0 + strategy: RollingUpdate + +prometheusRule: + enabled: false + groups: [] + # Create default rules for monitoring the collector + defaultRules: + enabled: false + + # additional labels for the PrometheusRule + extraLabels: {} + +statefulset: + # volumeClaimTemplates for a statefulset + volumeClaimTemplates: [] + podManagementPolicy: "Parallel" + # Controls if and how PVCs created by the StatefulSet are deleted. Available in Kubernetes 1.23+. + persistentVolumeClaimRetentionPolicy: + enabled: false + whenDeleted: Retain + whenScaled: Retain + +networkPolicy: + enabled: false + + # Annotations to add to the NetworkPolicy + annotations: {} + + # Configure the 'from' clause of the NetworkPolicy. + # By default this will restrict traffic to ports enabled for the Collector. 
If + # you wish to further restrict traffic to other hosts or specific namespaces, + # see the standard NetworkPolicy 'spec.ingress.from' definition for more info: + # https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/network-policy-v1/ + allowIngressFrom: [] + # # Allow traffic from any pod in any namespace, but not external hosts + # - namespaceSelector: {} + # # Allow external access from a specific cidr block + # - ipBlock: + # cidr: 192.168.1.64/32 + # # Allow access from pods in specific namespaces + # - namespaceSelector: + # matchExpressions: + # - key: kubernetes.io/metadata.name + # operator: In + # values: + # - "cats" + # - "dogs" + + # Add additional ingress rules to specific ports + # Useful to allow external hosts/services to access specific ports + # An example is allowing an external prometheus server to scrape metrics + # + # See the standard NetworkPolicy 'spec.ingress' definition for more info: + # https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/network-policy-v1/ + extraIngressRules: [] + # - ports: + # - port: metrics + # protocol: TCP + # from: + # - ipBlock: + # cidr: 192.168.1.64/32 + + # Restrict egress traffic from the OpenTelemetry collector pod + # See the standard NetworkPolicy 'spec.egress' definition for more info: + # https://kubernetes.io/docs/reference/kubernetes-api/policy-resources/network-policy-v1/ + egressRules: [] + # - to: + # - namespaceSelector: {} + # - ipBlock: + # cidr: 192.168.10.10/24 + # ports: + # - port: 1234 + # protocol: TCP + +# Allow containers to share processes across pod namespace +shareProcessNamespace: false
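The `config.exporters.otlp.headers` block in this values file hard-codes the SigNoz access token, and its comment notes the intent to move it into an environment variable. One way to sketch that today, assuming a Kubernetes Secret named `signoz-secret` with key `access-token` (both hypothetical names), is to inject the value through the chart's `extraEnvs` and reference it with the collector's `${env:VAR}` config substitution, which recent collector versions expand at load time:

```yaml
# Hypothetical Secret name and key; create them separately and adjust to your environment.
extraEnvs:
  - name: SIGNOZ_ACCESS_TOKEN
    valueFrom:
      secretKeyRef:
        name: signoz-secret
        key: access-token

config:
  exporters:
    otlp:
      headers:
        # Expanded from the env var above when the collector loads its config.
        "signoz-access-token": "${env:SIGNOZ_ACCESS_TOKEN}"
```

With an override like this merged into the values, the token no longer needs to live in values.yaml itself.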
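The `filelog/daemonlogs` and `filelog/gpulogs` receivers read `/host/var/log/...` and `/host/tmp/...` paths, which assumes the corresponding host directories are mounted into the collector pod. If nothing else provides those mounts, a sketch using the chart's `extraVolumes` and `extraVolumeMounts` (volume names here are illustrative) could look like:

```yaml
extraVolumes:
  - name: host-var-log
    hostPath:
      path: /var/log        # where cedana-daemon.log is written on the node
  - name: host-tmp
    hostPath:
      path: /tmp            # where cedana-gpucontroller.log is written on the node
extraVolumeMounts:
  - name: host-var-log
    mountPath: /host/var/log
    readOnly: true
  - name: host-tmp
    mountPath: /host/tmp
    readOnly: true
```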
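All of the Kubernetes presets in this values file ship disabled. Enabling them is a values override; as the preset comments above note, the chart then adds the required ClusterRole rules itself. A minimal sketch for daemonset mode, turning on pod metadata enrichment and kubelet metrics:

```yaml
presets:
  kubernetesAttributes:
    enabled: true
    # Optionally copy every pod label onto the telemetry as resource attributes.
    extractAllPodLabels: true
  kubeletMetrics:
    enabled: true
```

Remember that `.Values.config` can extend what a preset injects but cannot remove it; if a preset's generated configuration needs trimming, configure the component manually instead.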
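The collector's own metrics (exposed on 8888 via `service.telemetry.metrics`) are not scraped by default because the `metrics` port is disabled. If a Prometheus Operator is present, a sketch that enables the port and a PodMonitor (the PodMonitor scrapes pods directly, which suits daemonset mode) might be:

```yaml
ports:
  metrics:
    enabled: true
    containerPort: 8888
    servicePort: 8888
    protocol: TCP

podMonitor:
  enabled: true
  extraLabels:
    # Hypothetical label; match whatever selector your Prometheus Operator release watches.
    release: kube-prometheus-stack
```

This assumes the PodMonitor CRD is installed in the cluster; without it, the chart's podMonitor object cannot be created.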
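Because the collector runs as a daemonset with `hostNetwork: true`, the OTLP gRPC receiver bound to `0.0.0.0:7777` in this config should be reachable on each node's IP. A sketch of how a workload on the same node might point an OTLP/gRPC SDK exporter at it, using the downward API (the container/env fragment below is illustrative, not part of this chart):

```yaml
# env section of the workload's container spec
env:
  - name: NODE_IP
    valueFrom:
      fieldRef:
        fieldPath: status.hostIP   # IP of the node the pod is scheduled on
  - name: OTEL_EXPORTER_OTLP_ENDPOINT
    value: "http://$(NODE_IP):7777"  # matches the grpc endpoint configured above
```

Note that the `ports.otlp` entry exposes 4317 while the receiver in `config` listens on 7777; with host networking the receiver's own bind port is what matters, so verify which port your environment actually uses.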