From f476b3a794fdf884e83c62f11e6309d5d779f9bf Mon Sep 17 00:00:00 2001 From: Amol Agrawal Date: Tue, 8 Aug 2023 14:43:47 -0700 Subject: [PATCH] set mdsd limits (#1027) * set mdsd limit using container memory limit --------- Co-authored-by: Amol Agrawal --- .../scripts/tomlparser-agent-config.rb | 29 ++++++++++--------- .../scripts/tomlparser-prom-agent-config.rb | 24 +++++++++++++++ .../templates/ama-logs-daemonset-windows.yaml | 5 ++++ .../templates/ama-logs-daemonset.yaml | 10 +++++++ .../templates/ama-logs-deployment.yaml | 5 ++++ kubernetes/linux/main.sh | 22 ++++++++++++++ source/plugins/go/src/telemetry.go | 5 ++++ source/plugins/ruby/in_kube_nodes.rb | 7 +++++ 8 files changed, 94 insertions(+), 13 deletions(-) diff --git a/build/common/installer/scripts/tomlparser-agent-config.rb b/build/common/installer/scripts/tomlparser-agent-config.rb index 64589b85e..ba5f8e6c4 100644 --- a/build/common/installer/scripts/tomlparser-agent-config.rb +++ b/build/common/installer/scripts/tomlparser-agent-config.rb @@ -83,6 +83,7 @@ # Checking to see if container is not prometheus sidecar. # CONTAINER_TYPE is populated only for prometheus sidecar container. @containerType = ENV["CONTAINER_TYPE"] +@containerMemoryLimitInBytes = ENV["CONTAINER_MEMORY_LIMIT_IN_BYTES"] @promFbitChunkSize = 0 @promFbitBufferSize = 0 @@ -268,11 +269,12 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "Using config map value: require_ack_response = #{@requireAckResponse}" end end - # ama-logs daemonset only settings - if !@controllerType.nil? && !@controllerType.empty? && @controllerType.strip.casecmp(@daemonset) == 0 && @containerType.nil? - # mdsd settings - mdsd_config = parsedConfig[:agent_settings][:mdsd_config] - if !mdsd_config.nil? + + # mdsd settings + mdsd_config = parsedConfig[:agent_settings][:mdsd_config] + if !mdsd_config.nil? + # ama-logs daemonset only settings + if !@controllerType.nil? && !@controllerType.empty? 
&& @controllerType.strip.casecmp(@daemonset) == 0 && @containerType.nil? mdsdMonitoringMaxEventRate = mdsd_config[:monitoring_max_event_rate] if is_valid_number?(mdsdMonitoringMaxEventRate) @mdsdMonitoringMaxEventRate = mdsdMonitoringMaxEventRate.to_i @@ -288,13 +290,6 @@ def populateSettingValuesFromConfigMap(parsedConfig) @mdsdUploadFrequencyInSeconds = mdsdUploadFrequencyInSeconds.to_i puts "Using config map value: upload_frequency_seconds = #{@mdsdUploadFrequencyInSeconds}" end - mdsdBackPressureThresholdInMB = mdsd_config[:backpressure_memory_threshold_in_mb] - if is_valid_number?(mdsdBackPressureThresholdInMB) && mdsdBackPressureThresholdInMB.to_i > 100 - @mdsdBackPressureThresholdInMB = mdsdBackPressureThresholdInMB.to_i - puts "Using config map value: backpressure_memory_threshold_in_mb = #{@mdsdBackPressureThresholdInMB}" - else - puts "Ignoring mdsd backpressure limit. Check input values for correctness." - end mdsdCompressionLevel = mdsd_config[:compression_level] if is_number?(mdsdCompressionLevel) && mdsdCompressionLevel.to_i >= 0 && mdsdCompressionLevel.to_i < 10 # supported levels from 0 to 9 @mdsdCompressionLevel = mdsdCompressionLevel.to_i @@ -303,6 +298,14 @@ def populateSettingValuesFromConfigMap(parsedConfig) puts "Ignoring mdsd compression_level level since its not supported level. Check input values for correctness." end end + + mdsdBackPressureThresholdInMB = mdsd_config[:backpressure_memory_threshold_in_mb] + if is_valid_number?(mdsdBackPressureThresholdInMB) && is_valid_number?(@containerMemoryLimitInBytes) && mdsdBackPressureThresholdInMB.to_i < (@containerMemoryLimitInBytes.to_i / 1048576) && mdsdBackPressureThresholdInMB.to_i > 100 + @mdsdBackPressureThresholdInMB = mdsdBackPressureThresholdInMB.to_i + puts "Using config map value: backpressure_memory_threshold_in_mb = #{@mdsdBackPressureThresholdInMB}" + else + puts "Ignoring mdsd backpressure limit. Check input values for correctness. 
Configmap value in mb: #{mdsdBackPressureThresholdInMB}, container limit in bytes: #{@containerMemoryLimitInBytes}" + end end prom_fbit_config = nil @@ -443,7 +446,7 @@ def populateSettingValuesFromConfigMap(parsedConfig) end if @mdsdBackPressureThresholdInMB > 0 - file.write("export MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB=#{@mdsdBackPressureThresholdInMB}\n") + file.write("export BACKPRESSURE_THRESHOLD_IN_MB=#{@mdsdBackPressureThresholdInMB}\n") end if @mdsdCompressionLevel >= 0 diff --git a/build/common/installer/scripts/tomlparser-prom-agent-config.rb b/build/common/installer/scripts/tomlparser-prom-agent-config.rb index 0d449ef1a..ff269d5da 100644 --- a/build/common/installer/scripts/tomlparser-prom-agent-config.rb +++ b/build/common/installer/scripts/tomlparser-prom-agent-config.rb @@ -16,11 +16,18 @@ @waittime_port_25226 = 45 @waittime_port_25228 = 120 @waittime_port_25229 = 45 +@containerMemoryLimitInBytes = ENV["CONTAINER_MEMORY_LIMIT_IN_BYTES"] +@mdsdBackPressureThresholdInMB = 0 def is_number?(value) true if Integer(value) rescue false end +# check if it is number and greater than 0 +def is_valid_number?(value) + return !value.nil? && is_number?(value) && value.to_i > 0 +end + # check if it is a valid waittime def is_valid_waittime?(value, default) return !value.nil? && is_number?(value) && value.to_i >= default/2 && value.to_i <= 3*default @@ -94,6 +101,18 @@ def populateSettingValuesFromConfigMap(parsedConfig) end end + # mdsd settings + mdsd_config = parsedConfig[:agent_settings][:mdsd_config] + if !mdsd_config.nil? 
+ mdsdBackPressureThresholdInMB = mdsd_config[:backpressure_memory_threshold_in_mb] + if is_valid_number?(mdsdBackPressureThresholdInMB) && is_valid_number?(@containerMemoryLimitInBytes) && mdsdBackPressureThresholdInMB.to_i < (@containerMemoryLimitInBytes.to_i / 1048576) && mdsdBackPressureThresholdInMB.to_i > 100 + @mdsdBackPressureThresholdInMB = mdsdBackPressureThresholdInMB.to_i + puts "Using config map value: backpressure_memory_threshold_in_mb = #{@mdsdBackPressureThresholdInMB}" + else + puts "Ignoring mdsd backpressure limit. Check input values for correctness. Configmap value in mb: #{mdsdBackPressureThresholdInMB}, container limit in bytes: #{@containerMemoryLimitInBytes}" + end + end + end rescue => errorStr puts "config::error:Exception while reading config settings for sidecar agent configuration setting - #{errorStr}, using defaults" @@ -124,6 +143,11 @@ def populateSettingValuesFromConfigMap(parsedConfig) file.write("export WAITTIME_PORT_25226=#{@waittime_port_25226}\n") file.write("export WAITTIME_PORT_25228=#{@waittime_port_25228}\n") file.write("export WAITTIME_PORT_25229=#{@waittime_port_25229}\n") + + if @mdsdBackPressureThresholdInMB > 0 + file.write("export BACKPRESSURE_THRESHOLD_IN_MB=#{@mdsdBackPressureThresholdInMB}\n") + end + # Close file after writing all environment variables file.close else diff --git a/charts/azuremonitor-containers/templates/ama-logs-daemonset-windows.yaml b/charts/azuremonitor-containers/templates/ama-logs-daemonset-windows.yaml index 47325152f..2a22fe19a 100644 --- a/charts/azuremonitor-containers/templates/ama-logs-daemonset-windows.yaml +++ b/charts/azuremonitor-containers/templates/ama-logs-daemonset-windows.yaml @@ -85,6 +85,11 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: CONTAINER_MEMORY_LIMIT_IN_BYTES + valueFrom: + resourceFieldRef: + containerName: ama-logs-windows + resource: limits.memory - name: NODE_IP valueFrom: fieldRef: diff --git 
a/charts/azuremonitor-containers/templates/ama-logs-daemonset.yaml b/charts/azuremonitor-containers/templates/ama-logs-daemonset.yaml index 1e6d38caf..7f413da4c 100644 --- a/charts/azuremonitor-containers/templates/ama-logs-daemonset.yaml +++ b/charts/azuremonitor-containers/templates/ama-logs-daemonset.yaml @@ -83,6 +83,11 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: CONTAINER_MEMORY_LIMIT_IN_BYTES + valueFrom: + resourceFieldRef: + containerName: ama-logs + resource: limits.memory {{- if not (empty .Values.Azure.Extension.Name) }} - name: ARC_K8S_EXTENSION_NAME value: {{ .Values.Azure.Extension.Name | quote }} @@ -210,6 +215,11 @@ spec: valueFrom: fieldRef: fieldPath: status.hostIP + - name: CONTAINER_MEMORY_LIMIT_IN_BYTES + valueFrom: + resourceFieldRef: + containerName: ama-logs-prometheus + resource: limits.memory - name: ISTEST value: {{ .Values.amalogs.ISTEST | quote }} - name: HOSTNAME diff --git a/charts/azuremonitor-containers/templates/ama-logs-deployment.yaml b/charts/azuremonitor-containers/templates/ama-logs-deployment.yaml index a0e4f4c9f..bfd69eb0d 100644 --- a/charts/azuremonitor-containers/templates/ama-logs-deployment.yaml +++ b/charts/azuremonitor-containers/templates/ama-logs-deployment.yaml @@ -58,6 +58,11 @@ spec: resourceFieldRef: containerName: ama-logs resource: limits.cpu + - name: CONTAINER_MEMORY_LIMIT_IN_BYTES + valueFrom: + resourceFieldRef: + containerName: ama-logs + resource: limits.memory {{- if ne .Values.amalogs.env.clusterId "" }} - name: AKS_RESOURCE_ID value: {{ .Values.amalogs.env.clusterId | quote }} diff --git a/kubernetes/linux/main.sh b/kubernetes/linux/main.sh index d73aba353..d0916b415 100644 --- a/kubernetes/linux/main.sh +++ b/kubernetes/linux/main.sh @@ -855,6 +855,28 @@ else fi source ~/.bashrc +# derive the mdsd backpressure threshold from the container memory limit only when neither the backpressure threshold nor the fbit tail buffer limit is provided through the configmap +if [ -n "${BACKPRESSURE_THRESHOLD_IN_MB}" ]; then + export 
MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB=${BACKPRESSURE_THRESHOLD_IN_MB} + echo "export MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB=$MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB" >> ~/.bashrc + echo "Setting MDSD backpressure threshold from configmap: ${MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB} MB" + source ~/.bashrc +elif [ -z "${FBIT_TAIL_MEM_BUF_LIMIT}" ]; then + if [ -n "${CONTAINER_MEMORY_LIMIT_IN_BYTES}" ]; then + echo "Container limit in bytes: ${CONTAINER_MEMORY_LIMIT_IN_BYTES}" + limit_in_mebibytes=$((CONTAINER_MEMORY_LIMIT_IN_BYTES / 1048576)) + + export MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB=$((limit_in_mebibytes * 50 / 100)) + echo "export MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB=$MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB" >> ~/.bashrc + echo "Setting MDSD backpressure threshold as 50 percent of container limit: ${MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB} MB" + source ~/.bashrc + else + echo "Container limit not found. Not setting mdsd backpressure threshold" + fi +else + echo "MDSD backpressure threshold not set since tail_mem_buf_limit_megabytes is used in configmap. Use backpressure_memory_threshold_in_mb in configmap to set it." +fi + if [ "${CONTAINER_TYPE}" == "PrometheusSidecar" ]; then if [ "${MUTE_PROM_SIDECAR}" != "true" ]; then echo "starting mdsd with mdsd-port=26130, fluentport=26230 and influxport=26330 in sidecar container..." 
diff --git a/source/plugins/go/src/telemetry.go b/source/plugins/go/src/telemetry.go index c1d1f0109..2e3b52847 100644 --- a/source/plugins/go/src/telemetry.go +++ b/source/plugins/go/src/telemetry.go @@ -185,6 +185,11 @@ func SendContainerLogPluginMetrics(telemetryPushIntervalProperty string) { telemetryDimensions["PromFbitBufferSize"] = os.Getenv("AZMON_SIDECAR_FBIT_BUFFER_SIZE") telemetryDimensions["PromFbitMemBufLimit"] = os.Getenv("AZMON_SIDECAR_FBIT_MEM_BUF_LIMIT") + mdsdBackPressureThresholdInMB := os.Getenv("MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB") + if mdsdBackPressureThresholdInMB != "" { + telemetryDimensions["mdsdBackPressureThresholdInMB"] = mdsdBackPressureThresholdInMB + } + SendEvent(eventNameCustomPrometheusSidecarHeartbeat, telemetryDimensions) } else { diff --git a/source/plugins/ruby/in_kube_nodes.rb b/source/plugins/ruby/in_kube_nodes.rb index 209e4ae4c..aeb7ad869 100644 --- a/source/plugins/ruby/in_kube_nodes.rb +++ b/source/plugins/ruby/in_kube_nodes.rb @@ -50,6 +50,7 @@ def initialize(is_unit_test_mode = nil, kubernetesApiClient = nil, @@rsPromMonitorPodsFieldSelectorLength = @env["TELEMETRY_RS_PROM_FIELD_SELECTOR_LENGTH"] @@collectAllKubeEvents = @env["AZMON_CLUSTER_COLLECT_ALL_KUBE_EVENTS"] @@osmNamespaceCount = @env["TELEMETRY_OSM_CONFIGURATION_NAMESPACES_COUNT"] + @@mdsdBackPressureThresholdInMB = @env["MDSD_BACKPRESSURE_MONITOR_MEMORY_THRESHOLD_IN_MB"] @ContainerNodeInventoryTag = "oneagent.containerInsights.CONTAINER_NODE_INVENTORY_BLOB" @insightsMetricsTag = "oneagent.containerInsights.INSIGHTS_METRICS_BLOB" @@ -393,6 +394,12 @@ def parse_and_emit_records(nodeInventory, batchTime = Time.utc.iso8601) if (File.file?(@@osmConfigMountPath)) properties["osmNamespaceCount"] = @@osmNamespaceCount end + + # telemetry about mdsd backpressure limits for replicaset + if (!@@mdsdBackPressureThresholdInMB.nil?) && (!@@mdsdBackPressureThresholdInMB.empty?) 
+ properties["mdsdBackPressureThresholdInMB"] = @@mdsdBackPressureThresholdInMB + end + @applicationInsightsUtility.sendMetricTelemetry("NodeCoreCapacity", capacityInfo["cpu"], properties) telemetrySent = true rescue => errorStr