Skip to content

Commit

Permalink
streamline prometheus configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
kondratyevd committed Feb 12, 2025
1 parent 23cf464 commit 0b6e3cd
Show file tree
Hide file tree
Showing 11 changed files with 94 additions and 81 deletions.
7 changes: 4 additions & 3 deletions helm/supersonic/cfg/envoy-filter.lua
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@ function envoy_on_request(request_handle)
local query_response_template = '"value":%[%d+%.%d+,"([%d%.]+)"%]'
local encoded_query = encode_query(query)

request_handle:logInfo("Prometheus URL: " .. "PROMETHEUS_URL")
request_handle:logInfo("Prometheus scheme: " .. "PROMETHEUS_SCHEME")
request_handle:logInfo("Prometheus host: " .. "PROMETHEUS_HOST")
request_handle:logInfo("Prometheus port: " .. "PROMETHEUS_PORT")
request_handle:logInfo("Query: " .. query)
request_handle:logInfo("Encoded query: " .. encoded_query)

Expand All @@ -24,8 +25,8 @@ function envoy_on_request(request_handle)
{
[":method"] = "GET",
[":path"] = "/api/v1/query?query=" .. encoded_query,
[":authority"] = "PROMETHEUS_URL",
[":scheme"] = "PROMETHEUS_SCHEME"
[":scheme"] = "PROMETHEUS_SCHEME",
[":authority"] = "PROMETHEUS_HOST" .. ":" .. "PROMETHEUS_PORT"
},
"",
5000
Expand Down
2 changes: 1 addition & 1 deletion helm/supersonic/templates/NOTES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ SuperSONIC chart successfully installed!
| gRPC endpoint: {{ include "supersonic.grpcEndpoint" . }}
{{- end }}
|
{{- if (or (not .Values.prometheus.external) .Values.prometheus.url) }}
{{- if or (.Values.prometheus.external.enabled) .Values.prometheus.enabled }}
| Prometheus UI: {{ include "supersonic.prometheusUrl" . }}
{{- end }}
|
Expand Down
77 changes: 54 additions & 23 deletions helm/supersonic/templates/_prometheus.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,60 @@ Get Prometheus name
{{- printf "%s-prometheus" (include "supersonic.name" .) | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Resolve the URL scheme used to reach Prometheus.
- External instance (prometheus.external.enabled): whatever prometheus.external.scheme is set to.
- Subchart instance (prometheus.enabled): "https" when the server ingress is enabled
  and has a TLS section, otherwise "http".
Renders nothing when neither of the two flags is set.
*/}}
{{- define "supersonic.prometheusScheme" -}}
{{- $prom := .Values.prometheus -}}
{{- if $prom.external.enabled -}}
  {{- $prom.external.scheme -}}
{{- else if $prom.enabled -}}
  {{- $ingress := $prom.server.ingress -}}
  {{- if and $ingress.enabled $ingress.tls -}}
    {{- "https" -}}
  {{- else -}}
    {{- "http" -}}
  {{- end -}}
{{- end -}}
{{- end -}}

{{/*
Resolve the host name used to reach Prometheus.
- External instance (prometheus.external.enabled): prometheus.external.url, emitted verbatim.
- Subchart instance (prometheus.enabled) with the server ingress enabled and hosts listed:
  the first entry of prometheus.server.ingress.hosts.
- Subchart instance otherwise: the in-cluster DNS name
  "<supersonic.name>-prometheus-server.<release namespace>.svc.cluster.local".
Renders nothing when neither of the two flags is set.
*/}}
{{- define "supersonic.prometheusHost" -}}
{{- $prom := .Values.prometheus -}}
{{- if $prom.external.enabled -}}
  {{- $prom.external.url -}}
{{- else if $prom.enabled -}}
  {{- $ingress := $prom.server.ingress -}}
  {{- if and $ingress.enabled $ingress.hosts -}}
    {{- $ingress.hosts | first -}}
  {{- else -}}
    {{ include "supersonic.name" . }}-prometheus-server.{{ .Release.Namespace }}.svc.cluster.local
  {{- end -}}
{{- end -}}
{{- end -}}

{{/*
Resolve the port used to reach Prometheus.
- External instance (prometheus.external.enabled): prometheus.external.port.
- Subchart instance (prometheus.enabled):
    * server ingress disabled: prometheus.server.service.servicePort, falling back to "9090";
    * server ingress enabled with a TLS section: "443";
    * server ingress enabled without TLS: "80".
Renders nothing when neither of the two flags is set.
*/}}
{{- define "supersonic.prometheusPort" -}}
{{- $prom := .Values.prometheus -}}
{{- if $prom.external.enabled -}}
  {{- $prom.external.port -}}
{{- else if $prom.enabled -}}
  {{- $server := $prom.server -}}
  {{- if not $server.ingress.enabled -}}
    {{- default "9090" $server.service.servicePort -}}
  {{- else if $server.ingress.tls -}}
    {{- "443" -}}
  {{- else -}}
    {{- "80" -}}
  {{- end -}}
{{- end -}}
{{- end -}}

{{/*
Assemble the full Prometheus URL as "<scheme>://<host>:<port>" from the
supersonic.prometheusScheme, supersonic.prometheusHost and supersonic.prometheusPort helpers.
*/}}
{{- define "supersonic.prometheusUrl" -}}
{{- $scheme := include "supersonic.prometheusScheme" . -}}
{{- $host := include "supersonic.prometheusHost" . -}}
{{- $port := include "supersonic.prometheusPort" . -}}
{{ $scheme }}://{{ $host }}:{{ $port }}
{{- end -}}

{{/*
Check if Prometheus exists in the namespace (from any release)
*/}}
Expand Down Expand Up @@ -39,29 +93,6 @@ Get existing Prometheus service name (from any release)
{{- end }}
{{- end -}}

{{/*
Get Prometheus URL.
- prometheus.external set: "<prometheus.scheme>://<prometheus.url>"
  (renders nothing when prometheus.url is empty).
- otherwise, prometheus.ingress.enabled set: "https://<prometheus.ingress.hostName>".
- otherwise: the in-cluster service URL
  "http://<supersonic.prometheusName>.<release namespace>.svc.cluster.local:<port>",
  where the port is prometheus.server.service.servicePort when prometheus.enabled is set
  and the fixed value 9090 when it is not.
*/}}
{{- define "supersonic.prometheusUrl" -}}
{{- if .Values.prometheus.external -}}
{{- if .Values.prometheus.url -}}
{{ .Values.prometheus.scheme }}://{{ .Values.prometheus.url }}
{{- end -}}
{{- else if .Values.prometheus.enabled -}}
{{- if .Values.prometheus.ingress.enabled -}}
https://{{ .Values.prometheus.ingress.hostName }}
{{- else -}}
http://{{ include "supersonic.prometheusName" . }}.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.prometheus.server.service.servicePort }}
{{- end -}}
{{- else -}}
{{- if .Values.prometheus.ingress.enabled -}}
https://{{ .Values.prometheus.ingress.hostName }}
{{- else -}}
http://{{ include "supersonic.prometheusName" . }}.{{ .Release.Namespace }}.svc.cluster.local:9090
{{- end -}}
{{- end -}}
{{- end }}

{{/*
Validate RBAC permissions for Prometheus
*/}}
Expand Down
8 changes: 4 additions & 4 deletions helm/supersonic/templates/_scaling-metric.tpl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
{{/*
{{/*
Get default scaling metric
*/}}
{{- define "supersonic.defaultMetric" -}}
{{- if not ( eq .Values.prometheus.serverLoadMetric "" ) }}
{{- printf "%s" .Values.prometheus.serverLoadMetric -}}
{{- if not ( eq .Values.serverLoadMetric "" ) }}
{{- printf "%s" .Values.serverLoadMetric -}}
{{- else }}
sum by (release) (
rate(nv_inference_queue_duration_us{release=~"{{ include "supersonic.name" . }}"}[15s])
Expand All @@ -19,5 +19,5 @@ sum by (release) (
Get server load threshold (defaults to 100 if not set)
*/}}
{{- define "supersonic.serverLoadThreshold" -}}
{{- default 100 .Values.prometheus.serverLoadThreshold -}}
{{- default 100 .Values.serverLoadThreshold -}}
{{- end -}}
23 changes: 7 additions & 16 deletions helm/supersonic/templates/envoy-configmaps.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,9 @@ static_resources:
- endpoint:
address:
socket_address:
{{- if $.prometheus.external }}
address: {{ $.prometheus.url }}
port_value: {{ $.prometheus.port }}
{{- else }}
address: {{ printf "%s-prometheus.%s.svc.cluster.local" (include "supersonic.name" $.root) $.root.Release.Namespace }}
port_value: 9090
{{- end }}
{{- if and $.prometheus.external (eq $.prometheus.scheme "https") }}
address: {{ include "supersonic.prometheusHost" $.root }}
port_value: {{ include "supersonic.prometheusPort" $.root }}
{{- if eq (include "supersonic.prometheusScheme" $.root) "https" }}
transport_socket:
name: envoy.transport_sockets.tls
typed_config:
Expand Down Expand Up @@ -212,14 +207,10 @@ data:
{{- /* Read and process the Lua configuration file */}}
{{- $luaConfig := $.Files.Get .Values.envoy.rate_limiter.prometheus_based.luaConfig | nindent 4 }}
{{- $luaConfig = $luaConfig | replace "SERVER_LOAD_METRIC" (include "supersonic.defaultMetric" . | quote) }}
{{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.prometheus.serverLoadThreshold) }}
{{- if .Values.prometheus.external }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_URL" .Values.prometheus.url }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" .Values.prometheus.scheme }}
{{- else }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_URL" (printf "%s-prometheus.%s.svc.cluster.local" (include "supersonic.name" .) .Release.Namespace) }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" "http" }}
{{- end }}
{{- $luaConfig = $luaConfig | replace "SERVER_LOAD_THRESHOLD" (quote .Values.serverLoadThreshold) }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_SCHEME" (include "supersonic.prometheusScheme" .) }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_HOST" (include "supersonic.prometheusHost" .) }}
{{- $luaConfig = $luaConfig | replace "PROMETHEUS_PORT" (include "supersonic.prometheusPort" .) }}
{{ $luaConfig | indent 4 }}
---
Expand Down
2 changes: 1 addition & 1 deletion helm/supersonic/templates/keda-so.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ spec:
metadata:
serverAddress: {{ include "supersonic.prometheusUrl" . }}
metricName: autoscaler-metric
threshold: {{ .Values.prometheus.serverLoadThreshold | quote }}
threshold: {{ .Values.serverLoadThreshold | quote }}
query: |-
{{ include "supersonic.defaultMetric" . | nindent 8 }}
---
Expand Down
2 changes: 1 addition & 1 deletion helm/supersonic/templates/prometheus-configmap.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if and .Values.prometheus.enabled (not .Values.prometheus.external) }}
{{- if and .Values.prometheus.enabled (not .Values.prometheus.external.enabled) }}
apiVersion: v1
kind: ConfigMap
metadata:
Expand Down
2 changes: 1 addition & 1 deletion helm/supersonic/templates/prometheus-rbac.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{{- if not .Values.prometheus.external }}
{{- if and .Values.prometheus.enabled (not .Values.prometheus.external.enabled) }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
Expand Down
45 changes: 18 additions & 27 deletions helm/supersonic/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
# -- Unique identifier of SuperSONIC instance (equal to release name by default)
nameOverride: ""

# -- A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter.
## Default metric (inference queue latency) is defined in templates/_scaling-metric.tpl
serverLoadMetric: ""

# -- Threshold for the metric
serverLoadThreshold: 100

triton:
# -- Number of Triton server instances (if autoscaling is disabled)
replicas: 1
Expand Down Expand Up @@ -167,34 +174,18 @@ autoscaler:

# -- Connection to a Prometheus server is required for KEDA autoscaler and Envoy's prometheus-based rate limiter
prometheus:
# -- Whether to use external Prometheus instance (true) or deploy internal one (false)
external: true

# -- Enable or disable Prometheus deployment via subchart
enabled: false # Set to true to deploy Prometheus via subchart

# -- External Prometheus server url and port number (find in documentation of a given cluster or ask admins)
# Only used when external=true
url: ""
port: 443

# -- Specify whether external Prometheus endpoint is exposed as http or https
# Only used when external=true
scheme: "https"

# -- A metric used by both KEDA autoscaler and Envoy's prometheus-based rate limiter.
## Default metric (inference queue latency) is defined in templates/_scaling-metric.tpl
serverLoadMetric: ""

# -- Threshold for the metric
serverLoadThreshold: 100

# -- Ingress configuration for internal Prometheus web UI (only used when external=false)
ingress:
external:
# -- Enable external Prometheus instance
enabled: false
hostName: ""
ingressClassName: ""
annotations: {}
# -- External Prometheus server host name (without scheme or port; those are set via `scheme` and `port` below)
url: ""
# -- External Prometheus server port number
port: 443
# -- Specify whether external Prometheus endpoint is exposed as http or https
scheme: "https"

# -- Enable or disable Prometheus subchart deployment
enabled: false

# -- Prometheus Helm chart configuration (https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus)
server:
Expand Down
6 changes: 3 additions & 3 deletions values/values-cms-ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,12 @@ envoy:
tokens_per_fill: 1
fill_interval: 12s
prometheus:
external: false
enabled: true
grafana:
enabled: true
autoscaler:
enabled: true
minReplicas: 1
maxReplicas: 2
ingress:
enabled: false
grafana:
enabled: true
1 change: 0 additions & 1 deletion values/values-nautilus-cms.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ envoy:
loadBalancerPolicy: "ROUND_ROBIN"
prometheus:
enabled: true
external: false
server:
ingress:
enabled: true
Expand Down

0 comments on commit 0b6e3cd

Please sign in to comment.