diff --git a/cloud/observability/promql-to-dd-go/prometheus/http.go b/cloud/observability/promql-to-dd-go/prometheus/http.go
index c25c0c9..7da1485 100644
--- a/cloud/observability/promql-to-dd-go/prometheus/http.go
+++ b/cloud/observability/promql-to-dd-go/prometheus/http.go
@@ -45,6 +45,9 @@ func (c *HttpClient) Do(ctx context.Context, req *http.Request) (*http.Response,
 	if ctx != nil {
 		req = req.WithContext(ctx)
 	}
+
+	req.Header.Set("User-Agent", "promql-to-dd")
+
 	resp, err := c.Client.Do(req)
 	defer func() {
 		if resp != nil {
diff --git a/cloud/observability/promql-to-scrape/Dockerfile b/cloud/observability/promql-to-scrape/Dockerfile
new file mode 100644
index 0000000..7930c45
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/Dockerfile
@@ -0,0 +1,11 @@
+FROM golang:1.21-alpine
+
+WORKDIR /usr/src/app
+
+COPY go.mod go.sum ./
+RUN go mod download && go mod verify
+
+COPY . .
+RUN go build -v -o /usr/local/bin/promql-to-scrape ./cmd/promql-to-scrape/main.go
+
+ENTRYPOINT ["/usr/local/bin/promql-to-scrape"]
\ No newline at end of file
diff --git a/cloud/observability/promql-to-scrape/README.md b/cloud/observability/promql-to-scrape/README.md
new file mode 100644
index 0000000..95141c5
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/README.md
@@ -0,0 +1,47 @@
+# promql-to-scrape
+
+This basic application provides an example of how one could use the Temporal Cloud Observability endpoint to expose a typical Prometheus `/metrics` endpoint.
+
+**This example is provided as-is, without support. It is intended as reference material only.**
+
+## How to Use
+
+Grab your client cert and key, place them at `client.crt` and `tls.key`, and have your Temporal Cloud account (with the observability endpoint enabled) handy.
+
+```
+go mod tidy
+go build -o promql-to-scrape cmd/promql-to-scrape/main.go
+./promql-to-scrape -client-cert client.crt -client-key tls.key -prom-endpoint https://<account>.tmprl.cloud/prometheus --config-file examples/config.yaml --debug
+...
+time=2023-11-16T17:43:20.260-06:00 level=DEBUG msg="successful metric retrieval" time=3.529039083s
+```
+
+You can now hit http://localhost:9001/metrics on your machine and see your metrics.
+
+### Important Usability Information
+
+**Important:** When you scrape this endpoint, you should do so with a **60s** scrape interval (see the example scrape config below), unless you are meaningfully modifying this code. The example queries all assume a 1 minute rate, and you'll want the two to be equal.
+
+**Very Important:** The data you see here is approximately 1 minute delayed (assuming you follow the guidance above). Because of the aggregation that happens before metrics are presented to you, this application must issue its queries 60 seconds in the past; otherwise aggregation would not yet be complete and the queries would return no results.
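+For example, a minimal Prometheus scrape config honoring the 60s interval might look like this (the job name and target are illustrative; point the target at wherever you run this):
+
+```yaml
+scrape_configs:
+  - job_name: promql-to-scrape
+    scrape_interval: 60s
+    static_configs:
+      - targets: ["localhost:9001"]
+```
+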
+## Deployment
+
+Some example Kubernetes manifests are provided in the `/examples` directory. Filling in your certificates and account should get you going pretty quickly.
+
+## Generating Config
+
+There is a second binary you can build that helps you generate a default configuration of queries to scrape and export.
+
+```
+go build -o genconfig cmd/genconfig/main.go
+./genconfig -client-cert client.crt -client-key tls.key -prom-endpoint https://<account>.tmprl.cloud/prometheus
+...
+```
+
+This will generate an example config at `config.yaml` that you may use. It looks up all the existing metrics and generates a reasonable query for each:
+
+- For counters, a `rate(counter[1m])`
+- For gauges, it simply queries for `gauge`
+- For histograms, it does a p99 aggregated by `temporal_namespace` and `operation`: `histogram_quantile(0.99, sum(rate(metric[1m])) by (le, operation, temporal_namespace))`
+
+Modify at your own risk. You may find, for instance, that you'd like a global latency across all namespaces; you can add such queries to your config file, as shown below.
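+A hypothetical entry for a global p99 latency across all namespaces (the metric name here is made up; the query shape mirrors the generated ones) could look like:
+
+```yaml
+metrics:
+  - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m:global
+    query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le))
+```
\ No newline at end of file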
diff --git a/cloud/observability/promql-to-scrape/cmd/genconfig/main.go b/cloud/observability/promql-to-scrape/cmd/genconfig/main.go
new file mode 100644
index 0000000..0ea94e7
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/cmd/genconfig/main.go
@@ -0,0 +1,84 @@
+package main
+
+import (
+	"flag"
+	"fmt"
+	"log"
+	"os"
+	"sort"
+
+	"github.com/temporalio/samples-server/cloud/observability/promql-to-scrape/internal"
+
+	"gopkg.in/yaml.v3"
+)
+
+func main() {
+	set := flag.NewFlagSet("genconfig", flag.ExitOnError)
+	promURL := set.String("prom-endpoint", "", "Required Prometheus API endpoint for the server eg. https://<account>.tmprl.cloud/prometheus")
+	serverRootCACert := set.String("server-root-ca-cert", "", "Optional path to root server CA cert")
+	clientCert := set.String("client-cert", "", "Required path to client cert")
+	clientKey := set.String("client-key", "", "Required path to client key")
+	serverName := set.String("server-name", "", "Optional server name to use for verifying the server's certificate")
+	insecureSkipVerify := set.Bool("insecure-skip-verify", false, "Skip verification of the server's certificate and host name")
+
+	if err := set.Parse(os.Args[1:]); err != nil {
+		log.Fatalf("failed parsing args: %s", err)
+	} else if *promURL == "" || *clientCert == "" || *clientKey == "" {
+		log.Fatalf("-prom-endpoint, -client-cert and -client-key are required")
+	}
+
+	client, err := internal.NewAPIClient(
+		internal.APIConfig{
+			TargetHost:         *promURL,
+			ServerRootCACert:   *serverRootCACert,
+			ClientCert:         *clientCert,
+			ClientKey:          *clientKey,
+			ServerName:         *serverName,
+			InsecureSkipVerify: *insecureSkipVerify,
+		},
+	)
+	if err != nil {
+		log.Fatalf("Failed to create Prometheus client: %s", err)
+	}
+
+	counters, gauges, histograms, err := client.ListMetrics("temporal_cloud_v0")
+	if err != nil {
+		log.Fatalf("Failed to pull metric names: %s", err)
+	}
+	// print what was discovered, for visibility
+	fmt.Println(counters)
+	fmt.Println(gauges)
+	fmt.Println(histograms)
+
+	conf := internal.Config{}
+
+	for _, counter := range counters {
+		conf.Metrics = append(conf.Metrics, internal.Metric{
+			MetricName: fmt.Sprintf("%s:rate1m", counter),
+			Query:      fmt.Sprintf("rate(%s[1m])", counter),
+		})
+	}
+	for _, gauge := range gauges {
+		conf.Metrics = append(conf.Metrics, internal.Metric{
+			MetricName: gauge,
+			Query:      gauge,
+		})
+	}
+	for _, histogram := range histograms {
+		conf.Metrics = append(conf.Metrics, internal.Metric{
+			MetricName: fmt.Sprintf("%s:histogram_quantile_p99_1m", histogram),
+			Query:      fmt.Sprintf("histogram_quantile(0.99, sum(rate(%s[1m])) by (le, operation, temporal_namespace))", histogram),
+		})
+	}
+
+	sort.Sort(internal.ByMetricName(conf.Metrics))
+
+	yamlData, err := yaml.Marshal(&conf)
+	if err != nil {
+		log.Fatalf("error marshalling yaml: %v", err)
+	}
+
+	err = os.WriteFile("config.yaml", yamlData, 0644)
+	if err != nil {
+		log.Fatalf("error writing config.yaml: %v", err)
+	}
+}
diff --git a/cloud/observability/promql-to-scrape/cmd/promql-to-scrape/main.go b/cloud/observability/promql-to-scrape/cmd/promql-to-scrape/main.go
new file mode 100644
index 0000000..54ca858
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/cmd/promql-to-scrape/main.go
@@ -0,0 +1,59 @@
+package main
+
+import (
+	"flag"
+	"log"
+	"os"
+
+	"github.com/temporalio/samples-server/cloud/observability/promql-to-scrape/internal"
+
+	"golang.org/x/exp/slog"
+)
+
+func main() {
+	set := flag.NewFlagSet("promql-to-scrape", flag.ExitOnError)
+	promURL := set.String("prom-endpoint", "", "Required Prometheus API endpoint for the server eg. https://<account>.tmprl.cloud/prometheus")
+	configFile := set.String("config-file", "", "Config file for promql-to-scrape")
+	serverRootCACert := set.String("server-root-ca-cert", "", "Optional path to root server CA cert")
+	clientCert := set.String("client-cert", "", "Required path to client cert")
+	clientKey := set.String("client-key", "", "Required path to client key")
+	serverName := set.String("server-name", "", "Optional server name to use for verifying the server's certificate")
+	insecureSkipVerify := set.Bool("insecure-skip-verify", false, "Skip verification of the server's certificate and host name")
+	serverAddr := set.String("bind", "0.0.0.0:9001", "address:port to expose the metrics server on")
+	debugLogging := set.Bool("debug", false, "Toggle debug logging")
+
+	if err := set.Parse(os.Args[1:]); err != nil {
+		log.Fatalf("failed parsing args: %v", err)
+	} else if *clientCert == "" || *clientKey == "" || *configFile == "" || *promURL == "" {
+		log.Fatalf("-client-cert, -client-key, -config-file, -prom-endpoint are required")
+	}
+
+	logLevel := slog.LevelInfo
+	if *debugLogging {
+		logLevel = slog.LevelDebug
+	}
+	h := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: logLevel})
+	slog.SetDefault(slog.New(h))
+
+	client, err := internal.NewAPIClient(
+		internal.APIConfig{
+			TargetHost:         *promURL,
+			ServerRootCACert:   *serverRootCACert,
+			ClientCert:         *clientCert,
+			ClientKey:          *clientKey,
+			ServerName:         *serverName,
+			InsecureSkipVerify: *insecureSkipVerify,
+		},
+	)
+	if err != nil {
+		log.Fatalf("failed to create Prometheus client: %v", err)
+	}
+
+	conf, err := internal.LoadConfig(*configFile)
+	if err != nil {
+		log.Fatalf("failed to load config file: %v", err)
+	}
+
+	s := internal.NewPromToScrapeServer(client, conf, *serverAddr)
+	// ListenAndServe only returns on failure, so treat any return as fatal
+	log.Fatalf("metrics server exited: %v", s.Start())
+}
diff --git a/cloud/observability/promql-to-scrape/examples/config.yaml b/cloud/observability/promql-to-scrape/examples/config.yaml
new file mode 100644
index 0000000..184c746
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/examples/config.yaml
@@ -0,0 +1,43 @@
+metrics:
+  - metric_name: temporal_cloud_v0_frontend_service_error_count:rate1m
+    query: rate(temporal_cloud_v0_frontend_service_error_count[1m])
+  - metric_name: temporal_cloud_v0_frontend_service_pending_requests
+    query: temporal_cloud_v0_frontend_service_pending_requests
+  - metric_name: temporal_cloud_v0_frontend_service_request_count:rate1m
+    query: rate(temporal_cloud_v0_frontend_service_request_count[1m])
+  - metric_name: temporal_cloud_v0_poll_success_count:rate1m
+    query: rate(temporal_cloud_v0_poll_success_count[1m])
+  - metric_name: temporal_cloud_v0_poll_success_sync_count:rate1m
+    query: rate(temporal_cloud_v0_poll_success_sync_count[1m])
+  - metric_name: temporal_cloud_v0_poll_timeout_count:rate1m
+    query: rate(temporal_cloud_v0_poll_timeout_count[1m])
+  - metric_name: temporal_cloud_v0_resource_exhausted_error_count:rate1m
+    query: rate(temporal_cloud_v0_resource_exhausted_error_count[1m])
+  - metric_name: temporal_cloud_v0_schedule_action_success_count:rate1m
+    query: rate(temporal_cloud_v0_schedule_action_success_count[1m])
+  - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count:rate1m
+    query: rate(temporal_cloud_v0_schedule_buffer_overruns_count[1m])
+  - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count:rate1m
+    query: rate(temporal_cloud_v0_schedule_missed_catchup_window_count[1m])
+  - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m
+    query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le, operation, temporal_namespace))
+  - metric_name: temporal_cloud_v0_service_latency_count:rate1m
+    query: rate(temporal_cloud_v0_service_latency_count[1m])
+  - metric_name: temporal_cloud_v0_service_latency_sum:rate1m
+    query: rate(temporal_cloud_v0_service_latency_sum[1m])
+  - metric_name: temporal_cloud_v0_state_transition_count:rate1m
+    query: rate(temporal_cloud_v0_state_transition_count[1m])
+  - metric_name: temporal_cloud_v0_total_action_count:rate1m
+    query: rate(temporal_cloud_v0_total_action_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_cancel_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_cancel_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_continued_as_new_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_continued_as_new_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_failed_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_failed_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_success_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_success_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_terminate_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_terminate_count[1m])
+  - metric_name: temporal_cloud_v0_workflow_timeout_count:rate1m
+    query: rate(temporal_cloud_v0_workflow_timeout_count[1m])
diff --git a/cloud/observability/promql-to-scrape/examples/configmap.yaml b/cloud/observability/promql-to-scrape/examples/configmap.yaml
new file mode 100644
index 0000000..01d0b5a
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/examples/configmap.yaml
@@ -0,0 +1,49 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: promql-to-scrape-config
+data:
+  config.yaml: |
+    metrics:
+      - metric_name: temporal_cloud_v0_frontend_service_error_count:rate1m
+        query: rate(temporal_cloud_v0_frontend_service_error_count[1m])
+      - metric_name: temporal_cloud_v0_frontend_service_pending_requests
+        query: temporal_cloud_v0_frontend_service_pending_requests
+      - metric_name: temporal_cloud_v0_frontend_service_request_count:rate1m
+        query: rate(temporal_cloud_v0_frontend_service_request_count[1m])
+      - metric_name: temporal_cloud_v0_poll_success_count:rate1m
+        query: rate(temporal_cloud_v0_poll_success_count[1m])
+      - metric_name: temporal_cloud_v0_poll_success_sync_count:rate1m
+        query: rate(temporal_cloud_v0_poll_success_sync_count[1m])
+      - metric_name: temporal_cloud_v0_poll_timeout_count:rate1m
+        query: rate(temporal_cloud_v0_poll_timeout_count[1m])
+      - metric_name: temporal_cloud_v0_resource_exhausted_error_count:rate1m
+        query: rate(temporal_cloud_v0_resource_exhausted_error_count[1m])
+      - metric_name: temporal_cloud_v0_schedule_action_success_count:rate1m
+        query: rate(temporal_cloud_v0_schedule_action_success_count[1m])
+      - metric_name: temporal_cloud_v0_schedule_buffer_overruns_count:rate1m
+        query: rate(temporal_cloud_v0_schedule_buffer_overruns_count[1m])
+      - metric_name: temporal_cloud_v0_schedule_missed_catchup_window_count:rate1m
+        query: rate(temporal_cloud_v0_schedule_missed_catchup_window_count[1m])
+      - metric_name: temporal_cloud_v0_service_latency_bucket:histogram_quantile_p99_1m
+        query: histogram_quantile(0.99, sum(rate(temporal_cloud_v0_service_latency_bucket[1m])) by (le, operation, temporal_namespace))
+      - metric_name: temporal_cloud_v0_service_latency_count:rate1m
+        query: rate(temporal_cloud_v0_service_latency_count[1m])
+      - metric_name: temporal_cloud_v0_service_latency_sum:rate1m
+        query: rate(temporal_cloud_v0_service_latency_sum[1m])
+      - metric_name: temporal_cloud_v0_state_transition_count:rate1m
+        query: rate(temporal_cloud_v0_state_transition_count[1m])
+      - metric_name: temporal_cloud_v0_total_action_count:rate1m
+        query: rate(temporal_cloud_v0_total_action_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_cancel_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_cancel_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_continued_as_new_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_continued_as_new_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_failed_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_failed_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_success_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_success_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_terminate_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_terminate_count[1m])
+      - metric_name: temporal_cloud_v0_workflow_timeout_count:rate1m
+        query: rate(temporal_cloud_v0_workflow_timeout_count[1m])
\ No newline at end of file
diff --git a/cloud/observability/promql-to-scrape/examples/deployment.yaml b/cloud/observability/promql-to-scrape/examples/deployment.yaml
new file mode 100644
index 0000000..b2a8558
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/examples/deployment.yaml
@@ -0,0 +1,47 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: promql-to-scrape
+  labels:
+    app: promql-to-scrape
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: promql-to-scrape
+  template:
+    metadata:
+      labels:
+        app: promql-to-scrape
+    spec:
+      containers:
+        - name: promql-to-scrape
+          image: ghcr.io/temporalio/promql-to-scrape:7c0e91a
+          args:
+            - --client-cert=/var/run/secrets/ca_crt
+            - --client-key=/var/run/secrets/ca_key
+            - --prom-endpoint=https://<account>.tmprl.cloud/prometheus
+            - --config-file=/etc/promql-to-scrape/config.yaml
+            - --debug
+          ports:
+            - containerPort: 9001
+          volumeMounts:
+            - name: secrets
+              mountPath: /var/run/secrets
+              readOnly: true
+            - name: config-volume
+              mountPath: /etc/promql-to-scrape
+          resources:
+            limits:
+              cpu: "100m"
+              memory: "256Mi"
+      volumes:
+        - name: secrets
+          secret:
+            secretName: promql-to-scrape-secrets
+        - name: config-volume
+          configMap:
+            name: promql-to-scrape-config
+            items:
+              - key: config.yaml
+                path: config.yaml
\ No newline at end of file
diff --git a/cloud/observability/promql-to-scrape/examples/secret.yaml b/cloud/observability/promql-to-scrape/examples/secret.yaml
new file mode 100644
index 0000000..6a09b96
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/examples/secret.yaml
@@ -0,0 +1,10 @@
+apiVersion: v1
+kind: Secret
+type: Opaque
+metadata:
+  name: promql-to-scrape-secrets
+  labels:
+    app: promql-to-scrape
+data:
+  ca_crt: ""
+  ca_key: ""
diff --git a/cloud/observability/promql-to-scrape/go.mod b/cloud/observability/promql-to-scrape/go.mod
new file mode 100644
index 0000000..d4b727d
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/go.mod
@@ -0,0 +1,17 @@
+module github.com/temporalio/samples-server/cloud/observability/promql-to-scrape
+
+go 1.21
+
+require (
+	github.com/prometheus/client_golang v1.17.0
+	github.com/prometheus/common v0.45.0
+	golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa
+	gopkg.in/yaml.v3 v3.0.1
+)
+
+require (
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/kr/text v0.2.0 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.2 // indirect
+)
diff --git a/cloud/observability/promql-to-scrape/go.sum b/cloud/observability/promql-to-scrape/go.sum
new file mode 100644
index 0000000..231b5c6
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/go.sum
@@ -0,0 +1,65 @@
+github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
+github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
+github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=
+github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
+github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
+github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
+github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
+github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
+github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
+github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
+github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo=
+github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg=
+github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k=
+github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
+github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
+github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
+github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
+github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/prometheus/client_golang v1.17.0 h1:rl2sfwZMtSthVU752MqfjQozy7blglC+1SOtjMAMh+Q=
+github.com/prometheus/client_golang v1.17.0/go.mod h1:VeL+gMmOAxkS2IqfCq0ZmHSL+LjWfWDUmp1mBz9JgUY=
+github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 h1:v7DLqVdK4VrYkVD5diGdl4sxJurKJEMnODWRJlxV9oM=
+github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16/go.mod h1:oMQmHW1/JoDwqLtg57MGgP/Fb1CJEYF2imWWhWtMkYU=
+github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM=
+github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY=
+github.com/prometheus/procfs v0.11.1 h1:xRC8Iq1yyca5ypa9n1EZnWZkt7dwcoRPQwX/5gwaUuI=
+github.com/prometheus/procfs v0.11.1/go.mod h1:eesXgaPo1q7lBpVMoMy0ZOFTth9hBn4W/y0/p/ScXhY=
+github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
+github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
+github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa h1:FRnLl4eNAQl8hwxVVC17teOw8kdjVDVAiFMtgUdTSRQ=
+golang.org/x/exp v0.0.0-20231110203233-9a3e6036ecaa/go.mod h1:zk2irFbV9DP96SEBUUAy67IdHUaZuSnrz1n472HUCLE=
+golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
+golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
+golang.org/x/oauth2 v0.12.0 h1:smVPGxink+n1ZI5pkQa8y6fZT0RW0MgCO5bFpepy4B4=
+golang.org/x/oauth2 v0.12.0/go.mod h1:A74bZ3aGXgCY0qaIC9Ahg6Lglin4AMAco8cIv9baba4=
+golang.org/x/sys v0.14.0 h1:Vz7Qs629MkJkGyHxUlRHizWJRG2j8fbQKjELVSNhy7Q=
+golang.org/x/sys v0.14.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
+golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
+google.golang.org/appengine v1.6.7 h1:FZR1q0exgwxzPzp/aF+VccGrSfxfPpkBqjIIEq3ru6c=
+google.golang.org/appengine v1.6.7/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc=
+google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
+google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
+gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/cloud/observability/promql-to-scrape/internal/client.go b/cloud/observability/promql-to-scrape/internal/client.go
new file mode 100644
index 0000000..4a7ade5
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/client.go
@@ -0,0 +1,100 @@
+package internal
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"net/http"
+	"strings"
+	"time"
+
+	promapi "github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/model"
+)
+
+type (
+	Querier interface {
+		ListMetrics(metricPrefix string) ([]string, []string, []string, error)
+		QueryMetricsInstant(promql string) (model.Vector, error)
+	}
+
+	APIClient struct {
+		promapi.API
+	}
+)
+
+type APIConfig struct {
+	TargetHost         string
+	ServerRootCACert   string
+	ClientCert         string
+	ClientKey          string
+	ServerName         string
+	InsecureSkipVerify bool
+}
+
+func NewAPIClient(cfg APIConfig) (*APIClient, error) {
+	tlsCfg, err := BuildTLSConfig(
+		cfg.ClientCert,
+		cfg.ClientKey,
+		cfg.ServerRootCACert,
+		cfg.ServerName,
+		cfg.InsecureSkipVerify,
+	)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build TLS config: %w", err)
+	}
+
+	httpClient := &http.Client{
+		Transport: &http.Transport{TLSClientConfig: tlsCfg},
+	}
+
+	client, err := NewHttpClient(cfg.TargetHost, httpClient)
+	if err != nil {
+		return nil, fmt.Errorf("failed to build HTTP client: %w", err)
+	}
+
+	return &APIClient{promapi.NewAPI(client)}, nil
+}
+
+func (c *APIClient) ListMetrics(metricPrefix string) ([]string, []string, []string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	values, _, err := c.LabelValues(ctx, "__name__", nil, time.Time{}, time.Time{})
+	if err != nil {
+		return nil, nil, nil, fmt.Errorf("failed to fetch Prometheus metric names: %w", err)
+	}
+	counters := []string{}
+	gauges := []string{}
+	histograms := []string{}
+	for _, v := range values {
+		if !strings.HasPrefix(string(v), metricPrefix) {
+			continue
+		}
+		if strings.HasSuffix(string(v), "_bucket") {
+			histograms = append(histograms, string(v))
+		} else if strings.HasSuffix(string(v), "_count") || strings.HasSuffix(string(v), "_sum") {
+			counters = append(counters, string(v))
+		} else {
+			gauges = append(gauges, string(v))
+		}
+	}
+	return counters, gauges, histograms, nil
+}
+
+func (c *APIClient) QueryMetricsInstant(promql string) (model.Vector, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	// query 60s in the past so Temporal Cloud's aggregation is complete (see the README)
+	result, warnings, err := c.API.Query(ctx, promql, time.Now().Add(-60*time.Second), promapi.WithTimeout(10*time.Second))
+	if err != nil {
+		return nil, fmt.Errorf("failed to query Temporal Cloud: %w", err)
+	}
+	if len(warnings) > 0 {
+		log.Printf("warning while querying Temporal Cloud: %v\n", warnings)
+	}
+	promVector, ok := result.(model.Vector)
+	if !ok {
+		return nil, fmt.Errorf("unexpected type %T returned for instant query", result)
+	}
+	return promVector, nil
+}
diff --git a/cloud/observability/promql-to-scrape/internal/config.go b/cloud/observability/promql-to-scrape/internal/config.go
new file mode 100644
index 0000000..7ec65f5
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/config.go
@@ -0,0 +1,47 @@
+package internal
+
+import (
+	"os"
+
+	"gopkg.in/yaml.v3"
+)
+
+type Config struct {
+	Metrics []Metric
+}
+
+type Metric struct {
+	MetricName string `yaml:"metric_name"`
+	Query      string `yaml:"query"`
+}
+
+func LoadConfig(filename string) (*Config, error) {
+	var config Config
+
+	bytes, err := os.ReadFile(filename)
+	if err != nil {
+		return nil, err
+	}
+
+	err = yaml.Unmarshal(bytes, &config)
+	if err != nil {
+		return nil, err
+	}
+
+	return &config, nil
+}
+
+// ByMetricName lets us sort metrics by name
+type ByMetricName []Metric
+
+func (m ByMetricName) Len() int {
+	return len(m)
+}
+
+func (m ByMetricName) Less(i, j int) bool {
+	return m[i].MetricName < m[j].MetricName
+}
+
+func (m ByMetricName) Swap(i, j int) {
+	m[i], m[j] = m[j], m[i]
+}
diff --git a/cloud/observability/promql-to-scrape/internal/http.go b/cloud/observability/promql-to-scrape/internal/http.go
new file mode 100644
index 0000000..fa0330b
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/http.go
@@ -0,0 +1,82 @@
+package internal
+
+import (
+	"bytes"
+	"context"
+	"net/http"
+	"net/url"
+	"path"
+	"strings"
+)
+
+type HttpClient struct {
+	Endpoint *url.URL
+	Client   *http.Client
+}
+
+func NewHttpClient(addr string, httpClient *http.Client) (*HttpClient, error) {
+	u, err := url.Parse(addr)
+	if err != nil {
+		return nil, err
+	}
+	u.Path = strings.TrimRight(u.Path, "/")
+
+	return &HttpClient{
+		Endpoint: u,
+		Client:   httpClient,
+	}, nil
+}
+
+func (c *HttpClient) URL(ep string, args map[string]string) *url.URL {
+	p := path.Join(c.Endpoint.Path, ep)
+
+	for arg, val := range args {
+		arg = ":" + arg
+		p = strings.ReplaceAll(p, arg, val)
+	}
+
+	u := *c.Endpoint
+	u.Path = p
+
+	return &u
+}
+
+func (c *HttpClient) Do(ctx context.Context, req *http.Request) (*http.Response, []byte, error) {
+	if ctx != nil {
+		req = req.WithContext(ctx)
+	}
+
+	req.Header.Set("User-Agent", "promql-to-scrape")
+
+	resp, err := c.Client.Do(req)
+	defer func() {
+		if resp != nil {
+			resp.Body.Close()
+		}
+	}()
+
+	if err != nil {
+		return nil, nil, err
+	}
+
+	var body []byte
+	done := make(chan struct{})
+	go func() {
+		var buf bytes.Buffer
+		_, err = buf.ReadFrom(resp.Body)
+		body = buf.Bytes()
+		close(done)
+	}()
+
+	select {
+	case <-ctx.Done():
+		<-done
+		err = resp.Body.Close()
+		if err == nil {
+			err = ctx.Err()
+		}
+	case <-done:
+	}
+
+	return resp, body, err
+}
diff --git a/cloud/observability/promql-to-scrape/internal/query.go b/cloud/observability/promql-to-scrape/internal/query.go
new file mode 100644
index 0000000..626e742
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/query.go
@@ -0,0 +1,25 @@
+package internal
+
+import (
+	"fmt"
+
+	"github.com/prometheus/common/model"
+)
+
+type Data map[string][]*model.Sample
+
+func QueryMetrics(conf *Config, client *APIClient) (Data, error) {
+	// https://pkg.go.dev/github.com/prometheus/common/model#Sample
+	queriedMetrics := map[string][]*model.Sample{}
+
+	for _, metric := range conf.Metrics {
+		result, err := client.QueryMetricsInstant(metric.Query)
+		if err != nil {
+			return nil, fmt.Errorf("failed to query for %s: %w", metric.MetricName, err)
+		}
+
+		queriedMetrics[metric.MetricName] = []*model.Sample(result)
+	}
+
+	return Data(queriedMetrics), nil
+}
diff --git a/cloud/observability/promql-to-scrape/internal/serialize.go b/cloud/observability/promql-to-scrape/internal/serialize.go
new file mode 100644
index 0000000..7d0225e
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/serialize.go
@@ -0,0 +1,122 @@
+package internal
+
+import (
+	"math"
+	"strconv"
+	"strings"
+	"sync"
+
+	"github.com/prometheus/common/model"
+)
+
+// Everything below this point is pretty shamelessly stolen and slightly modified from
+// https://github.com/kubernetes/kube-state-metrics/blob/main/pkg/metric/metric.go
+
+var (
+	escapeWithDoubleQuote = strings.NewReplacer("\\", `\\`, "\n", `\n`, "\"", `\"`)
+	initialNumBufSize     = 24
+	numBufPool            = sync.Pool{
+		New: func() interface{} {
+			b := make([]byte, 0, initialNumBufSize)
+			return &b
+		},
+	}
+)
[]string{"_count", "_sum", "_bucket"}) + sb.WriteString("# HELP ") + sb.WriteString(nameWithoutSuffix) + sb.WriteByte(' ') + sb.WriteString("https://docs.temporal.io/cloud/metrics#available-metrics") + sb.WriteByte('\n') + + sb.WriteString("# TYPE ") + sb.WriteString(nameWithoutSuffix) + sb.WriteByte(' ') + sb.WriteString("gauge") + sb.WriteByte('\n') + + for _, s := range samples { + sb.WriteString(metricName) + + // write labels + var separator byte = '{' + if len(s.Metric) == 1 { + sb.WriteByte(separator) + } + for k, v := range model.LabelSet(s.Metric) { + name := string(k) + if name == "__name__" || name == "__rollup__" || name == "temporal_service_type" { + continue + } + sb.WriteByte(separator) + sb.WriteString(string(k)) + sb.WriteString("=\"") + escapeString(&sb, string(v)) + sb.WriteByte('"') + separator = ',' + } + sb.WriteByte('}') + + // write value + sb.WriteByte(' ') + writeFloat(&sb, float64(s.Value)) + + // write timestamp in milliseconds since epoch + // do we want timestamp? i doubt it + //sb.WriteByte(' ') + //writeInt(&sb, s.Timestamp.Unix()*1000) //nolint + + // end + sb.WriteByte('\n') + } + } + + return sb.String() +} + +// escapeString replaces '\' by '\\', new line character by '\n', and '"' by +// '\"'. +// Taken from github.com/prometheus/common/expfmt/text_create.go. +func escapeString(m *strings.Builder, v string) { + escapeWithDoubleQuote.WriteString(m, v) //nolint +} + +// writeFloat is equivalent to fmt.Fprint with a float64 argument but hardcodes +// a few common cases for increased efficiency. For non-hardcoded cases, it uses +// strconv.AppendFloat to avoid allocations, similar to writeInt. +// Taken from github.com/prometheus/common/expfmt/text_create.go. +func writeFloat(w *strings.Builder, f float64) { + switch { + case f == 1: + w.WriteByte('1') + case f == 0: + w.WriteByte('0') + case f == -1: + w.WriteString("-1") + case math.IsNaN(f): + w.WriteString("NaN") + case math.IsInf(f, +1): + w.WriteString("+Inf") + case math.IsInf(f, -1): + w.WriteString("-Inf") + default: + bp := numBufPool.Get().(*[]byte) + *bp = strconv.AppendFloat((*bp)[:0], f, 'g', -1, 64) + w.Write(*bp) + numBufPool.Put(bp) + } +} + +func trimSuffixes(str string, suffixes []string) string { + for _, suffix := range suffixes { + str = strings.TrimSuffix(str, suffix) + } + + return str +} diff --git a/cloud/observability/promql-to-scrape/internal/server.go b/cloud/observability/promql-to-scrape/internal/server.go new file mode 100644 index 0000000..789dff2 --- /dev/null +++ b/cloud/observability/promql-to-scrape/internal/server.go @@ -0,0 +1,92 @@ +package internal + +import ( + "fmt" + "net/http" + "sync" + "time" + + "golang.org/x/exp/slog" +) + +type PromToScrapeServer struct { + client *APIClient + conf *Config + server http.Server + data string + lastSuccessfulTime time.Time + + sync.RWMutex +} + +func NewPromToScrapeServer(client *APIClient, conf *Config, addr string) *PromToScrapeServer { + s := &PromToScrapeServer{ + client: client, + conf: conf, + data: "", + } + mux := http.NewServeMux() + mux.HandleFunc("/metrics", s.metricsHandler) + mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { + http.Redirect(w, r, "/metrics", http.StatusMovedPermanently) + }) + + s.server = http.Server{ + Addr: addr, + Handler: mux, + } + + go s.run() + + return s +} + +// metricsHandler is the HTTP handler for the "/metrics" endpoint. 
diff --git a/cloud/observability/promql-to-scrape/internal/server.go b/cloud/observability/promql-to-scrape/internal/server.go
new file mode 100644
index 0000000..789dff2
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/server.go
@@ -0,0 +1,92 @@
+package internal
+
+import (
+	"fmt"
+	"net/http"
+	"sync"
+	"time"
+
+	"golang.org/x/exp/slog"
+)
+
+type PromToScrapeServer struct {
+	client             *APIClient
+	conf               *Config
+	server             http.Server
+	data               string
+	lastSuccessfulTime time.Time
+
+	sync.RWMutex
+}
+
+func NewPromToScrapeServer(client *APIClient, conf *Config, addr string) *PromToScrapeServer {
+	s := &PromToScrapeServer{
+		client: client,
+		conf:   conf,
+		data:   "",
+	}
+	mux := http.NewServeMux()
+	mux.HandleFunc("/metrics", s.metricsHandler)
+	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
+		http.Redirect(w, r, "/metrics", http.StatusMovedPermanently)
+	})
+
+	s.server = http.Server{
+		Addr:    addr,
+		Handler: mux,
+	}
+
+	go s.run()
+
+	return s
+}
+
+// metricsHandler is the HTTP handler for the "/metrics" endpoint.
+func (s *PromToScrapeServer) metricsHandler(w http.ResponseWriter, r *http.Request) {
+	s.RLock()
+	defer s.RUnlock()
+	if time.Since(s.lastSuccessfulTime) < 5*time.Minute {
+		fmt.Fprint(w, s.data)
+	} else {
+		w.WriteHeader(http.StatusInternalServerError)
+		slog.Error("can't serve metrics", "error", "metrics queried are stale (more than 5 minutes old)")
+	}
+}
+
+// run loops forever, fetching the metrics data we need.
+func (s *PromToScrapeServer) run() {
+	s.queryMetrics()
+	// 59 seconds rather than 60, to provide some jitter against the scrape interval
+	ticker := time.NewTicker(59 * time.Second)
+
+	for range ticker.C {
+		s.queryMetrics()
+	}
+}
+
+// there's an alternate way to implement this:
+//
+// keep the objects returned from the query, or convert them into something a bit more ergonomic,
+// and create ConstMetrics with the prometheus client. I happened to have the code lying around for working
+// with model.Sample, but the ConstMetrics route is probably more idiomatic and safe. (A sketch of that
+// approach follows after this diff.)
+func (s *PromToScrapeServer) queryMetrics() {
+	start := time.Now()
+	queriedMetrics, err := QueryMetrics(s.conf, s.client)
+	if err != nil {
+		slog.Error("failed to query metrics", "error", err)
+		return
+	}
+	s.Lock()
+	s.data = SamplesToString(queriedMetrics)
+	s.lastSuccessfulTime = time.Now()
+	s.Unlock()
+	slog.Debug("successful metric retrieval", "time", time.Since(start))
+}
+
+// Start runs the embedded http.Server.
+func (s *PromToScrapeServer) Start() error {
+	return s.server.ListenAndServe()
+}
diff --git a/cloud/observability/promql-to-scrape/internal/tls.go b/cloud/observability/promql-to-scrape/internal/tls.go
new file mode 100644
index 0000000..7547821
--- /dev/null
+++ b/cloud/observability/promql-to-scrape/internal/tls.go
@@ -0,0 +1,35 @@
+package internal
+
+import (
+	"crypto/tls"
+	"crypto/x509"
+	"fmt"
+	"os"
+)
+
+func BuildTLSConfig(clientCert, clientKey, serverRootCACert, serverName string, insecureSkipVerify bool) (*tls.Config, error) {
+	cert, err := tls.LoadX509KeyPair(clientCert, clientKey)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load client key pair: %w", err)
+	}
+
+	// Load server CA if given
+	var serverCAPool *x509.CertPool
+	if serverRootCACert != "" {
+		serverCAPool = x509.NewCertPool()
+		b, err := os.ReadFile(serverRootCACert)
+		if err != nil {
+			return nil, fmt.Errorf("failed reading server CA: %w", err)
+		} else if !serverCAPool.AppendCertsFromPEM(b) {
+			return nil, fmt.Errorf("server CA PEM file invalid")
+		}
+	}
+
+	return &tls.Config{
+		Certificates:       []tls.Certificate{cert},
+		RootCAs:            serverCAPool,
+		ServerName:         serverName,
+		InsecureSkipVerify: insecureSkipVerify,
+	}, nil
+}
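As an aside on the ConstMetrics alternative mentioned in `internal/server.go`: a minimal sketch could convert each queried `model.Sample` into a `prometheus.ConstMetric` served by a custom collector. The names here (`sampleCollector` and its field) are illustrative and not part of this change:

```go
package internal

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/common/model"
)

// sampleCollector is a hypothetical unchecked collector that re-exposes
// the most recently queried samples as gauge-typed ConstMetrics.
type sampleCollector struct {
	samples map[string][]*model.Sample // metric name -> latest query results
}

// Describe is intentionally empty, making this an "unchecked" collector.
func (c *sampleCollector) Describe(chan<- *prometheus.Desc) {}

func (c *sampleCollector) Collect(ch chan<- prometheus.Metric) {
	for name, samples := range c.samples {
		for _, s := range samples {
			// copy the sample's labels, dropping the internal metric name label
			labels := prometheus.Labels{}
			for k, v := range s.Metric {
				if k == model.MetricNameLabel {
					continue
				}
				labels[string(k)] = string(v)
			}
			m, err := prometheus.NewConstMetric(
				prometheus.NewDesc(name, "queried from Temporal Cloud", nil, labels),
				prometheus.GaugeValue,
				float64(s.Value),
			)
			if err != nil {
				continue // skip samples whose label sets are invalid
			}
			ch <- m
		}
	}
}
```

Registered on its own `prometheus.Registry` and served via `promhttp.HandlerFor`, a collector like this would replace `SamplesToString` and the hand-rolled exposition writing in `internal/serialize.go`.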