From 4377b800a3a7a926203d7c91fca726a655a18ec3 Mon Sep 17 00:00:00 2001
From: Nick Pillitteri <56quarters@users.noreply.github.com>
Date: Thu, 17 Oct 2024 16:59:54 -0400
Subject: [PATCH] Use CacheKeyGenerator implementation in error caching middleware (#9644)

Use an implementation of the CacheKeyGenerator interface instead of bare
function to generate cache keys for error caching. This allows downstream
consumers (GEM) to inject custom logic for caching based on LBAC rules.

Signed-off-by: Nick Pillitteri
---
 CHANGELOG.md                                       |  1 +
 pkg/frontend/querymiddleware/error_caching.go      | 13 ++++---------
 pkg/frontend/querymiddleware/error_caching_test.go | 10 ++++++----
 pkg/frontend/querymiddleware/results_cache.go      | 13 ++++++++++---
 pkg/frontend/querymiddleware/roundtrip.go          |  2 +-
 5 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index da28cd2ad71..1d57df8a132 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,7 @@
 * [BUGFIX] Fix issue where negation of native histograms (eg. `-some_native_histogram_series`) did nothing. #9508
 * [BUGFIX] Fix issue where `metric might not be a counter, name does not end in _total/_sum/_count/_bucket` annotation would be emitted even if `rate` or `increase` did not have enough samples to compute a result. #9508
 * [BUGFIX] Fix issue where sharded queries could return annotations with incorrect or confusing position information. #9536
+* [BUGFIX] Fix issue where downstream consumers may not generate correct cache keys for experimental error caching. #9644
 
 ### Mixin
 
diff --git a/pkg/frontend/querymiddleware/error_caching.go b/pkg/frontend/querymiddleware/error_caching.go
index 6415e370e45..930b61393ac 100644
--- a/pkg/frontend/querymiddleware/error_caching.go
+++ b/pkg/frontend/querymiddleware/error_caching.go
@@ -5,7 +5,6 @@ package querymiddleware
 import (
 	"context"
 	"errors"
-	"fmt"
 	"time"
 
 	"github.com/go-kit/log"
@@ -26,7 +25,7 @@ const (
 	reasonNotCacheableError = "not-cacheable-api-error"
 )
 
-func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
+func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, keyGen CacheKeyGenerator, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
 	cacheLoadAttempted := promauto.With(reg).NewCounter(prometheus.CounterOpts{
 		Name: "cortex_frontend_query_error_cache_requests_total",
 		Help: "Number of requests that check the results cache for an error.",
@@ -50,6 +49,7 @@ func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq
 			cache:              cache,
 			limits:             limits,
 			shouldCacheReq:     shouldCacheReq,
+			keyGen:             keyGen,
 			logger:             logger,
 			cacheLoadAttempted: cacheLoadAttempted,
 			cacheLoadHits:      cacheLoadHits,
@@ -64,6 +64,7 @@ type errorCachingHandler struct {
 	cache          cache.Cache
 	limits         Limits
 	shouldCacheReq shouldCacheFn
+	keyGen         CacheKeyGenerator
 	logger         log.Logger
 
 	cacheLoadAttempted prometheus.Counter
@@ -85,7 +86,7 @@ func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryReques
 	}
 
 	e.cacheLoadAttempted.Inc()
-	key := errorCachingKey(tenant.JoinTenantIDs(tenantIDs), request)
+	key := e.keyGen.QueryRequestError(ctx, tenant.JoinTenantIDs(tenantIDs), request)
 	hashedKey := cacheHashKey(key)
 
 	if cachedErr := e.loadErrorFromCache(ctx, key, hashedKey, spanLog); cachedErr != nil {
@@ -176,9 +177,3 @@ func (e *errorCachingHandler) isCacheable(apiErr *apierror.APIError) (bool, stri
 
 	return true, ""
 }
-
-// errorCachingKey returns the key for caching and error query response. Standalone function
-// to allow for easier testing.
-func errorCachingKey(tenantID string, r MetricsQueryRequest) string {
-	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStart(), r.GetEnd(), r.GetStep())
-}
diff --git a/pkg/frontend/querymiddleware/error_caching_test.go b/pkg/frontend/querymiddleware/error_caching_test.go
index 46fae948f78..80906eae679 100644
--- a/pkg/frontend/querymiddleware/error_caching_test.go
+++ b/pkg/frontend/querymiddleware/error_caching_test.go
@@ -20,6 +20,8 @@ import (
 )
 
 func TestErrorCachingHandler_Do(t *testing.T) {
+	keyGen := NewDefaultCacheKeyGenerator(newTestPrometheusCodec(), time.Second)
+
 	newDefaultRequest := func() *PrometheusRangeQueryRequest {
 		return &PrometheusRangeQueryRequest{
 			queryExpr: parseQuery(t, "up"),
@@ -31,7 +33,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 
 	runHandler := func(ctx context.Context, inner MetricsQueryHandler, c cache.Cache, req MetricsQueryRequest) (Response, error) {
 		limits := &mockLimits{resultsCacheTTLForErrors: time.Minute}
-		middleware := newErrorCachingMiddleware(c, limits, resultsCacheEnabledByOption, test.NewTestingLogger(t), prometheus.NewPedanticRegistry())
+		middleware := newErrorCachingMiddleware(c, limits, resultsCacheEnabledByOption, keyGen, test.NewTestingLogger(t), prometheus.NewPedanticRegistry())
 		handler := middleware.Wrap(inner)
 		return handler.Do(ctx, req)
 	}
@@ -80,7 +82,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes, err := proto.Marshal(&CachedError{
 			Key:       key,
 			ErrorType: string(apierror.TypeExec),
@@ -108,7 +110,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes, err := proto.Marshal(&CachedError{
 			Key:       "different key that is stored under the same hashed key",
 			ErrorType: string(apierror.TypeExec),
@@ -136,7 +138,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes := []byte{0x0, 0x0, 0x0, 0x0}
 
 		// NOTE: We rely on this mock cache being synchronous
diff --git a/pkg/frontend/querymiddleware/results_cache.go b/pkg/frontend/querymiddleware/results_cache.go
index ae6d6609332..9a6c65612b1 100644
--- a/pkg/frontend/querymiddleware/results_cache.go
+++ b/pkg/frontend/querymiddleware/results_cache.go
@@ -192,6 +192,9 @@ type CacheKeyGenerator interface {
 	// QueryRequest should generate a cache key based on the tenant ID and MetricsQueryRequest.
 	QueryRequest(ctx context.Context, tenantID string, r MetricsQueryRequest) string
 
+	// QueryRequestError should generate a cache key based on errors for the tenant ID and MetricsQueryRequest.
+	QueryRequestError(ctx context.Context, tenantID string, r MetricsQueryRequest) string
+
 	// LabelValues should return a cache key for a label values request. The cache key does not need to contain the tenant ID.
 	// LabelValues can return ErrUnsupportedRequest, in which case the response won't be treated as an error, but the item will still not be cached.
 	// LabelValues should return a nil *GenericQueryCacheKey when it returns an error and
@@ -219,16 +222,20 @@ func NewDefaultCacheKeyGenerator(codec Codec, interval time.Duration) DefaultCac
 }
 
 // QueryRequest generates a cache key based on the userID, MetricsQueryRequest and interval.
-func (g DefaultCacheKeyGenerator) QueryRequest(_ context.Context, userID string, r MetricsQueryRequest) string {
+func (g DefaultCacheKeyGenerator) QueryRequest(_ context.Context, tenantID string, r MetricsQueryRequest) string {
 	startInterval := r.GetStart() / g.interval.Milliseconds()
 	stepOffset := r.GetStart() % r.GetStep()
 
 	// Use original format for step-aligned request, so that we can use existing cached results for such requests.
 	if stepOffset == 0 {
-		return fmt.Sprintf("%s:%s:%d:%d", userID, r.GetQuery(), r.GetStep(), startInterval)
+		return fmt.Sprintf("%s:%s:%d:%d", tenantID, r.GetQuery(), r.GetStep(), startInterval)
 	}
 
-	return fmt.Sprintf("%s:%s:%d:%d:%d", userID, r.GetQuery(), r.GetStep(), startInterval, stepOffset)
+	return fmt.Sprintf("%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStep(), startInterval, stepOffset)
+}
+
+func (g DefaultCacheKeyGenerator) QueryRequestError(_ context.Context, tenantID string, r MetricsQueryRequest) string {
+	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStart(), r.GetEnd(), r.GetStep())
 }
 
 // shouldCacheFn checks whether the current request should go to cache
diff --git a/pkg/frontend/querymiddleware/roundtrip.go b/pkg/frontend/querymiddleware/roundtrip.go
index 2efc03a9a09..89d76a52e26 100644
--- a/pkg/frontend/querymiddleware/roundtrip.go
+++ b/pkg/frontend/querymiddleware/roundtrip.go
@@ -357,7 +357,7 @@ func newQueryMiddlewares(
 		queryRangeMiddleware = append(
 			queryRangeMiddleware,
 			newInstrumentMiddleware("error_caching", metrics),
-			newErrorCachingMiddleware(cacheClient, limits, resultsCacheEnabledByOption, log, registerer),
+			newErrorCachingMiddleware(cacheClient, limits, resultsCacheEnabledByOption, cacheKeyGenerator, log, registerer),
 		)
 	}
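
Note (illustrative addendum, not part of the patch): because newErrorCachingMiddleware now accepts a CacheKeyGenerator, a downstream consumer such as GEM can embed DefaultCacheKeyGenerator and override only QueryRequestError, folding tenant-specific state (for example, a fingerprint of the tenant's LBAC rules) into the error cache key. The Go sketch below shows one way this could look; the package name, the lbacAwareKeyGenerator type, and the lbacPolicyHash helper are hypothetical, while DefaultCacheKeyGenerator, MetricsQueryRequest, and QueryRequestError are the identifiers introduced or touched by this patch.

package lbackeys // hypothetical downstream package

import (
	"context"
	"fmt"

	"github.com/grafana/mimir/pkg/frontend/querymiddleware"
)

// lbacAwareKeyGenerator embeds the default generator and overrides only the
// error-caching key; every other CacheKeyGenerator method is inherited.
type lbacAwareKeyGenerator struct {
	querymiddleware.DefaultCacheKeyGenerator
}

func (g lbacAwareKeyGenerator) QueryRequestError(ctx context.Context, tenantID string, r querymiddleware.MetricsQueryRequest) string {
	// Start from the default "EC:..." key and append a per-tenant policy
	// fingerprint so tenants with different LBAC rules never share a cached error.
	base := g.DefaultCacheKeyGenerator.QueryRequestError(ctx, tenantID, r)
	return fmt.Sprintf("%s:%s", base, lbacPolicyHash(ctx, tenantID))
}

// lbacPolicyHash is a hypothetical stand-in for however a downstream consumer
// fingerprints the LBAC rules that apply to the given tenant.
func lbacPolicyHash(_ context.Context, _ string) string {
	return "lbac-policy-fingerprint"
}

The embedded default generator would be built with querymiddleware.NewDefaultCacheKeyGenerator, and the resulting value passed to newErrorCachingMiddleware in place of the cacheKeyGenerator wired up in roundtrip.go above.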