diff --git a/CHANGELOG.md b/CHANGELOG.md
index da28cd2ad7..1d57df8a13 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -49,6 +49,7 @@
 * [BUGFIX] Fix issue where negation of native histograms (eg. `-some_native_histogram_series`) did nothing. #9508
 * [BUGFIX] Fix issue where `metric might not be a counter, name does not end in _total/_sum/_count/_bucket` annotation would be emitted even if `rate` or `increase` did not have enough samples to compute a result. #9508
 * [BUGFIX] Fix issue where sharded queries could return annotations with incorrect or confusing position information. #9536
+* [BUGFIX] Fix issue where downstream consumers may not generate correct cache keys for experimental error caching. #9644
 
 ### Mixin
 
diff --git a/pkg/frontend/querymiddleware/error_caching.go b/pkg/frontend/querymiddleware/error_caching.go
index 6415e370e4..930b61393a 100644
--- a/pkg/frontend/querymiddleware/error_caching.go
+++ b/pkg/frontend/querymiddleware/error_caching.go
@@ -5,7 +5,6 @@ package querymiddleware
 import (
 	"context"
 	"errors"
-	"fmt"
 	"time"
 
 	"github.com/go-kit/log"
@@ -26,7 +25,7 @@ const (
 	reasonNotCacheableError = "not-cacheable-api-error"
 )
 
-func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
+func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq shouldCacheFn, keyGen CacheKeyGenerator, logger log.Logger, reg prometheus.Registerer) MetricsQueryMiddleware {
 	cacheLoadAttempted := promauto.With(reg).NewCounter(prometheus.CounterOpts{
 		Name: "cortex_frontend_query_error_cache_requests_total",
 		Help: "Number of requests that check the results cache for an error.",
@@ -50,6 +49,7 @@ func newErrorCachingMiddleware(cache cache.Cache, limits Limits, shouldCacheReq
 		cache:              cache,
 		limits:             limits,
 		shouldCacheReq:     shouldCacheReq,
+		keyGen:             keyGen,
 		logger:             logger,
 		cacheLoadAttempted: cacheLoadAttempted,
 		cacheLoadHits:      cacheLoadHits,
@@ -64,6 +64,7 @@ type errorCachingHandler struct {
 	cache          cache.Cache
 	limits         Limits
 	shouldCacheReq shouldCacheFn
+	keyGen         CacheKeyGenerator
 	logger         log.Logger
 
 	cacheLoadAttempted prometheus.Counter
@@ -85,7 +86,7 @@ func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryReques
 	}
 
 	e.cacheLoadAttempted.Inc()
-	key := errorCachingKey(tenant.JoinTenantIDs(tenantIDs), request)
+	key := e.keyGen.QueryRequestError(ctx, tenant.JoinTenantIDs(tenantIDs), request)
 	hashedKey := cacheHashKey(key)
 
 	if cachedErr := e.loadErrorFromCache(ctx, key, hashedKey, spanLog); cachedErr != nil {
@@ -176,9 +177,3 @@ func (e *errorCachingHandler) isCacheable(apiErr *apierror.APIError) (bool, stri
 
 	return true, ""
 }
-
-// errorCachingKey returns the key for caching and error query response. Standalone function
-// to allow for easier testing.
-func errorCachingKey(tenantID string, r MetricsQueryRequest) string {
-	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStart(), r.GetEnd(), r.GetStep())
-}
diff --git a/pkg/frontend/querymiddleware/error_caching_test.go b/pkg/frontend/querymiddleware/error_caching_test.go
index 46fae948f7..80906eae67 100644
--- a/pkg/frontend/querymiddleware/error_caching_test.go
+++ b/pkg/frontend/querymiddleware/error_caching_test.go
@@ -20,6 +20,8 @@ import (
 )
 
 func TestErrorCachingHandler_Do(t *testing.T) {
+	keyGen := NewDefaultCacheKeyGenerator(newTestPrometheusCodec(), time.Second)
+
 	newDefaultRequest := func() *PrometheusRangeQueryRequest {
 		return &PrometheusRangeQueryRequest{
 			queryExpr: parseQuery(t, "up"),
@@ -31,7 +33,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 	runHandler := func(ctx context.Context, inner MetricsQueryHandler, c cache.Cache, req MetricsQueryRequest) (Response, error) {
 		limits := &mockLimits{resultsCacheTTLForErrors: time.Minute}
 
-		middleware := newErrorCachingMiddleware(c, limits, resultsCacheEnabledByOption, test.NewTestingLogger(t), prometheus.NewPedanticRegistry())
+		middleware := newErrorCachingMiddleware(c, limits, resultsCacheEnabledByOption, keyGen, test.NewTestingLogger(t), prometheus.NewPedanticRegistry())
 		handler := middleware.Wrap(inner)
 		return handler.Do(ctx, req)
 	}
@@ -80,7 +82,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes, err := proto.Marshal(&CachedError{
 			Key:       key,
 			ErrorType: string(apierror.TypeExec),
@@ -108,7 +110,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes, err := proto.Marshal(&CachedError{
 			Key:       "different key that is stored under the same hashed key",
 			ErrorType: string(apierror.TypeExec),
@@ -136,7 +138,7 @@ func TestErrorCachingHandler_Do(t *testing.T) {
 		ctx := user.InjectOrgID(context.Background(), "1234")
 		req := newDefaultRequest()
 
-		key := errorCachingKey("1234", req)
+		key := keyGen.QueryRequestError(ctx, "1234", req)
 		bytes := []byte{0x0, 0x0, 0x0, 0x0}
 
 		// NOTE: We rely on this mock cache being synchronous
diff --git a/pkg/frontend/querymiddleware/results_cache.go b/pkg/frontend/querymiddleware/results_cache.go
index ae6d660933..9a6c65612b 100644
--- a/pkg/frontend/querymiddleware/results_cache.go
+++ b/pkg/frontend/querymiddleware/results_cache.go
@@ -192,6 +192,9 @@ type CacheKeyGenerator interface {
 	// QueryRequest should generate a cache key based on the tenant ID and MetricsQueryRequest.
 	QueryRequest(ctx context.Context, tenantID string, r MetricsQueryRequest) string
 
+	// QueryRequestError should generate a cache key based on errors for the tenant ID and MetricsQueryRequest.
+	QueryRequestError(ctx context.Context, tenantID string, r MetricsQueryRequest) string
+
 	// LabelValues should return a cache key for a label values request. The cache key does not need to contain the tenant ID.
 	// LabelValues can return ErrUnsupportedRequest, in which case the response won't be treated as an error, but the item will still not be cached.
 	// LabelValues should return a nil *GenericQueryCacheKey when it returns an error and
@@ -219,16 +222,20 @@
 }
 
 // QueryRequest generates a cache key based on the userID, MetricsQueryRequest and interval.
-func (g DefaultCacheKeyGenerator) QueryRequest(_ context.Context, userID string, r MetricsQueryRequest) string {
+func (g DefaultCacheKeyGenerator) QueryRequest(_ context.Context, tenantID string, r MetricsQueryRequest) string {
 	startInterval := r.GetStart() / g.interval.Milliseconds()
 	stepOffset := r.GetStart() % r.GetStep()
 
 	// Use original format for step-aligned request, so that we can use existing cached results for such requests.
 	if stepOffset == 0 {
-		return fmt.Sprintf("%s:%s:%d:%d", userID, r.GetQuery(), r.GetStep(), startInterval)
+		return fmt.Sprintf("%s:%s:%d:%d", tenantID, r.GetQuery(), r.GetStep(), startInterval)
 	}
 
-	return fmt.Sprintf("%s:%s:%d:%d:%d", userID, r.GetQuery(), r.GetStep(), startInterval, stepOffset)
+	return fmt.Sprintf("%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStep(), startInterval, stepOffset)
+}
+
+func (g DefaultCacheKeyGenerator) QueryRequestError(_ context.Context, tenantID string, r MetricsQueryRequest) string {
+	return fmt.Sprintf("EC:%s:%s:%d:%d:%d", tenantID, r.GetQuery(), r.GetStart(), r.GetEnd(), r.GetStep())
 }
 
 // shouldCacheFn checks whether the current request should go to cache
diff --git a/pkg/frontend/querymiddleware/roundtrip.go b/pkg/frontend/querymiddleware/roundtrip.go
index 2efc03a9a0..89d76a52e2 100644
--- a/pkg/frontend/querymiddleware/roundtrip.go
+++ b/pkg/frontend/querymiddleware/roundtrip.go
@@ -357,7 +357,7 @@ func newQueryMiddlewares(
 		queryRangeMiddleware = append(
 			queryRangeMiddleware,
 			newInstrumentMiddleware("error_caching", metrics),
-			newErrorCachingMiddleware(cacheClient, limits, resultsCacheEnabledByOption, log, registerer),
+			newErrorCachingMiddleware(cacheClient, limits, resultsCacheEnabledByOption, cacheKeyGenerator, log, registerer),
 		)
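Note for downstream consumers (not part of the patch above): any custom CacheKeyGenerator must now also implement QueryRequestError, since the error-caching middleware asks the generator wired through roundtrip.go for its key instead of the removed errorCachingKey helper. The sketch below shows one way to satisfy the widened interface by embedding DefaultCacheKeyGenerator and shadowing only the new method. The package name, the prefixedKeyGenerator type, its constructor, and the prefix field are purely illustrative, and the import path assumes the Mimir module path; CacheKeyGenerator, DefaultCacheKeyGenerator, NewDefaultCacheKeyGenerator, Codec, and MetricsQueryRequest come from the patch itself.

// Hypothetical downstream sketch, assuming the github.com/grafana/mimir module path.
// It reuses the default generator for result-cache and label keys and only
// customises the error-cache key with a deployment-specific prefix.
package custommiddleware

import (
	"context"
	"fmt"
	"time"

	"github.com/grafana/mimir/pkg/frontend/querymiddleware"
)

// prefixedKeyGenerator is an illustrative name, not a type from the patch.
type prefixedKeyGenerator struct {
	querymiddleware.DefaultCacheKeyGenerator

	prefix string
}

func newPrefixedKeyGenerator(codec querymiddleware.Codec, interval time.Duration, prefix string) prefixedKeyGenerator {
	return prefixedKeyGenerator{
		DefaultCacheKeyGenerator: querymiddleware.NewDefaultCacheKeyGenerator(codec, interval),
		prefix:                   prefix,
	}
}

// QueryRequestError shadows the embedded default and builds the error-cache key
// from the same request fields the default uses, plus the configured prefix.
func (g prefixedKeyGenerator) QueryRequestError(_ context.Context, tenantID string, r querymiddleware.MetricsQueryRequest) string {
	return fmt.Sprintf("%s:EC:%s:%s:%d:%d:%d", g.prefix, tenantID, r.GetQuery(), r.GetStart(), r.GetEnd(), r.GetStep())
}

Overriding only QueryRequestError keeps the default QueryRequest key layout untouched, so previously cached query results remain addressable while error-cache entries move to the new key space.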