fix(projects): Treat fetch failures as pending (#4140)
When we fail to fetch a project state for any reason (max retries
exceeded, Redis error), we should not mark the project as invalid.

Currently we treat the project as if its DSN were invalid and respond to
clients with a 403. This might have made sense as a defensive measure in
a pre-spooling world, where envelopes could not be buffered for long, but
with spooling and time-based envelope eviction we can now safely buffer
them.
jjbayer authored Dec 3, 2024
1 parent fa27cd7 commit 8667f9d
Showing 3 changed files with 15 additions and 15 deletions.
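The substance of the change is the fallback in the project cache service shown below: when a project config cannot be fetched, Relay now resolves the project to a pending state (so incoming envelopes are spooled) rather than a disabled one (which answered clients with 403). As a rough, self-contained sketch of that pattern, with simplified stand-in types and names rather than Relay's actual implementation:

// Illustrative sketch only: simplified stand-ins, not Relay's actual types.
#[derive(Debug)]
enum ProjectState {
    Enabled,
    Disabled, // old fallback: clients received 403 as if the DSN were invalid
    Pending,  // new fallback: envelopes are spooled until a config arrives
}

#[derive(Debug)]
enum FetchError {
    MaxRetriesExceeded,
    Redis(String),
}

fn resolve_state(fetch: Result<ProjectState, FetchError>) -> ProjectState {
    match fetch {
        Ok(state) => state,
        Err(error) => {
            // Log the failure, but do not treat the project as invalid.
            eprintln!("failed to fetch project from source: {error:?}");
            ProjectState::Pending
        }
    }
}

fn main() {
    let state = resolve_state(Err(FetchError::MaxRetriesExceeded));
    println!("fallback state: {state:?}"); // prints: fallback state: Pending
}

With a pending state, the existing spooling and time-based eviction decide what happens to the buffered envelopes once a config does or does not arrive, which is what the commit message argues makes this change safe now.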
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog

+## Unreleased
+
+**Bug Fixes**:
+
+- Accept incoming requests even if there was an error fetching their project config. ([#4140](https://github.com/getsentry/relay/pull/4140))
+
 ## 24.11.1

 **Breaking Changes**:
4 changes: 1 addition & 3 deletions relay-server/src/services/projects/cache/service.rs
@@ -124,9 +124,7 @@ impl ProjectCacheService
             "failed to fetch project from source: {fetch:?}"
         );

-        // TODO: change this to ProjectState::Pending once we consider it safe to do so.
-        // see https://github.com/getsentry/relay/pull/4140.
-        ProjectState::Disabled.into()
+        ProjectState::Pending.into()
     }
 };

20 changes: 8 additions & 12 deletions tests/integration/test_query.py
@@ -155,51 +155,47 @@ def get_project_config():
         mini_sentry.clear_test_failures()


-def test_query_retry_maxed_out(mini_sentry, relay_with_processing, events_consumer):
+def test_query_retry_maxed_out(mini_sentry, relay):
     """
     Assert that a query is not retried an infinite amount of times.
-    This is not specific to processing or store, but here we have the outcomes
-    consumer which we can use to assert that an event has been dropped.
     """
     request_count = 0

-    events_consumer = events_consumer()
-
     original_get_project_config = mini_sentry.app.view_functions["get_project_config"]

     @mini_sentry.app.endpoint("get_project_config")
     def get_project_config():
         if flask_request.json.get("global") is True:
             return original_get_project_config()

         nonlocal request_count
         request_count += 1
         print("RETRY", request_count)
         return "no", 500

     RETRIES = 1
     query_timeout = 0.5  # Initial grace period

     # Relay's exponential backoff: INITIAL_INTERVAL = 1s; DEFAULT_MULTIPLIER = 1.5;
     for retry in range(RETRIES):  # 1 retry
         query_timeout += 1 * 1.5 ** (retry + 1)

-    relay = relay_with_processing(
-        {"limits": {"query_timeout": math.ceil(query_timeout)}}
-    )
+    relay = relay(mini_sentry, {"limits": {"query_timeout": math.ceil(query_timeout)}})

     # No error messages yet
     assert mini_sentry.test_failures.empty()

     try:
         relay.send_event(42)
         time.sleep(query_timeout)

         assert request_count == 1 + RETRIES
         assert {str(e) for _, e in mini_sentry.current_test_failures()} == {
             "Relay sent us event: error fetching project states: upstream request returned error 500 Internal Server Error: no error details",
         }

+        time.sleep(1)  # Wait for project to be cached
+
+        # Relay still accepts events for this project
+        next_response = relay.send_event(42)
+        assert "id" in next_response
     finally:
         mini_sentry.clear_test_failures()

