From 2b84c453afc83ca68280c7e32df431b81e6d56f0 Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Thu, 25 May 2023 22:32:11 -0700 Subject: [PATCH 1/4] chore: don't resolve jobs backpressure page if data is missing --- charts/posthog/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index f5fe395a..c0db0a8c 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -2426,7 +2426,7 @@ prometheus: description: "The `session_recording_events_dlq` topic offset has increased over the past 5 minutes." - alert: GraphileJobExecutionLag - expr: (max by(task_identifier) (posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"})) > 900 + expr: last_over_time((posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"}[12h])) > 1000 for: 5m labels: rotation: common From 4daad41b04f71668af4072bd4e43871142dd187c Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Thu, 25 May 2023 22:33:15 -0700 Subject: [PATCH 2/4] 1h not 12h --- charts/posthog/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index c0db0a8c..85f9f7a1 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -2426,7 +2426,7 @@ prometheus: description: "The `session_recording_events_dlq` topic offset has increased over the past 5 minutes." - alert: GraphileJobExecutionLag - expr: last_over_time((posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"}[12h])) > 1000 + expr: last_over_time((posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"}[1h])) > 1000 for: 5m labels: rotation: common From 16f02db3eedd991491a577f57f19e23c10d3b8ad Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Thu, 25 May 2023 23:01:42 -0700 Subject: [PATCH 3/4] maybe? 
--- charts/posthog/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index 85f9f7a1..2656d55d 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -2426,8 +2426,8 @@ prometheus: description: "The `session_recording_events_dlq` topic offset has increased over the past 5 minutes." - alert: GraphileJobExecutionLag - expr: last_over_time((posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"}[1h])) > 1000 - for: 5m + expr: (max by(task_identifier) (posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"})) > 900 + for: 10m labels: rotation: common severity: critical From 55b794911aef8985648d5d3d7b945414dad3876f Mon Sep 17 00:00:00 2001 From: James Greenhill Date: Thu, 25 May 2023 23:06:39 -0700 Subject: [PATCH 4/4] expression update: hold the last known lag for 1h so missing data does not resolve the page --- charts/posthog/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/posthog/values.yaml b/charts/posthog/values.yaml index 2656d55d..c393fda4 100644 --- a/charts/posthog/values.yaml +++ b/charts/posthog/values.yaml @@ -2426,7 +2426,7 @@ prometheus: description: "The `session_recording_events_dlq` topic offset has increased over the past 5 minutes." - alert: GraphileJobExecutionLag - expr: (max by(task_identifier) (posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"})) > 900 + expr: (max by(task_identifier) (last_over_time(posthog_celery_graphile_lag_seconds{task_identifier!="bufferJob"}[1h]))) > 900 for: 10m labels: rotation: common