-
-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(backpressure): Add Service monitoring (#50928)
This changes the existing queue-based monitoring to a more generic `service` monitoring. Services are organized in a hierarchy of base services and the consumers that depend on them. The main monitoring loop checks the health of the different base services and aggregates that health for the consumers. The consumer health is then persisted and queried. The settings around monitoring and checking are also streamlined, and a new setting for high watermarks replaces the existing queue-size based options. fixes https://github.com/getsentry/team-processing/issues/56 fixes https://github.com/getsentry/team-processing/issues/55 --------- Co-authored-by: Sebastian Zivota <[email protected]>
- Loading branch information
1 parent
904a049
commit 13ef106
Showing
14 changed files
with
477 additions
and
468 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
from typing import Mapping | ||
|
||
import sentry_sdk | ||
from django.conf import settings | ||
|
||
from sentry import options | ||
from sentry.processing.backpressure.topology import CONSUMERS | ||
from sentry.utils import redis | ||
|
||
|
||
def _prefix_key(key_name: str) -> str: | ||
return f"bp1:{key_name}" | ||
|
||
|
||
# Key fragment shared by all per-consumer health flags; the consumer name is
# appended to it and the result is namespaced by `_prefix_key`.
HEALTHY_KEY_NAME = "consumer_is_healthy"
|
||
|
||
def _unhealthy_consumer_key(name: str) -> str:
    """Build the Redis key under which the health flag of consumer `name` is stored.

    Despite the function's name, the key holds the consumer's *health* flag
    (`"true"` / `"false"`), not an "unhealthy" marker.
    """
    consumer_key = f"{HEALTHY_KEY_NAME}:{name}"
    return _prefix_key(consumer_key)
|
||
|
||
# Redis client used to persist and look up consumer health flags. It is
# resolved once, at import time, from the cluster name configured in
# `settings.SENTRY_SERVICE_MONITORING_REDIS_CLUSTER`.
service_monitoring_cluster = redis.redis_clusters.get(
    settings.SENTRY_SERVICE_MONITORING_REDIS_CLUSTER
)
|
||
|
||
def is_consumer_healthy(consumer_name: str = "default") -> bool:
    """Check whether the given consumer is currently flagged healthy in Redis.

    Returns `True` unconditionally while the `backpressure.checking.enabled`
    option is off. Otherwise the consumer is healthy only if its health key
    holds the string `"true"`.

    NB: Any other stored value compares unequal to `"true"`, so a consumer
    whose key is not found in Redis (never recorded, or expired) is treated as
    *unhealthy*. A failure to query Redis is reported to Sentry and is also
    treated as unhealthy.
    """
    if not options.get("backpressure.checking.enabled"):
        return True

    # Look up the persisted health flag; only the Redis round-trip is guarded.
    try:
        recorded = service_monitoring_cluster.get(_unhealthy_consumer_key(consumer_name))
    except Exception as e:
        sentry_sdk.capture_exception(e)
        # By default it's considered unhealthy
        return False

    # NOTE(review): assumes the client decodes responses to `str`; a `bytes`
    # reply would never equal "true" -- confirm against the cluster config.
    return recorded == "true"
|
||
|
||
def record_consumer_health(service_health: Mapping[str, bool]) -> None:
    """Derive each consumer's health from its dependencies and persist it to Redis.

    A consumer listed in `CONSUMERS` is healthy only if every service it
    depends on is healthy according to `service_health`. The resulting
    `"true"` / `"false"` flag is written with an expiry taken from the
    `backpressure.status_ttl` option; all writes go through one pipeline.

    Raises `KeyError` if a dependency that needs to be consulted is absent
    from `service_health`.
    """
    with service_monitoring_cluster.pipeline() as pipe:
        ttl = options.get("backpressure.status_ttl")

        for consumer, required_services in CONSUMERS.items():
            # `all` stops at the first unhealthy dependency, just like a
            # chained `and` would.
            healthy = all(service_health[dep] for dep in required_services)
            flag = "true" if healthy else "false"
            pipe.set(_unhealthy_consumer_key(consumer), flag, ex=ttl)

        pipe.execute()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from dataclasses import dataclass | ||
from typing import Any, Generator, Mapping, Union | ||
|
||
import requests | ||
from redis import Redis | ||
from rediscluster import RedisCluster | ||
|
||
|
||
@dataclass
class ServiceMemory:
    """Memory usage of a single server of a service.

    `percentage` is derived from the other two fields as `used / available`.
    Despite its name it is a plain ratio and is not multiplied by 100.
    Construction raises `ZeroDivisionError` when `available` is 0.
    """

    used: int
    available: int
    percentage: float

    # A hand-written initializer takes precedence over the dataclass-generated
    # one, so callers pass only `used` and `available`.
    def __init__(self, used: int, available: int):
        self.used, self.available = used, available
        self.percentage = used / available
|
||
|
||
def query_rabbitmq_memory_usage(host: str, *, timeout: float = 3.0) -> ServiceMemory:
    """Returns the currently used memory and the memory limit of a
    RabbitMQ host.

    Queries `<host>/api/nodes` and builds the result from the `mem_used` and
    `mem_limit` fields of the *first* node in the response; any further nodes
    are ignored.

    :param host: Base URL of the RabbitMQ HTTP API; a trailing slash is
        optional.
    :param timeout: Seconds to wait for the HTTP request before giving up.
    :raises requests.RequestException: on connection failure, timeout, or a
        non-success HTTP status.
    """
    if not host.endswith("/"):
        host += "/"
    url = f"{host}api/nodes"

    # `requests` applies no timeout by default; without one, an unresponsive
    # host would block the caller (the monitoring loop) indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    first_node = response.json()[0]
    return ServiceMemory(first_node["mem_used"], first_node["mem_limit"])
|
||
|
||
# Type of the Redis clients whose memory usage can be inspected.
# Based on configuration, this could be:
# - a `rediscluster` Cluster (actually `RetryingRedisCluster`)
# - a straight `Redis` client (actually `FailoverRedis`)
# - or any class configured via `client_class`.
# It could in theory also be a `rb` (aka redis blaster) Cluster, but we
# intentionally do not support these.
Cluster = Union[RedisCluster, Redis]
|
||
|
||
def get_memory_usage(info: Mapping[str, Any]) -> ServiceMemory:
    """Turn a single node's `INFO` mapping into a `ServiceMemory`.

    Used memory is read from `used_memory` (defaulting to 0). The available
    memory is `maxmemory` when that is set and non-zero, otherwise
    `total_system_memory`; a `KeyError` is raised if that fallback is needed
    but absent.
    """
    # or alternatively: `used_memory_rss`?
    used = info.get("used_memory", 0)

    limit = info.get("maxmemory", 0)
    if not limit:
        # `maxmemory` might be 0 in development
        limit = info["total_system_memory"]

    return ServiceMemory(used, limit)
|
||
|
||
def iter_cluster_memory_usage(cluster: Cluster) -> Generator[ServiceMemory, None, None]:
    """
    A generator that yields the `ServiceMemory` derived from the redis `INFO`
    result of each node in the `cluster`.
    """
    if not isinstance(cluster, RedisCluster):
        # not a `RedisCluster`: lets just hope that `info()` does the right
        # thing and describes a single node
        yield get_memory_usage(cluster.info())
        return

    # `RedisCluster` returns these as a dictionary, with the node-id as key
    for node_info in cluster.info().values():
        yield get_memory_usage(node_info)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import time | ||
from dataclasses import dataclass | ||
from typing import Dict, Generator, List, Mapping, Union | ||
|
||
from django.conf import settings | ||
|
||
from sentry import options | ||
from sentry.processing.backpressure.health import record_consumer_health | ||
|
||
# from sentry import options | ||
from sentry.processing.backpressure.memory import ( | ||
Cluster, | ||
ServiceMemory, | ||
iter_cluster_memory_usage, | ||
query_rabbitmq_memory_usage, | ||
) | ||
from sentry.processing.backpressure.topology import PROCESSING_SERVICES | ||
from sentry.utils import redis | ||
|
||
|
||
@dataclass
class Redis:
    """A monitored service backed by a Redis client or cluster."""

    # Client whose node(s) are queried for memory usage.
    cluster: Cluster
|
||
|
||
@dataclass
class RabbitMq:
    """A monitored service backed by one or more RabbitMQ servers."""

    # One entry per server; each is passed to `query_rabbitmq_memory_usage`.
    servers: List[str]
|
||
|
||
# A monitored base service. `None` stands for a service that is defined but
# has neither a `redis` nor a `rabbitmq` backend configured; such a service
# yields no memory readings.
Service = Union[Redis, RabbitMq, None]
|
||
|
||
def check_service_memory(service: Service) -> Generator[ServiceMemory, None, None]:
    """
    Query the given [`Service`] and yield one [`ServiceMemory`] per individual
    server that makes up the service.

    Nothing is yielded for a service that is neither [`Redis`] nor [`RabbitMq`]
    (e.g. `None`).
    """
    if isinstance(service, RabbitMq):
        # `map` is lazy, so each server is only queried when it is consumed
        yield from map(query_rabbitmq_memory_usage, service.servers)

    elif isinstance(service, Redis):
        yield from iter_cluster_memory_usage(service.cluster)
|
||
|
||
def load_service_definitions() -> Dict[str, Service]:
    """Build the `Service` objects described by `settings.SENTRY_PROCESSING_SERVICES`.

    For each definition, a truthy `redis` entry (a cluster id) takes
    precedence and yields a `Redis` service; otherwise a truthy `rabbitmq`
    entry yields a `RabbitMq` service; otherwise the service maps to `None`.
    """

    def _build(definition) -> Service:
        redis_cluster_id = definition.get("redis")
        if redis_cluster_id:
            return Redis(redis.redis_clusters.get(redis_cluster_id))

        rabbitmq_servers = definition.get("rabbitmq")
        if rabbitmq_servers:
            return RabbitMq(rabbitmq_servers)

        return None

    return {
        name: _build(definition)
        for name, definition in settings.SENTRY_PROCESSING_SERVICES.items()
    }
|
||
|
||
def assert_all_services_defined(services: Dict[str, Service]) -> None:
    """Ensure every name in `PROCESSING_SERVICES` has an entry in `services`.

    Raises `ValueError` naming the first missing service encountered.
    """
    for name in PROCESSING_SERVICES:
        if name in services:
            continue
        raise ValueError(
            f"The `{name}` Service is missing from `settings.SENTRY_PROCESSING_SERVICES`."
        )
|
||
|
||
def check_service_health(services: Mapping[str, Service]) -> Mapping[str, bool]:
    """Compute the health of every base service from its memory usage.

    A service is healthy when the `percentage` (used / available ratio) of
    every one of its servers is below the service's entry in the
    `backpressure.high_watermarks` option. A service with no servers to
    query (e.g. `None`) is healthy.

    If querying a service fails, the error is logged and that service is
    reported as unhealthy instead of letting the exception escape; the
    remaining services are still checked.

    :raises KeyError: if `backpressure.high_watermarks` has no entry for one
        of the services (a configuration error, deliberately not suppressed).
    """
    # Function-scope import so this block does not rely on a module-level one.
    import logging

    logger = logging.getLogger(__name__)
    high_watermarks = options.get("backpressure.high_watermarks")
    service_health = {}

    for name, service in services.items():
        high_watermark = high_watermarks[name]
        try:
            # `all` stops querying further servers once one is over the
            # watermark; the result is the same as checking every server.
            is_healthy = all(
                memory.percentage < high_watermark
                for memory in check_service_memory(service)
            )
        except Exception:
            # Fail safe: a service whose state cannot be determined must not
            # be reported as healthy, and one bad service must not abort the
            # whole check.
            logger.exception("Failed to check memory usage of service `%s`", name)
            is_healthy = False

        service_health[name] = is_healthy

    return service_health
|
||
|
||
def start_service_monitoring() -> None:
    """Run the service monitoring loop; never returns.

    Loads and validates the service definitions once, then loops forever:
    while the `backpressure.monitoring.enabled` option is on, each iteration
    checks the base services and records the derived consumer health. Every
    iteration, enabled or not, ends by sleeping for
    `backpressure.monitoring.interval`.

    Raises `ValueError` at startup if a required service is not defined.
    """
    services = load_service_definitions()
    assert_all_services_defined(services)

    while True:
        if options.get("backpressure.monitoring.enabled"):
            # first, check each base service and record its health
            base_health = check_service_health(services)

            # then, check the derived services and record their health
            record_consumer_health(base_health)

        # options are re-read on every iteration
        time.sleep(options.get("backpressure.monitoring.interval"))
Oops, something went wrong.