diff --git a/config/datadog/README.md b/config/datadog/README.md index 34f4887..5b58533 100644 --- a/config/datadog/README.md +++ b/config/datadog/README.md @@ -27,10 +27,10 @@ cd examples/otel ### Step 2: Configure Docker Compose and OpenTelemetry Collector #### datadog-docker-compose.yml -The [datadog-docker-compose.yml](https://github.com/aerospike/aerospike-monitoring/blob/master/examples/otel/datadog-docker-compose.yml) file contains services such as `aerospike-prometheus-exporter` and `otel-collector` that require specific environment configurations and volume mounts. +The [datadog-docker-compose.yml](../../examples/otel/datadog-docker-compose.yml) file contains services such as `aerospike-prometheus-exporter` and `otel-collector` that require specific environment configurations and volume mounts. #### datadog-otel-collector-config.yml -The [datadog-otel-collector-config.yml](https://github.com/aerospike/aerospike-monitoring/blob/master/examples/otel/datadog-otel-collector-config.yml) configures the OpenTelemetry Collector with receivers, processors, exporters, and service pipelines for handling traces. +The [datadog-otel-collector-config.yml](../../examples/otel/datadog-otel-collector-config.yml) configures the OpenTelemetry Collector with receivers, processors, exporters, and service pipelines for handling traces. **Important:** Update the `datadog-api-site` and `datadog-api-key` in this configuration file to match your Datadog account details. ![OpenTelemetry Collector API Config](assets/otel-collector-api-config.png) @@ -97,24 +97,24 @@ In the **"New Dashboard"** screen, click on the **"Configure"** option on the to ![Datadog Dashboard Import](assets/datadog-dashbaord-import.png) --- -## Creating Bulk Monitoring Alerts in Datadog +## Creating Monitors/Alerts in Datadog -To create bulk monitoring alerts in Datadog, you can use the provided **Python script**. This script reads multiple monitor configurations from a JSON file and creates monitors via the **Datadog API**. +To create Monitors/Alerts in Datadog, you can use the provided **Python script**. This script reads multiple monitor configurations from a JSON file and creates monitors via the **Datadog API**. ### Prerequisites - **Datadog API Key** and **Application Key** are required. - Python must be installed on your machine. -### Steps to Create Bulk Alerts +### Steps to Create Monitors -The `datadog_alerts_creation.py` script reads alert rules from a JSON file and creates corresponding monitors in Datadog using the Datadog API. +The `datadog_monitors_creation.py` script reads alert rules from a JSON file and creates corresponding monitors in Datadog using the Datadog API. #### Important -- `api_key` and `app_key`: These should be updated with your actual **Datadog API** and **Application** keys. -- `datadog_site`: Update this with the site where your **Datadog** account is hosted. -- `aerospike_rules.json`: This JSON file contains multiple monitor configurations, such as monitor name, type, query, and message. +- `api_key` and `app_key`: During runtime, you will be prompted to enter your actual Datadog API Key and Datadog Application Key. +- `datadog_site`: During runtime, you will also be prompted to enter the Datadog site where your account is hosted (e.g., `datadoghq.com`, `us5.datadoghq.com`). +- `aerospike_datadog_monitors.json`: This JSON file contains multiple monitor configurations, such as monitor name, type, query, and message. - Make sure to adjust the **thresholds** according to your requirements. ### Run the Python Script @@ -122,7 +122,7 @@ The `datadog_alerts_creation.py` script reads alert rules from a JSON file and c After updating the necessary configurations, run the script using the following command: ```bash -python datadog_alerts_creation.py +python3 datadog_monitors_creation.py ``` --- diff --git a/config/datadog/dashboards/usecases/rolling_restarts.json b/config/datadog/dashboards/usecases/rolling_restarts.json index e9e70a2..37a3c70 100644 --- a/config/datadog/dashboards/usecases/rolling_restarts.json +++ b/config/datadog/dashboards/usecases/rolling_restarts.json @@ -139,7 +139,6 @@ "title": "Dead", "title_size": "16", "title_align": "left", - "time": {}, "type": "query_value", "requests": [ { @@ -193,7 +192,6 @@ "title": "Unavailable", "title_size": "16", "title_align": "left", - "time": {}, "type": "query_value", "requests": [ { @@ -567,7 +565,6 @@ "title": "Quiesces Pending", "title_size": "16", "title_align": "left", - "time": {}, "type": "query_value", "requests": [ { @@ -679,7 +676,6 @@ "title": "HWM breached", "title_size": "16", "title_align": "left", - "time": {}, "type": "query_value", "requests": [ { @@ -1158,7 +1154,7 @@ { "id": 3296083092942762, "definition": { - "title": "XDR Lag (rate)", + "title": "XDR Lag", "title_size": "16", "title_align": "left", "show_legend": true, @@ -1168,20 +1164,21 @@ "min", "max" ], + "time": {}, "type": "timeseries", "requests": [ { "formulas": [ { "alias": "XDR Lag", - "formula": "per_second(query1)" + "formula": "query1" } ], "queries": [ { "data_source": "metrics", "name": "query1", - "query": "sum:aerospike.aerospike_xdr_lag{$aerospike_cluster,$aerospike_service,$ns} by {aerospike_cluster,aerospike_service}.rollup(max, 30)" + "query": "sum:aerospike.aerospike_xdr_lag{$aerospike_cluster,$aerospike_service,$ns} by {aerospike_cluster,aerospike_service}" } ], "response_format": "timeseries", @@ -1228,7 +1225,6 @@ "min", "max" ], - "time": {}, "type": "timeseries", "requests": [ { @@ -1583,7 +1579,6 @@ "min", "max" ], - "time": {}, "type": "timeseries", "requests": [ { @@ -2346,7 +2341,7 @@ { "id": 5261966586423734, "definition": { - "title": "HWM Breaches (rate) (total)", + "title": "HWM Breaches (total)", "title_size": "16", "title_align": "left", "show_legend": true, @@ -2356,20 +2351,21 @@ "min", "max" ], + "time": {}, "type": "timeseries", "requests": [ { "formulas": [ { "alias": "HWM_Breached", - "formula": "per_second(query1)" + "formula": "query1" } ], "queries": [ { "data_source": "metrics", "name": "query1", - "query": "sum:aerospike.aerospike_namespace_hwm_breached{$aerospike_cluster,$aerospike_service,$ns} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)" + "query": "sum:aerospike.aerospike_namespace_hwm_breached{$aerospike_cluster,$aerospike_service,$ns} by {aerospike_cluster,aerospike_service,ns}" } ], "response_format": "timeseries", @@ -2644,7 +2640,7 @@ "x": 0, "y": 9, "width": 12, - "height": 1 + "height": 11 } }, { @@ -2820,7 +2816,7 @@ }, "layout": { "x": 0, - "y": 10, + "y": 20, "width": 12, "height": 1, "is_column_break": true @@ -3075,7 +3071,7 @@ }, "layout": { "x": 0, - "y": 11, + "y": 21, "width": 12, "height": 1 } @@ -3416,7 +3412,7 @@ }, "layout": { "x": 0, - "y": 12, + "y": 22, "width": 12, "height": 1 } @@ -3451,4 +3447,4 @@ "layout_type": "ordered", "notify_list": [], "reflow_type": "fixed" -} \ No newline at end of file +} diff --git a/config/datadog/monitors/aerospike_datadog_monitors.json b/config/datadog/monitors/aerospike_datadog_monitors.json new file mode 100644 index 0000000..d457624 --- /dev/null +++ b/config/datadog/monitors/aerospike_datadog_monitors.json @@ -0,0 +1,660 @@ +[ +{ + "name": "AerospikeNodeDown", + "type": "query alert", + "query": "min(last_1m):sum:aerospike.aerospike_node_up{*} by {aerospike_cluster,aerospike_service} == 0", + "message": "Node {{aerospike_service}} is down. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "NamespaceStopWrites", + "type": "query alert", + "query": "max(last_1m):max:aerospike.aerospike_namespace_stop_writes{*} by {aerospike_cluster,aerospike_service,ns} == 1", + "message": "Used disk space for namespace {{ns}} in node {{aerospike_service}} is above stop writes limit. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeAllFlashAverageObjectsPerSprig", + "type": "query alert", + "query": "max(last_1m):((sum:aerospike.aerospike_namespace_master_objects{*} by {aerospike_cluster,aerospike_service,ns} / 4096) / sum:aerospike.aerospike_namespace_partition_tree_sprigs{*} by {aerospike_cluster,aerospike_service,ns}) > 50", + "message": "Average objects per sprig has been breached for namespace {{ns}} in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 50 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeIndexStageSize", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_index_stage_size{*} by {aerospike_cluster,aerospike_service,ns} > 4000000000", + "message": "Index stage size configuration is not configured according to documentation in {{ns}} from node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 4000000000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeSIndexStageSize", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_sindex_stage_size{*} by {aerospike_cluster,aerospike_service,ns} > 4000000000", + "message": "Sindex stage size configuration is not configured according to documentation in {{ns}} in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 4000000000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeIndexPressureDirtyMemory", + "type": "query alert", + "query": "max(last_1m):((sum:aerospike.aerospike_namespace_index_pressure_dirty_memory{*} by {aerospike_cluster,aerospike_service} / sum:aerospike.aerospike_namespace_index_pressure_total_memory{*} by {aerospike_cluster,aerospike_service}) * 100) > 10000000", + "message": "Dirty memory ratio against the total memory is above configured limit in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 10000000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ActiveProxies", + "type": "query alert", + "query": "sum(last_2m):per_second(sum:aerospike.aerospike_namespace_client_proxy_complete{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_client_proxy_timeout{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_client_proxy_error{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_batch_sub_proxy_complete{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_batch_sub_proxy_timeout{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_batch_sub_proxy_error{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) > 0", + "message": "Node is doing proxy. Proxies can happen during cluster change / migrations or if there are any network issues. Active proxies detected for {{ns}} on node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "NamespaceSupervisorFallingBehind", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_objects{*} by {aerospike_cluster,aerospike_service,ns} > 0", + "message": "There seems some lag falling behind and/or display the length of time the most recent NSUP cycle lasted {{ns}} in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "NamespaceSupervisorFallingBehinds", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_nsup_cycle_deleted_pct{*} by {aerospike_cluster,aerospike_service,ns} > 1", + "message": "There seems some lag falling behind and/or display the length of time the most recent NSUP cycle lasted {{ns}} in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "HwmBreached", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_hwm_breached{*} by {aerospike_cluster,aerospike_service,ns} == 1", + "message": "high-water-disk-pct or high-water-memory-pct has been breached for namespace {{ns}} in node {{aerospike_service}}. Eviction may start to recover disk space. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "LowDataAvailable", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_data_avail_pct{*} by {aerospike_cluster,aerospike_service,ns} < 25", + "message": "Device available has dropped below the threshold value for namespace {{ns}} in node {{aerospike_service}}. May indicate that defrag is unable to keep up with the current load, and may result in stop writes if it continues to drop. The current value is {{value}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 25, + "warning": 55 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClientTimeouts", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_namespace_client_read_timeout{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_client_write_timeout{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) + per_second(sum:aerospike.aerospike_namespace_client_tsvc_timeout{*} by {aerospike_cluster,aerospike_service,ns}.rollup(max, 30)) > 1", + "message": "Client connections timing out at a rate greater than 1/s. Timeouts can occur during network issues or resource contention on the client and/or server nodes {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "DeviceWriteQ", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_storage_engine_device_write_q{*} by {aerospike_cluster,aerospike_service,ns} > 1", + "message": "Device write queue is greater than 1 for namespace {{ns}} on device {{storage_engine}} in node {{aerospike_service}}. May indicate underperforming storage subsystem or hotkeys. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ShadowDeviceWriteQ", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_storage_engine_device_shadow_write_q{*} by {aerospike_cluster,aerospike_service,ns,storage_engine} > 1", + "message": "Shadow device write queue is greater than 1 for namespace {{ns}} on device {{storage_engine}} in node {{aerospike_service}}. May indicate underperforming storage subsystem or hotkeys. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "DeviceDefragQ", + "type": "query alert", + "query": "max(last_5m):sum:aerospike.aerospike_namespace_storage_engine_device_defrag_q{*} by {aerospike_cluster,aerospike_service,ns,storage_engine} > 1000", + "message": "Device Defrag queue has been above 1000 for more than 5m for namespace {{ns}} on device {{storage_engine}} in node {{aerospike_service}}. May indicate underperforming storage subsystem or hotkeys. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClockSkewStopWrites", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_clock_skew_stop_writes{*} by {aerospike_cluster,aerospike_service,ns} == 1", + "message": "Clock has skewed for namespace {{ns}} in node {{aerospike_service}}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 1 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "UnavailablePartitions", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_unavailable_partitions{*} by {aerospike_cluster,aerospike_service,ns} > 0", + "message": "Some partitions are not available for namespace {{ns}} on node {{aerospike_service}}. Check for network issues and make sure the cluster forms properly. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "DeadPartitions", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_dead_partitions{*} by {aerospike_cluster,aerospike_service,ns} > 2", + "message": "Some partitions are dead for namespace {{ns}} on node {{aerospike_service}}. Greater than replication-factor number nodes had an unclean shutdown, and there may be data loss. Will require the use of the revive command to make the partitions available again. ", + "tags": [], + "options": { + "thresholds": { + "critical": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "NamespaceDataCloseToStopWrites", + "type": "query alert", + "query": "max(last_1m):((sum:aerospike.aerospike_namespace_data_avail_pct{*} by {aerospike_cluster,aerospike_service,ns}) - (sum:aerospike.aerospike_namespace_storage_engine_stop_writes_avail_pct{*} by {aerospike_cluster,aerospike_service,ns})) <= 10", + "message": "data_avail_pct for namespace {{ns}} in node {{aerospike_service}} is close to stop-writes-avail-pct limit. ", + "tags": [], + "options": { + "thresholds": { + "critical": 10 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "HighDataUseNamespace", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_namespace_data_used_pct{*} by {aerospike_cluster,aerospike_service,ns} > 85", + "message": "Data used has crossed above the threshold value for namespace {{ns}} in node {{aerospike_service}}. May indicate a need to reduce the object count or increase capacity and the current data used by the namespace is {{value}} . ", + "tags": [], + "options": { + "thresholds": { + "critical": 85, + "warning": 80 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "BestPracticesFailure", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_failed_best_practices{*} by {aerospike_cluster,aerospike_service} > 0", + "message": "Best Practices check failed on {{aerospike_service}} in cluster {{aerospike_cluster}} ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClusterSize", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_cluster_size{*} by {aerospike_cluster,aerospike_service} < ", + "message": "Cluster size mismatch for node {{aerospike_service}} ", + "tags": [], + "options": { + "thresholds": { + "critical": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClientConnections", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_cluster_size{*} by {aerospike_cluster,aerospike_service} > 10000", + "message": "The number of client connections is greater than the expected peak. The current client connection count is {{value}} in node {{aerospike_service}} . ", + "tags": [], + "options": { + "thresholds": { + "critical": 10000, + "warning": 11 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClientConnectionOpened", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_node_stats_client_connections_opened{*} by {aerospike_cluster,aerospike_service}.rollup(max, 30)) > 100", + "message": "Client connections are being opened at a rate greater than 100/s. Connection churn can increase latency and client timeouts which in turn cause the client to open more connections. ", + "tags": [], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClientConnectionClosed", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_node_stats_client_connections_closed{*} by {aerospike_cluster,aerospike_service}.rollup(max, 30)) > 100", + "message": "Client connections are being closed at a rate greater than 100/s. Connection churn can increase latency and client timeouts which in turn cause the client to open more connections. ", + "tags": [], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "ClockSkew", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_cluster_clock_skew_ms{*} by {aerospike_cluster,aerospike_service} > 20000", + "message": "Current maximum clock skew between nodes - will trigger stop writes when it exceeds the threshold if nsup-period is non-zero. The current clock skew is {{value}} ms ", + "tags": [], + "options": { + "thresholds": { + "critical": 20000, + "warning": 2000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "LowMemorySystem", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_system_free_mem_pct{*} by {aerospike_cluster,aerospike_service} < 10", + "message": "Total memory free has dropped below the threshold value for node {{aerospike_service}} and the current value is {{value}}% ", + "tags": [], + "options": { + "thresholds": { + "critical": 10, + "warning": 20 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "HeapEfficiency", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_heap_efficiency_pct{*} by {aerospike_cluster,aerospike_service} < 60", + "message": "Heap efficiency for node {{aerospike_service}} has dropped below 60%. ", + "tags": [], + "options": { + "thresholds": { + "critical": 60 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "RwInProgress", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_node_stats_rw_in_progress{*} by {aerospike_cluster,aerospike_service,ns,storage_engine} > 100", + "message": "Read/write queue is greater than 100 for namespace {{ns}} on device {{storage_engine}} in node {{aerospike_service}}. May indicate underperforming storage subsystem or hotkeys. ", + "tags": [], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "NamespaceSetQuota", + "type": "query alert", + "query": "max(last_1m):(((sum:aerospike.aerospike_sets_data_used_bytes{*} by {aerospike_cluster,aerospike_service}) / (sum:aerospike.aerospike_sets_stop_writes_size{*} by {aerospike_cluster,aerospike_service})) * 100) > 99", + "message": "One of your nodes is at {{value}} % of the quota you have configured on the set. ", + "tags": [], + "options": { + "thresholds": { + "critical": 99, + "warning": 80 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRTimelag", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_xdr_lag{*} by {aerospike_cluster,aerospike_service,ns,dc} > 5", + "message": "XDR lag for namespace exceeding 5 second(s) from node {{aerospike_service}} to DC {{dc}}. XDR lag may be due to network connectivity issues, inability for the source to keep up with incoming writes, or write failures at the destination. ", + "tags": [], + "options": { + "thresholds": { + "critical": 5 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRAbandonedRecords", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_xdr_abandoned{*} by {aerospike_cluster,aerospike_service,ns,dc}.rollup(max, 30)) > 0", + "message": "Abandoned records detected for XDR on node {{aerospike_service}} to DC {{dc}} .Records abandoned at a destination cluster may indicate a configuration mismatch for the namespace between source and destination. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRRetryNoNodes", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_xdr_retry_no_node{*} by {aerospike_cluster,aerospike_service,ns,dc}.rollup(max, 30)) > 0", + "message": "XDR retries occuring on node {{aerospike_service}} to DC {{dc}} due to unkown master node destination. XDR cannot determine which destination node is the master. ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRRetryConnReset", + "type": "query alert", + "query": "max(last_2m):per_second(sum:aerospike.aerospike_xdr_retry_conn_reset{*} by {aerospike_cluster,aerospike_service,ns,dc}.rollup(max, 30)) > 2", + "message": "Rate of XDR connections resets greater than 2/s from {{aerospike_service}} to DC {{dc}} ,XDR retries occuring due to due to timeouts, network problems, or destination node restarts. ", + "tags": [], + "options": { + "thresholds": { + "critical": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRRetryDest", + "type": "query alert", + "query": "max(last_1m):per_second(sum:aerospike.aerospike_xdr_retry_dest{*} by {aerospike_cluster,aerospike_service,ns,dc}.rollup(max, 30)) > 5", + "message": "Increase in XDR write retries is greater than 5/s from {{aerospike_service}} to DC {{dc}}, XDR retries due to errors returned by the destination node, u.e. key busy or device overload. ", + "tags": [], + "options": { + "thresholds": { + "critical": 5 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRLatency", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_xdr_latency_ms{*} by {aerospike_cluster,aerospike_service,ns,dc} > 100", + "message": "XDR latency above 100ms from {{aerospike_service}} to DC {{dc}} and it is higher than the expected. ", + "tags": [], + "options": { + "thresholds": { + "critical": 100 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRLap", + "type": "query alert", + "query": "max(last_1m):sum:aerospike.aerospike_xdr_lap_us{*} by {aerospike_cluster,aerospike_service,ns,dc} > 75000", + "message": "XDR lap time greater than 75000 microseconds from {{aerospike_service}} to DC {{dc}}, The XDR processing cycle time (lap_us) is approaching the configured period-ms value. ", + "tags": [], + "options": { + "thresholds": { + "critical": 75000 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "XDRRecoveries", + "type": "query alert", + "query": "max(last_2m):per_second(sum:aerospike.aerospike_xdr_recoveries{*} by {aerospike_cluster,aerospike_service,ns,dc}.rollup(max, 30)) > 0", + "message": "XDR recoveries increasing on {{aerospike_service}} to DC {{dc}} , XDR recoveries happen during reind or may indicate that the in-memory transaction queue is full (the transaction-queue-limit may be too small). ", + "tags": [], + "options": { + "thresholds": { + "critical": 0 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeReadLatency(<4ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_read_ms_bucket{le:4} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Read latency < 4ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeReadLatency(<8ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_read_ms_bucket{le:8} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Read latency < 8ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeReadLatency(<64ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_read_ms_bucket{le:64} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Read latency < 64ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeReadLatency(<512ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_read_ms_bucket{le:512} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Read latency < 512ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeWriteLatency(<4ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_write_ms_bucket{le:4} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Write latency < 4ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeWriteLatency(<8ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_write_ms_bucket{le:8} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Write latency < 8ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeWriteLatency(<64ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_write_ms_bucket{le:64} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Write latency < 64ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +}, +{ + "name": "AerospikeWriteLatency(<512ms)", + "type": "query alert", + "query": "max(last_1m):(avg:aerospike.aerospike_latencies_write_ms_bucket{le:512} by {aerospike_cluster,aerospike_service,ns}) > 16", + "message": "Write latency < 512ms bucket is exceeds the threshold value for namespace {{ns}} in node {{aerospike_service}}. The current value is {{ value }}. ", + "tags": [], + "options": { + "thresholds": { + "critical": 16, + "warning": 2 + }, + "notify_audit": true, + "include_tags": true + } +} +] diff --git a/config/datadog/monitors/datadog_monitors_creation.py b/config/datadog/monitors/datadog_monitors_creation.py new file mode 100644 index 0000000..a574c1d --- /dev/null +++ b/config/datadog/monitors/datadog_monitors_creation.py @@ -0,0 +1,32 @@ +import json +import requests + +# Prompt user for API key, Application key, and Datadog site +api_key = input("Enter your Datadog API Key: ").strip() +app_key = input("Enter your Datadog Application Key: ").strip() +datadog_site = input("Enter your Datadog site (e.g., datadoghq.com): ").strip() + +# Load the alert rules from the JSON file +with open('aerospike_datadog_monitors.json') as f: + monitors = json.load(f) + +for monitor in monitors: + + url = f'https://{datadog_site}/api/v1/monitor' + + # Send the POST request to create the monitor + response = requests.post( + url, + headers={ + 'Content-Type': 'application/json', + 'DD-API-KEY': api_key, + 'DD-APPLICATION-KEY': app_key + }, + json=monitor + ) + + if response.status_code == 200: + print("Monitor created successfully:", response.json()) + else: + print("Failed to create monitor. Status code:", response.status_code) + print("Response:", response.json())