Skip to content

Commit

Permalink
Collect postgres checksums metrics (#17203)
Browse files Browse the repository at this point in the history
  • Loading branch information
sethsamuel authored Mar 29, 2024
1 parent 19f57e3 commit de516ff
Show file tree
Hide file tree
Showing 14 changed files with 67 additions and 1 deletion.
6 changes: 6 additions & 0 deletions postgres/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,12 @@ files:
value:
type: boolean
example: true
- name: collect_checksum_metrics
description: Collect counts of database failed checksums. Only supported on versions >= 12.
value:
type: boolean
example: false
display_default: false
- name: collect_activity_metrics
description: |
Collect metrics regarding transactions from pg_stat_activity. Please make sure the user
Expand Down
1 change: 1 addition & 0 deletions postgres/changelog.d/17203.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added the `collect_checksum_metrics` option to collect counts of failed checksums for Postgres databases that have checksums enabled.
1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(self, instance, init_config):
# Default value for `count_metrics` is True for backward compatibility
self.collect_count_metrics = is_affirmative(instance.get('collect_count_metrics', True))
self.collect_activity_metrics = is_affirmative(instance.get('collect_activity_metrics', False))
self.collect_checksum_metrics = is_affirmative(instance.get('collect_checksum_metrics', False))
self.activity_metrics_excluded_aggregations = instance.get('activity_metrics_excluded_aggregations', [])
self.collect_database_size_metrics = is_affirmative(instance.get('collect_database_size_metrics', True))
self.collect_wal_metrics = self._should_collect_wal_metrics(instance.get('collect_wal_metrics'))
Expand Down
4 changes: 4 additions & 0 deletions postgres/datadog_checks/postgres/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ def instance_collect_bloat_metrics():
return False


def instance_collect_checksum_metrics():
    """Return the default for the ``collect_checksum_metrics`` instance option: off."""
    return False


def instance_collect_count_metrics():
    """Return the default for the ``collect_count_metrics`` instance option: on."""
    return True

Expand Down
1 change: 1 addition & 0 deletions postgres/datadog_checks/postgres/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ class InstanceConfig(BaseModel):
azure: Optional[Azure] = None
collect_activity_metrics: Optional[bool] = None
collect_bloat_metrics: Optional[bool] = None
collect_checksum_metrics: Optional[bool] = None
collect_count_metrics: Optional[bool] = None
collect_database_size_metrics: Optional[bool] = None
collect_default_database: Optional[bool] = None
Expand Down
5 changes: 5 additions & 0 deletions postgres/datadog_checks/postgres/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,11 @@ instances:
#
# collect_count_metrics: true

## @param collect_checksum_metrics - boolean - optional - default: false
## Collect counts of database failed checksums. Only supported on versions >= 12.
#
# collect_checksum_metrics: false

## @param collect_activity_metrics - boolean - optional - default: false
## Collect metrics regarding transactions from pg_stat_activity. Please make sure the user
## has sufficient privileges to read from pg_stat_activity before enabling this option.
Expand Down
6 changes: 5 additions & 1 deletion postgres/datadog_checks/postgres/metrics_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ACTIVITY_METRICS_LT_8_3,
ACTIVITY_QUERY_10,
ACTIVITY_QUERY_LT_10,
CHECKSUM_METRICS,
COMMON_ARCHIVER_METRICS,
COMMON_BGW_METRICS,
COMMON_METRICS,
Expand All @@ -27,7 +28,7 @@
REPLICATION_METRICS_10,
REPLICATION_STATS_METRICS,
)
from .version_utils import V8_3, V9, V9_1, V9_2, V9_4, V9_6, V10, V14
from .version_utils import V8_3, V9, V9_1, V9_2, V9_4, V9_6, V10, V12, V14

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -86,6 +87,9 @@ def get_instance_metrics(self, version):
if self.config.collect_database_size_metrics:
self.instance_metrics.update(DATABASE_SIZE_METRICS)

if self.config.collect_checksum_metrics and version >= V12:
self.instance_metrics = dict(self.instance_metrics, **CHECKSUM_METRICS)

metrics = self.instance_metrics

res = {
Expand Down
11 changes: 11 additions & 0 deletions postgres/datadog_checks/postgres/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,17 @@ def _collect_stats(self, instance_tags):
with conn.cursor(cursor_factory=CommenterCursor) as cursor:
self._query_scope(cursor, archiver_instance_metrics, instance_tags, False)

# Report whether data checksums are enabled on the server: a constant-1 count
# whose `enabled:<true|false>` tag carries the actual state.
if self._config.collect_checksum_metrics and self.version >= V12:
    # SHOW queries need manual cursor execution so can't be bundled with the metrics
    with conn.cursor(cursor_factory=CommenterCursor) as cursor:
        cursor.execute("SHOW data_checksums;")
        enabled = cursor.fetchone()[0]
        # The conditional expression must be parenthesized: it binds looser than `+`,
        # so without the parentheses the disabled case yields the bare tag "false"
        # instead of "enabled:false".
        self.count(
            "postgresql.checksums.enabled",
            1,
            tags=self.tags_without_db + ["enabled:" + ("true" if enabled == "on" else "false")],
            hostname=self.resolved_hostname,
        )
if self._config.collect_activity_metrics:
activity_metrics = self.metrics_cache.get_activity_metrics(self.version)
with conn.cursor(cursor_factory=CommenterCursor) as cursor:
Expand Down
2 changes: 2 additions & 0 deletions postgres/datadog_checks/postgres/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,8 @@ def payload_pg_version(version):
'temp_files': ('postgresql.temp_files', AgentCheck.rate),
}

# Maps the `checksum_failures` column to its metric name and submission method.
CHECKSUM_METRICS = {
    'checksum_failures': (
        'postgresql.checksums.checksum_failures',
        AgentCheck.monotonic_count,
    ),
}

NEWER_14_METRICS = {
'session_time': ('postgresql.sessions.session_time', AgentCheck.monotonic_count),
'active_time': ('postgresql.sessions.active_time', AgentCheck.monotonic_count),
Expand Down
2 changes: 2 additions & 0 deletions postgres/metadata.csv
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ postgresql.bgwriter.maxwritten_clean,count,,,,The number of times the background
postgresql.bgwriter.sync_time,count,,millisecond,,The total amount of checkpoint processing time spent synchronizing files to disk.,0,postgres,bgw sync time,
postgresql.bgwriter.write_time,count,,millisecond,,The total amount of checkpoint processing time spent writing files to disk.,0,postgres,bgw wrt time,
postgresql.buffer_hit,gauge,,hit,second,"The number of times disk blocks were found in the buffer cache, preventing the need to read from the database. This metric is tagged with db.",1,postgres,buff hit,
postgresql.checksums.checksum_failures,count,,,,"The number of checksum failures in this database. This metric is tagged with db.",0,postgres,checksums,
postgresql.checksums.enabled,count,,,,"Whether database checksums are enabled. Value is always 1 and tagged with enabled:true or enabled:false. This metric is not tagged with db.",0,postgres,checksums.enabled,
postgresql.cluster_vacuum.heap_blks_scanned,gauge,,block,,"Number of heap blocks scanned. This counter only advances when the phase is seq scanning heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster blk_scanned,
postgresql.cluster_vacuum.heap_blks_total,gauge,,block,,"Total number of heap blocks in the table. This number is reported as of the beginning of seq scanning heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster blk_total,
postgresql.cluster_vacuum.heap_tuples_scanned,gauge,,,,"Number of heap tuples scanned. This counter only advances when the phase is seq scanning heap, index scanning heap or writing new heap. Only available with PostgreSQL 12 and newer. This metric is tagged with db, table, command, phase, index.",0,postgres,postgres cluster tuple_scanned,
Expand Down
8 changes: 8 additions & 0 deletions postgres/tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from datadog_checks.dev import get_docker_hostname
from datadog_checks.dev.docker import get_container_ip
from datadog_checks.postgres.util import (
CHECKSUM_METRICS,
NEWER_14_METRICS,
QUERY_PG_CONTROL_CHECKPOINT,
QUERY_PG_REPLICATION_SLOTS,
Expand Down Expand Up @@ -410,3 +411,10 @@ def check_subscription_stats_metrics(aggregator, expected_tags, count=1):
return
for metric_name in _iterate_metric_name(STAT_SUBSCRIPTION_STATS_METRICS):
aggregator.assert_metric(metric_name, count=count, tags=expected_tags)


def check_checksum_metrics(aggregator, expected_tags, count=1):
    """Assert that every checksum metric was submitted for the test database.

    Each metric named by CHECKSUM_METRICS is asserted with `expected_tags`
    plus a `db:<DB_NAME>` tag. Nothing is asserted when POSTGRES_VERSION is
    below 12.
    """
    if float(POSTGRES_VERSION) < 12:
        return
    for checksum_metric in _iterate_metric_name(CHECKSUM_METRICS):
        db_tag = 'db:{}'.format(DB_NAME)
        aggregator.assert_metric(
            checksum_metric,
            count=count,
            tags=expected_tags + [db_tag],
        )
4 changes: 4 additions & 0 deletions postgres/tests/compose/docker-compose-replication.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ services:
- ./etc/postgresql:/etc/postgresql/
environment:
POSTGRES_PASSWORD: datad0g
POSTGRES_INITDB_ARGS: --data-checksums
command: postgres -c 'config_file=/etc/postgresql/postgresql.conf' -c 'hba_file=/etc/postgresql/pg_hba.conf'

postgres_replica:
Expand All @@ -37,6 +38,7 @@ services:
- ./etc/postgresql_replica:/etc/postgresql/
environment:
POSTGRES_PASSWORD: datad0g
POSTGRES_INITDB_ARGS: --data-checksums
command: postgres -c 'config_file=/etc/postgresql/postgresql.conf'

postgres_replica2:
Expand All @@ -55,6 +57,7 @@ services:
- ./etc/postgresql_replica2:/etc/postgresql/
environment:
POSTGRES_PASSWORD: datad0g
POSTGRES_INITDB_ARGS: --data-checksums
command: postgres -c 'config_file=/etc/postgresql/postgresql.conf'

postgres_logical_replica:
Expand All @@ -73,4 +76,5 @@ services:
- ./etc/postgresql_logical_replica:/etc/postgresql/
environment:
POSTGRES_PASSWORD: datad0g
POSTGRES_INITDB_ARGS: --data-checksums
command: postgres -c 'config_file=/etc/postgresql/postgresql.conf'
1 change: 1 addition & 0 deletions postgres/tests/compose/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ services:
- ./etc/postgresql:/etc/postgresql/
environment:
POSTGRES_PASSWORD: datad0g
POSTGRES_INITDB_ARGS: --data-checksums
command: postgres -c 'config_file=/etc/postgresql/postgresql.conf' -c 'hba_file=/etc/postgresql/pg_hba.conf'
16 changes: 16 additions & 0 deletions postgres/tests/test_discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Licensed under a 3-clause BSD style license (see LICENSE)

import copy
import os
import re
import time
from contextlib import contextmanager
Expand All @@ -22,6 +23,9 @@
"exclude": ["dogs_5$", "dogs_50$"],
}

# Postgres version under test, read from the environment; None when the variable is unset.
POSTGRES_VERSION = os.environ.get('POSTGRES_VERSION')


# the number of test databases that exist from [dogs_0, dogs_100]
NUM_DOGS_DATABASES = 101

Expand Down Expand Up @@ -63,6 +67,10 @@
'postgresql.table.count',
}

# Checksum metrics asserted once per discovered database.
CHECKSUM_METRICS = {'postgresql.checksums.checksum_failures'}


@contextmanager
def get_postgres_connection(dbname="postgres"):
Expand Down Expand Up @@ -185,6 +193,7 @@ def test_autodiscovery_collect_all_metrics(aggregator, integration_check, pg_ins
]
pg_instance['collect_function_metrics'] = True
pg_instance['collect_count_metrics'] = True
pg_instance['collect_checksum_metrics'] = True
del pg_instance['dbname']

# execute dummy_function to populate pg_stat_user_functions for dogs_nofunc database
Expand All @@ -203,12 +212,16 @@ def test_autodiscovery_collect_all_metrics(aggregator, integration_check, pg_ins
for db in databases:
relation_metrics_expected_tags = _get_expected_tags(check, pg_instance, db=db, table='breed', schema='public')
count_metrics_expected_tags = _get_expected_tags(check, pg_instance, db=db, schema='public')
checksum_metrics_expected_tags = _get_expected_tags(check, pg_instance, db=db)
for metric in RELATION_METRICS:
aggregator.assert_metric(metric, tags=relation_metrics_expected_tags)
for metric in DYNAMIC_RELATION_METRICS:
aggregator.assert_metric(metric, tags=relation_metrics_expected_tags)
for metric in COUNT_METRICS:
aggregator.assert_metric(metric, tags=count_metrics_expected_tags)
if float(POSTGRES_VERSION) >= 12:
for metric in CHECKSUM_METRICS:
aggregator.assert_metric(metric, tags=checksum_metrics_expected_tags)

# we only created and executed the dummy_function in dogs_nofunc database
for metric in FUNCTION_METRICS:
Expand All @@ -220,6 +233,9 @@ def test_autodiscovery_collect_all_metrics(aggregator, integration_check, pg_ins
aggregator.assert_metric(
'dd.postgres._collect_relations_autodiscovery.time',
)
if float(POSTGRES_VERSION) >= 12:
checksum_metrics_expected_tags = _get_expected_tags(check, pg_instance, with_db=False, enabled="true")
aggregator.assert_metric('postgresql.checksums.enabled', value=1, tags=checksum_metrics_expected_tags)


@pytest.mark.integration
Expand Down

0 comments on commit de516ff

Please sign in to comment.