Skip to content

Commit

Permalink
Add more labels to the metric so it has all the same labels as in add…
Browse files Browse the repository at this point in the history
…ition to the label
  • Loading branch information
tykling committed Mar 6, 2024
1 parent b077c0e commit 0777877
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 74 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Changed
- Add the ``monitor`` label in all examples, also the ones scraping internal metrics

- Add more labels to the `dnsexp_scrape_failures_total` metric so it has all the same labels as `dnsexp_dns_query_time_seconds` in addition to the `reason` label.

## [v1.0.0-rc1] - 2024-03-06

Expand Down
10 changes: 10 additions & 0 deletions src/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from threading import Thread

import httpx
import pytest
import yaml
from dns_exporter.entrypoint import main
Expand Down Expand Up @@ -215,3 +216,12 @@ def mock_dns_query_https_valuerror(mocker):
"dns.query.https",
side_effect=ValueError("mocked"),
)


@pytest.fixture()
def mock_dns_query_httpx_connecttimeout(mocker):
"""Monkeypatch dns.query.https to raise an httpx.ConnectTimeout."""
mocker.patch(
"dns.query.https",
side_effect=httpx.ConnectTimeout("mocked"),
)
47 changes: 23 additions & 24 deletions src/dns_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import contextlib
import logging
import re
import socket
Expand All @@ -26,6 +27,7 @@
from dns_exporter.exceptions import ProtocolSpecificError, UnknownFailureReasonError, ValidationError
from dns_exporter.metrics import (
FAILURE_REASONS,
TTL_LABELS,
dnsexp_dns_queries_total,
dnsexp_dns_responsetime_seconds,
dnsexp_scrape_failures_total,
Expand Down Expand Up @@ -126,21 +128,21 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
except dns.exception.Timeout:
# configured timeout was reached before a response arrived
reason = "timeout"
self.increase_failure_reason_metric(failure_reason=reason, config=self.config)
self.increase_failure_reason_metric(failure_reason=reason, labels=self.labels)
except ConnectionRefusedError:
# server actively refused the connection
reason = "connection_error"
self.increase_failure_reason_metric(
failure_reason=reason,
config=self.config,
labels=self.labels,
)
except OSError as e:
# raised by multiple protocols on ICMP unreach
logger.debug(f"Protocol {self.config.protocol} got OSError '{e}', exception follows", exc_info=True)
reason = "connection_error"
self.increase_failure_reason_metric(
failure_reason=reason,
config=self.config,
labels=self.labels,
)
except ProtocolSpecificError as e:
# a protocol specific exception was raised, log it and record the failure reason
Expand All @@ -149,7 +151,7 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
exc_info=True,
)
reason = str(e)
self.increase_failure_reason_metric(failure_reason=reason, config=self.config)
self.increase_failure_reason_metric(failure_reason=reason, labels=self.labels)
except Exception: # noqa: BLE001
logger.warning(
f"""Caught an unknown exception while looking up qname {self.config.query_name} using server
Expand All @@ -158,7 +160,7 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
exc_info=True,
)
reason = "other_failure"
self.increase_failure_reason_metric(failure_reason=reason, config=self.config)
self.increase_failure_reason_metric(failure_reason=reason, labels=self.labels)

# clock it
qtime = time.time() - start
Expand Down Expand Up @@ -213,11 +215,11 @@ def handle_response(
logger.debug("Validating response and yielding remaining metrics")
try:
self.validate_response(response=response)
self.increase_failure_reason_metric(failure_reason="")
self.increase_failure_reason_metric(failure_reason="", labels=self.labels)
yield get_dns_success_metric(1)
except ValidationError as E:
logger.exception(f"Validation failed: {E.args[1]}")
self.increase_failure_reason_metric(failure_reason=E.args[1], config=self.config)
self.increase_failure_reason_metric(failure_reason=E.args[1], labels=self.labels)
yield get_dns_success_metric(0)

def handle_response_options(self, response: Message) -> None:
Expand Down Expand Up @@ -630,24 +632,11 @@ def validate_response(self, response: Message) -> None:
self.validate_response_rrs(response=response)

@staticmethod
def increase_failure_reason_metric(
failure_reason: str,
config: Config | None = None,
) -> None:
def increase_failure_reason_metric(failure_reason: str, labels: dict[str, str]) -> None:
"""This method is used to maintain failure metrics.
If an empty string is passed as failure_reason (meaning success) the failure counters will not be incremented.
"""
# get server and proxy (if any)
if config:
protocol = config.protocol
server = config.server.geturl() if config.server else "none"
proxy = config.proxy.geturl() if config.proxy else "none"
else:
protocol = "none"
server = "none"
proxy = "none"

# was there a failure?
if not failure_reason:
return
Expand All @@ -657,17 +646,27 @@ def increase_failure_reason_metric(
# unknown failure_reason, this is a bug
raise UnknownFailureReasonError(failure_reason)

# drop the per-RR TTL labels (TTL_LABELS), they are not labels of the failure counter
for key in TTL_LABELS:
with contextlib.suppress(KeyError):
del labels[key]

# build a dict with reason first and the rest of the labels after
labeldict = {"reason": failure_reason}
labeldict.update(labels)
logger.debug(labeldict)
# increase the global failure counter
dnsexp_scrape_failures_total.labels(reason=failure_reason, protocol=protocol, server=server, proxy=proxy).inc()
dnsexp_scrape_failures_total.labels(**labeldict).inc()
return


class FailCollector(DNSCollector):
"""Custom collector class used to handle pre-DNSCollector failures, like configuration issues."""

def __init__(self, failure_reason: str) -> None:
def __init__(self, failure_reason: str, labels: dict[str, str]) -> None:
"""Save failure reason and labels for use later."""
self.reason = failure_reason
self.labels = labels

def collect_dns(
self,
Expand All @@ -677,4 +676,4 @@ def collect_dns(
yield get_dns_qtime_metric()
yield get_dns_ttl_metric()
yield get_dns_success_metric(value=0)
self.increase_failure_reason_metric(failure_reason=self.reason)
self.increase_failure_reason_metric(failure_reason=self.reason, labels=self.labels)
38 changes: 23 additions & 15 deletions src/dns_exporter/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
from dns_exporter.collector import DNSCollector, FailCollector
from dns_exporter.config import Config, ConfigDict, RFValidator, RRValidator
from dns_exporter.exceptions import ConfigError
from dns_exporter.metrics import dnsexp_http_requests_total, dnsexp_http_responses_total
from dns_exporter.metrics import QTIME_LABELS, dnsexp_http_requests_total, dnsexp_http_responses_total
from dns_exporter.version import __version__

if TYPE_CHECKING: # pragma: no cover
Expand Down Expand Up @@ -356,10 +356,10 @@ def validate_config(self) -> None:
raise ConfigError("invalid_request_query_name")

@staticmethod
def handle_failure(fail_registry: CollectorRegistry, failure: str) -> None:
def handle_failure(fail_registry: CollectorRegistry, failure: str, labels: dict[str, str]) -> None:
"""Handle various failure cases that can occur before the DNSCollector is called."""
logger.debug(f"Initialising FailCollector to handle failure: {failure}")
fail_collector = FailCollector(failure_reason=failure)
fail_collector = FailCollector(failure_reason=failure, labels=labels)
logger.debug("Registering FailCollector in dnsexp_registry")
fail_registry.register(fail_collector)

Expand Down Expand Up @@ -585,11 +585,17 @@ def handle_query_request(self) -> None:
dnsexp_registry = CollectorRegistry()
self.fail_registry = CollectorRegistry()

# begin labels dict
self.labels: dict[str, str] = {}
for key in QTIME_LABELS:
# default all labels to the string "none"
self.labels[key] = "none"

# build and validate configuration for this scrape from defaults, config file and request querystring
try:
self.build_final_config(qs=self.qs)
except ConfigError as E:
self.handle_failure(self.fail_registry, str(E))
self.handle_failure(self.fail_registry, str(E), labels=self.labels)
# something is wrong with the config, send error response and bail out
self.send_metric_response(registry=self.fail_registry, query=self.qs)
return
Expand All @@ -603,16 +609,18 @@ def handle_query_request(self) -> None:
return

# config is ready for action, fill in the labels dict with the values from the config
labels: dict[str, str] = {
"server": str(self.config.server.geturl()), # type: ignore[union-attr]
"ip": str(self.config.ip),
"port": str(self.config.server.port), # type: ignore[union-attr]
"protocol": str(self.config.protocol),
"family": str(self.config.family),
"proxy": str(self.config.proxy.geturl()) if self.config.proxy else "none",
"query_name": str(self.config.query_name),
"query_type": str(self.config.query_type),
}
self.labels.update(
{
"server": str(self.config.server.geturl()), # type: ignore[union-attr]
"ip": str(self.config.ip),
"port": str(self.config.server.port), # type: ignore[union-attr]
"protocol": str(self.config.protocol),
"family": str(self.config.family),
"proxy": str(self.config.proxy.geturl()) if self.config.proxy else "none",
"query_name": str(self.config.query_name),
"query_type": str(self.config.query_type),
}
)

# prepare query
qname = dns.name.from_text(str(self.config.query_name))
Expand Down Expand Up @@ -662,7 +670,7 @@ def handle_query_request(self) -> None:
q.flags |= dns.flags.RD

# register the DNSCollector in dnsexp_registry
dns_collector = DNSCollector(config=self.config, query=q, labels=labels)
dns_collector = DNSCollector(config=self.config, query=q, labels=self.labels)
dnsexp_registry.register(dns_collector)
# send the response (which triggers the collect)
logger.debug("Returning DNS query metrics")
Expand Down
42 changes: 31 additions & 11 deletions src/dns_exporter/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
########################################################
# scrape-specific metrics used by the DNSCollector (served under /query)

# the labels used in the qtime and ttl metrics
# the labels used in the qtime, ttl, and failure metrics
QTIME_LABELS = [
"server",
"ip",
Expand All @@ -38,6 +38,14 @@
"nsid",
]

# additional labels used in the per-RR TTL metrics
TTL_LABELS = [
"rr_section", # answer, authority or additional
"rr_name",
"rr_type",
"rr_value",
]

FAILURE_REASONS = [
"invalid_request_module",
"invalid_request_config",
Expand Down Expand Up @@ -150,10 +158,7 @@ def get_dns_ttl_metric() -> GaugeMetricFamily:
documentation="DNS response RR TTL in seconds.",
labels=[
*QTIME_LABELS,
"rr_section", # answer, authority or additional
"rr_name",
"rr_type",
"rr_value",
*TTL_LABELS,
],
)

Expand Down Expand Up @@ -267,14 +272,29 @@ def get_dns_ttl_metric() -> GaugeMetricFamily:
dnsexp_scrape_failures_total = Counter(
name="dnsexp_scrape_failures_total",
documentation="The total number of scrapes failed by failure reason, protocol, server, and proxy (where applicable). This counter is increased every time the dns_exporter receives a scrape request which fails for some reason, including response validation logic.", # noqa: E501
labelnames=["reason", "protocol", "server", "proxy"],
labelnames=["reason", *QTIME_LABELS],
)
"""``dnsexp_scrape_failures_total`` is the Counter keeping track of how many scrape requests failed for some reason.
This metric has four labels:
- ``reason`` is set to the failure reason.
- ``protocol`` is set to the protocol.
- ``server`` is set to the server URL.
- ``proxy`` is set to the proxy URL (or ``none``).
This Counter has the following labels. They are the same as the labels of
``dns_exporter.metrics.dnsexp_dns_query_time_seconds``, plus the ``reason`` label which holds the failure reason.
- ``reason``
- ``server``
- ``ip``
- ``port``
- ``protocol``
- ``family``
- ``proxy``
- ``query_name``
- ``query_type``
- ``transport``
- ``opcode``
- ``rcode``
- ``flags``
- ``answer``
- ``authority``
- ``additional``
- ``nsid``
The placeholder ``none`` is used for cases where there is no suitable value for the label.
"""
2 changes: 1 addition & 1 deletion src/tests/test_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ class Conf:
mock_conf = Conf()
c = DNSCollector(mock_conf, 2, 3)
with pytest.raises(Exception, match="Unknown failure_reason foo - please file a bug!") as e:
list(c.increase_failure_reason_metric(failure_reason="foo"))
list(c.increase_failure_reason_metric(failure_reason="foo", labels={}))
assert str(e.value) == "Unknown failure_reason foo - please file a bug!"
Loading

0 comments on commit 0777877

Please sign in to comment.