Skip to content

Commit

Permalink
include server and proxy labels in failure metrics for #98
Browse files Browse the repository at this point in the history
  • Loading branch information
tykling committed Mar 2, 2024
1 parent 8489094 commit 6e41fe1
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 82 deletions.
31 changes: 23 additions & 8 deletions src/dns_exporter/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,21 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
except dns.exception.Timeout:
# configured timeout was reached before a response arrived
reason = "timeout"
yield from self.yield_failure_reason_metric(failure_reason=reason)
yield from self.yield_failure_reason_metric(failure_reason=reason, config=self.config)
except ConnectionRefusedError:
# server actively refused the connection
reason = "connection_error"
yield from self.yield_failure_reason_metric(
failure_reason=reason,
config=self.config,
)
except OSError as e:
# raised by multiple protocols, for example when an ICMP unreachable message is received
logger.debug(f"Protocol {self.config.protocol} got OSError '{e}', exception follows", exc_info=True)
reason = "connection_error"
yield from self.yield_failure_reason_metric(
failure_reason=reason,
config=self.config,
)
except ProtocolSpecificError as e:
# a protocol specific exception was raised, log it and use its message as the failure reason
Expand All @@ -149,7 +151,7 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
exc_info=True,
)
reason = str(e)
yield from self.yield_failure_reason_metric(failure_reason=reason)
yield from self.yield_failure_reason_metric(failure_reason=reason, config=self.config)
except Exception: # noqa: BLE001
logger.warning(
f"""Caught an unknown exception while looking up qname {self.config.query_name} using server
Expand All @@ -158,14 +160,16 @@ def collect_dns(self) -> Iterator[CounterMetricFamily | GaugeMetricFamily]:
exc_info=True,
)
reason = "other_failure"
yield from self.yield_failure_reason_metric(failure_reason=reason)
yield from self.yield_failure_reason_metric(failure_reason=reason, config=self.config)

# measure how long the query took
qtime = time.time() - start

# did we get a response?
if r is None:
logger.info(f"No DNS response received from server {self.config.server.geturl()} :( returning metrics, failure reason is '{reason}'...")
logger.warning(
f"No DNS response received from server {self.config.server.geturl()} - failure reason is '{reason}'..."
)
yield from (get_dns_qtime_metric(), get_dns_ttl_metric(), get_dns_success_metric(value=0))
return None

Expand Down Expand Up @@ -215,7 +219,7 @@ def handle_response(
yield get_dns_success_metric(1)
except ValidationError as E:
logger.exception(f"Validation failed: {E.args[1]}")
yield from self.yield_failure_reason_metric(failure_reason=E.args[1])
yield from self.yield_failure_reason_metric(failure_reason=E.args[1], config=self.config)
yield get_dns_success_metric(0)

def handle_response_options(self, response: Message) -> None:
Expand Down Expand Up @@ -614,27 +618,38 @@ def validate_response(self, response: Message) -> None:
@staticmethod
def yield_failure_reason_metric(
failure_reason: str,
config: Config | None = None,
) -> Iterator[CounterMetricFamily]:
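"""Maintain the failure metrics, labelled by failure reason, server, and proxy.
The ``server`` and ``proxy`` labels fall back to ``none`` when no config is passed, or when the config has no server or proxy.
If an empty string is passed as failure_reason (meaning success) the failure counters will not be incremented.
"""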
# get server and proxy (if any)
if config:
server = config.server.geturl() if config.server else "none"
proxy = config.proxy.geturl() if config.proxy else "none"
else:
server = "none"
proxy = "none"

# was there a failure?
if failure_reason:
# is it a valid failure reason?
if failure_reason not in FAILURE_REASONS:
# unknown failure_reason, this is a bug
raise UnknownFailureReasonError(failure_reason)
# increase the global failure counter
dnsexp_scrape_failures_total.labels(reason=failure_reason).inc()
dnsexp_scrape_failures_total.labels(reason=failure_reason, server=server, proxy=proxy).inc()
# get the failure metric
fail = get_dns_failure_metric()
# initialise all labels in the per-scrape metric,
# loop over known failure reasons
for reason in FAILURE_REASONS:
# set counter to 1 on match (custom collector - the metrics only exist during the scrape)
if reason == failure_reason:
fail.add_metric([reason], 1)
fail.add_metric([reason, server, proxy], 1)
else:
fail.add_metric([reason], 0)
fail.add_metric([reason, server, proxy], 0)
yield fail


Expand Down
18 changes: 10 additions & 8 deletions src/dns_exporter/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,16 @@
# get logger
logger = logging.getLogger(f"dns_exporter.{__name__}")

# the currently supported protocols in dns_exporter
valid_protocols = [
"udp",
"tcp",
"udptcp",
"dot",
"doh",
"doq",
]


@dataclass
class RRValidator:
Expand Down Expand Up @@ -283,14 +293,6 @@ def validate_integers(self) -> None:

def validate_protocol(self) -> None:
"""Validate protocol."""
valid_protocols = [
"udp",
"tcp",
"udptcp",
"dot",
"doh",
"doq",
]
if self.protocol not in valid_protocols:
raise ConfigError(
"invalid_request_protocol",
Expand Down
5 changes: 3 additions & 2 deletions src/dns_exporter/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,9 @@ def main(mockargs: list[str] | None = None) -> None:
rootlogger = logging.getLogger("")
rootlogger.setLevel(level)
# httpx is noisy at INFO
if logger.isEnabledFor(logging.INFO):
logging.getLogger('httpx').setLevel(logging.WARNING)
if level == logging.INFO:
# httpx is noisy at level info, cap to WARNING
logging.getLogger("httpx").setLevel(logging.WARNING)
logger.info(
f"dns_exporter v{DNSExporter.__version__} starting up - logging at level {level}",
)
Expand Down
10 changes: 5 additions & 5 deletions src/dns_exporter/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,16 +467,16 @@ def parse_server(server: str, protocol: str) -> urllib.parse.SplitResult:
Parse it with urllib.parse.urlsplit, add explicit port if needed, and return the result.
"""
# make sure we always have a scheme to help the parser
if "://" not in server:
server = f"{protocol}://{server}"
# parse the string
splitresult = urllib.parse.urlsplit(server)
# make sure scheme is the dns_exporter internal protocol identifier (not https://)
splitresult = splitresult._replace(scheme=protocol)
if protocol == "doh" and not splitresult.path:
# use the default DoH path
splitresult = urllib.parse.urlsplit(
urllib.parse.urlunsplit(
splitresult._replace(path="/dns-query", scheme="https"),
),
)
splitresult = splitresult._replace(path="/dns-query")
# is there an explicit port in the configured server url? use default if not.
if splitresult.port is None:
if protocol in ["udp", "tcp", "udptcp"]:
Expand Down
14 changes: 10 additions & 4 deletions src/dns_exporter/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,13 +164,16 @@ def get_dns_failure_metric() -> CounterMetricFamily:
A scrape (or the resulting DNS query) can fail for many reasons, including configuration issues, server issues,
timeout, network issues, bad response, or failed response validation.
This metric has just one label:
This metric has three labels:
- ``reason`` is set to the reason for the failure.
- ``server`` is set to the server URL.
- ``proxy`` is set to the proxy URL (or ``none``).
"""
return CounterMetricFamily(
name="dnsexp_failures_total",
documentation="The total number of scrape failures by failure reason. This counter is increased every time a scrape is initiated and a valid response (considering validation rules) is not received.", # noqa: E501
labels=["reason"],
labels=["reason", "server", "proxy"],
)


Expand Down Expand Up @@ -283,10 +286,13 @@ def get_dns_failure_metric() -> CounterMetricFamily:
dnsexp_scrape_failures_total = Counter(
name="dnsexp_scrape_failures_total",
documentation="The total number of scrapes failed by failure reason. This counter is increased every time the dns_exporter receives a scrape request which fails for some reason, including response validation logic.", # noqa: E501
labelnames=["reason"],
labelnames=["reason", "server", "proxy"],
)
"""``dnsexp_scrape_failures_total`` is the Counter keeping track of how many scrape requests failed for some reason.
This metric has one label:
This metric has three labels:
- ``reason`` is set to the failure reason.
- ``server`` is set to the server URL.
- ``proxy`` is set to the proxy URL (or ``none``).
"""
36 changes: 27 additions & 9 deletions src/tests/test_certificate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ def test_cert_verify_fail_doh(dns_exporter_example_config, caplog):
"family": "ipv4",
},
)
assert 'dnsexp_failures_total{reason="certificate_error"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="certificate_error",server="doh://91.239.100.100:443/dns-query"} 1.0'
in r.text
)


def test_cert_verify_fail_dot(dns_exporter_example_config, caplog):
Expand All @@ -35,7 +38,9 @@ def test_cert_verify_fail_dot(dns_exporter_example_config, caplog):
"family": "ipv4",
},
)
assert 'dnsexp_failures_total{reason="certificate_error"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="certificate_error",server="dot://91.239.100.100:853"} 1.0' in r.text
)


# this fails because the adguard servers have IP:.... SAN entries in the certificates
Expand All @@ -54,7 +59,9 @@ def test_cert_verify_fail_doq(dns_exporter_example_config, caplog):
"family": "ipv4",
},
)
assert 'dnsexp_failures_total{reason="certificate_error"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="certificate_error",server="doq://94.140.14.140:853"} 1.0' in r.text
)


###################################################################################
Expand All @@ -75,7 +82,10 @@ def test_cert_verify_fail_custom_ca_doh(dns_exporter_example_config, caplog):
"verify_certificate_path": "tests/certificates/test.crt",
},
)
assert 'dnsexp_failures_total{reason="certificate_error"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="certificate_error",server="doh://91.239.100.100:443/dns-query"} 1.0'
in r.text
)


def test_cert_verify_fail_custom_ca_dot(dns_exporter_example_config, caplog):
Expand All @@ -92,7 +102,9 @@ def test_cert_verify_fail_custom_ca_dot(dns_exporter_example_config, caplog):
"verify_certificate_path": "tests/certificates/test.crt",
},
)
assert 'dnsexp_failures_total{reason="certificate_error"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="certificate_error",server="dot://91.239.100.100:853"} 1.0' in r.text
)
assert "Protocol dot raised ssl.SSLCertVerificationError, returning certificate_error" in caplog.text


Expand All @@ -112,7 +124,7 @@ def test_cert_verify_fail_custom_ca_doq(dns_exporter_example_config, caplog):
},
)
assert "Custom CA path for DoQ is disabled pending https://github.com/tykling/dns_exporter/issues/95" in caplog.text
assert 'dnsexp_failures_total{reason="invalid_request_config"} 1.0' in r.text
assert 'dnsexp_failures_total{proxy="none",reason="invalid_request_config",server="none"} 1.0' in r.text


###################################################################################
Expand Down Expand Up @@ -188,7 +200,10 @@ def test_cert_verify_invalid_path_doh(dns_exporter_example_config, caplog):
"verify_certificate_path": "/nonexistant",
},
)
assert 'dnsexp_failures_total{reason="invalid_request_config"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="invalid_request_config",server="doh://91.239.100.100:443/dns-query"} 1.0'
in r.text
)
assert "Protocol doh raised exception, returning failure reason invalid_request_config" in caplog.text


Expand All @@ -206,7 +221,10 @@ def test_cert_verify_invalid_path_dot(dns_exporter_example_config, caplog):
"verify_certificate_path": "/nonexistant",
},
)
assert 'dnsexp_failures_total{reason="invalid_request_config"} 1.0' in r.text
assert (
'dnsexp_failures_total{proxy="none",reason="invalid_request_config",server="dot://91.239.100.100:853"} 1.0'
in r.text
)
assert "Protocol dot raised ValueError, is verify_certificate_path wrong" in caplog.text


Expand All @@ -225,5 +243,5 @@ def test_cert_verify_invalid_path_doq(dns_exporter_example_config, caplog):
"verify_certificate_path": "/nonexistant",
},
)
assert 'dnsexp_failures_total{reason="invalid_request_config"} 1.0' in r.text
assert 'dnsexp_failures_total{proxy="none",reason="invalid_request_config",server="none"} 1.0' in r.text
assert "Custom CA path for DoQ is disabled pending https://github.com/tykling/dns_exporter/issues/95" in caplog.text
Loading

0 comments on commit 6e41fe1

Please sign in to comment.