From 1020867e7bee17de66edba383785ce1601e8f932 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Wed, 24 Jul 2024 16:56:11 -0400 Subject: [PATCH 1/6] remove stale opentelemetry_sdk packages This hack mitigate an issue where the otel library's mechanism to discover resource detectors raised an Exception after a charm refresh, as described in https://github.com/canonical/grafana-agent-operator/issues/146. That was caused by a juju/charmcraft issue (https://bugs.launchpad.net/juju/+bug/2058335) where, when upgrading packages, Juju leaves behind package metadata from the old versions. This caused otel's discovery mechanism to incorrectly use the old resource detector, leading to an exception. The fix here is to, before the otel discovery mechanism fires, detect if we have multiple "opentelemetry_sdk" distributions (packages) and if we do, delete any that appear to be stale (where "stale" is determined by whether they present `entry_points` - this assumption worked in testing, but not sure if it is universally valid) --- lib/charms/tempo_k8s/v1/charm_tracing.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index ebe022e0..4d7d9969 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -178,7 +178,9 @@ def my_tracing_endpoint(self) -> Optional[str]: import os from contextlib import contextmanager from contextvars import Context, ContextVar, copy_context +from importlib.metadata import distributions from pathlib import Path +import shutil from typing import ( Any, Callable, @@ -217,7 +219,7 @@ def my_tracing_endpoint(self) -> Optional[str]: # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 11 +LIBPATCH = 12 PYDEPS = ["opentelemetry-exporter-otlp-proto-http==1.21.0"] @@ -359,6 +361,23 @@ def _get_server_cert( return server_cert +def _remove_stale_otel_sdk_packages(): + """Hack to remove stale opentelemetry sdk packages from the environment. + + See https://github.com/canonical/grafana-agent-operator/issues/146 and + https://bugs.launchpad.net/juju/+bug/2058335 for more context. + """ + # Find any opentelemetry_sdk distributions + otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) + # If there are more than 2, inspect them and infer that any with 0 entrypoints are stale + if len(otel_sdk_distributions) > 1: + for d in otel_sdk_distributions: + if len(d.entry_points) == 0: + # Distribution appears to be empty. Remove it + logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") + shutil.rmtree(d._path) + + def _setup_root_span_initializer( charm_type: _CharmType, tracing_endpoint_attr: str, @@ -391,6 +410,7 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name + _remove_stale_otel_sdk_packages() resource = Resource.create( attributes={ "service.name": _service_name, From f932dc4eff77dd50c075a08f674a96f75b404d4e Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Thu, 25 Jul 2024 14:37:12 -0400 Subject: [PATCH 2/6] feat: add guard so otel package removal only happens on upgrade-charm hooks --- lib/charms/tempo_k8s/v1/charm_tracing.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 4d7d9969..03d5d0b3 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -366,16 +366,19 @@ def _remove_stale_otel_sdk_packages(): See https://github.com/canonical/grafana-agent-operator/issues/146 and https://bugs.launchpad.net/juju/+bug/2058335 for more context. + + This only does something if executed on an upgrade-charm event. """ - # Find any opentelemetry_sdk distributions - otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) - # If there are more than 2, inspect them and infer that any with 0 entrypoints are stale - if len(otel_sdk_distributions) > 1: - for d in otel_sdk_distributions: - if len(d.entry_points) == 0: - # Distribution appears to be empty. Remove it - logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") - shutil.rmtree(d._path) + if os.getenv("JUJU_HOOK_PATH") == "hooks/upgrade-charm": + # Find any opentelemetry_sdk distributions + otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) + # If there are more than 2, inspect them and infer that any with 0 entrypoints are stale + if len(otel_sdk_distributions) > 1: + for d in otel_sdk_distributions: + if len(d.entry_points) == 0: + # Distribution appears to be empty. Remove it + logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") + shutil.rmtree(d._path) def _setup_root_span_initializer( From 1db92527bf613efcd815f5f71ff8b0115180e7ab Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Thu, 25 Jul 2024 14:44:02 -0400 Subject: [PATCH 3/6] debug: ignore type errors --- lib/charms/tempo_k8s/v1/charm_tracing.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 03d5d0b3..3cfdf02d 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -176,11 +176,11 @@ def my_tracing_endpoint(self) -> Optional[str]: import inspect import logging import os +import shutil from contextlib import contextmanager from contextvars import Context, ContextVar, copy_context from importlib.metadata import distributions from pathlib import Path -import shutil from typing import ( Any, Callable, @@ -377,8 +377,8 @@ def _remove_stale_otel_sdk_packages(): for d in otel_sdk_distributions: if len(d.entry_points) == 0: # Distribution appears to be empty. Remove it - logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") - shutil.rmtree(d._path) + logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") # type: ignore + shutil.rmtree(d._path) # type: ignore def _setup_root_span_initializer( From 832b367cdb94c10baab10a4cb82f8db8a6cb09d2 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Fri, 26 Jul 2024 11:29:49 -0400 Subject: [PATCH 4/6] fix: env variable used to guard removing stale otel package --- lib/charms/tempo_k8s/v1/charm_tracing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 3cfdf02d..0b082f49 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -369,7 +369,7 @@ def _remove_stale_otel_sdk_packages(): This only does something if executed on an upgrade-charm event. """ - if os.getenv("JUJU_HOOK_PATH") == "hooks/upgrade-charm": + if os.getenv("JUJU_DISPATCH_PATH") == "hooks/upgrade-charm": # Find any opentelemetry_sdk distributions otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) # If there are more than 2, inspect them and infer that any with 0 entrypoints are stale From 5abda90ca4ccf791150a8dd496cbac29bc31bb84 Mon Sep 17 00:00:00 2001 From: Andrew Scribner Date: Fri, 26 Jul 2024 11:34:47 -0400 Subject: [PATCH 5/6] docs: add comment describing when patch can be removed --- lib/charms/tempo_k8s/v1/charm_tracing.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 0b082f49..85d5c48a 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -365,7 +365,9 @@ def _remove_stale_otel_sdk_packages(): """Hack to remove stale opentelemetry sdk packages from the environment. See https://github.com/canonical/grafana-agent-operator/issues/146 and - https://bugs.launchpad.net/juju/+bug/2058335 for more context. + https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after + this juju issue is resolved and sufficient time has passed to expect most users of this library + have migrated to the patched version of juju. This only does something if executed on an upgrade-charm event. """ From bac4831ab8205d1c97d5993f736f4edb7689f52c Mon Sep 17 00:00:00 2001 From: Pietro Pasotti Date: Mon, 29 Jul 2024 13:06:39 +0200 Subject: [PATCH 6/6] pr comments --- lib/charms/tempo_k8s/v1/charm_tracing.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/charms/tempo_k8s/v1/charm_tracing.py b/lib/charms/tempo_k8s/v1/charm_tracing.py index 85d5c48a..913b20bb 100644 --- a/lib/charms/tempo_k8s/v1/charm_tracing.py +++ b/lib/charms/tempo_k8s/v1/charm_tracing.py @@ -362,25 +362,27 @@ def _get_server_cert( def _remove_stale_otel_sdk_packages(): - """Hack to remove stale opentelemetry sdk packages from the environment. + """Hack to remove stale opentelemetry sdk packages from the charm's python venv. See https://github.com/canonical/grafana-agent-operator/issues/146 and - https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after + https://bugs.launchpad.net/juju/+bug/2058335 for more context. This patch can be removed after this juju issue is resolved and sufficient time has passed to expect most users of this library have migrated to the patched version of juju. This only does something if executed on an upgrade-charm event. """ if os.getenv("JUJU_DISPATCH_PATH") == "hooks/upgrade-charm": + logger.debug("Executing _remove_stale_otel_sdk_packages patch on charm upgrade") # Find any opentelemetry_sdk distributions otel_sdk_distributions = list(distributions(name="opentelemetry_sdk")) - # If there are more than 2, inspect them and infer that any with 0 entrypoints are stale + # If there is more than 1, inspect each and if it has 0 entrypoints, infer that it is stale if len(otel_sdk_distributions) > 1: - for d in otel_sdk_distributions: - if len(d.entry_points) == 0: - # Distribution appears to be empty. Remove it - logger.debug(f"Removing empty opentelemetry_sdk distribution at: {d._path}") # type: ignore - shutil.rmtree(d._path) # type: ignore + for distribution in otel_sdk_distributions: + if len(distribution.entry_points) == 0: + # Distribution appears to be empty. Remove it + path = distribution._path # type: ignore + logger.debug(f"Removing empty opentelemetry_sdk distribution at: {path}") + shutil.rmtree(path) def _setup_root_span_initializer( @@ -415,6 +417,9 @@ def wrap_init(self: CharmBase, framework: Framework, *args, **kwargs): _service_name = service_name or f"{self.app.name}-charm" unit_name = self.unit.name + # apply hacky patch to remove stale opentelemetry sdk packages on upgrade-charm. + # it could be trouble if someone ever decides to implement their own tracer parallel to + # ours and before the charm has inited. We assume they won't. _remove_stale_otel_sdk_packages() resource = Resource.create( attributes={