diff --git a/.docker/otel-collector-config.yaml b/.docker/otel-collector-config.yaml new file mode 100644 index 0000000..963741b --- /dev/null +++ b/.docker/otel-collector-config.yaml @@ -0,0 +1,25 @@ +receivers: + otlp: + protocols: + grpc: + http: + +exporters: + debug: + prometheus: + endpoint: "0.0.0.0:8889" + const_labels: + otel: otel + otlp: + endpoint: "jaeger:4317" + tls: + insecure: true + +service: + pipelines: + metrics: + receivers: [otlp] + exporters: [prometheus] + traces: + receivers: [otlp] + exporters: [otlp] diff --git a/.docker/prometheus.yml b/.docker/prometheus.yml new file mode 100644 index 0000000..1c7522c --- /dev/null +++ b/.docker/prometheus.yml @@ -0,0 +1,12 @@ +global: + scrape_interval: 10s + +scrape_configs: + - job_name: 'imalive' + static_configs: + - targets: ['imalive-api:8080'] + metrics_path: '/v1/prom' + scheme: http + - job_name: 'opentelemetry' + static_configs: + - targets: ['otel-collector:8889'] diff --git a/.env.example b/.env.example index 17d2d99..27e9d9d 100644 --- a/.env.example +++ b/.env.example @@ -14,3 +14,4 @@ SLACK_TRIGGER= DISCORD_TRIGGER= WARNING_THRESHOLD=80 ERROR_THRESHOLD=90 +OTEL_COLLECTOR_ENDPOINT="otel-collector:4317" diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8a7a7d7..bfbc753 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,8 +1,8 @@ stages: - publish - deliver - - deploy - test + - arm mirror: stage: publish @@ -31,7 +31,7 @@ api_x86: - imalive api_arm: - stage: deliver + stage: arm script: - setsid ./ci/docker-deliver.sh "arm" "imalive-api" only: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8285971..ac8ddba 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -11,3 +11,8 @@ You can also run unit tests by running this command: ```shell docker-compose -f docker-compose-local.yml up --build --abort-on-container-exit imalive-tests ``` + +Then you can try: +* Jaeger UI here for the traces: http://localhost:16686 +* the opentelemetry metrics exporter endpoint: 
http://localhost:8889/metrics +* prometheus: http://localhost:9090 diff --git a/Dockerfile b/Dockerfile index f5d830b..3de331b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,8 @@ WORKDIR /app COPY requirements.txt /app/requirements.txt -RUN apk add --no-cache --virtual .build-deps gcc musl-dev linux-headers && \ +RUN apk add --no-cache libstdc++ && \ + apk add --no-cache --virtual .build-deps gcc g++ musl-dev linux-headers && \ pip install --upgrade pip && \ pip install -r requirements.txt && \ apk del .build-deps diff --git a/README.md b/README.md index 1204180..b75e9ca 100644 --- a/README.md +++ b/README.md @@ -16,8 +16,6 @@ Just a dummy healthcheck api for your nodes (support x86 and armhf for raspberry It provide a http/restful endpoint that you can use as a healthcheck rule to your loadbalancer and also publish a heartbit in stdout (usefull if you collect it in a log/alerting management system such as elasticstack). -BTW we're also providing packages and images for elasticstack (Kibana, Elasticsearch, Filebeat) [here](https://gitlab.comwork.io/oss/elasticstack). - ![kibana](./img/kibana.png) ## Table of content @@ -121,7 +119,7 @@ $ curl localhost:8080/v1/metrics ### Metrics for prometheus -If you want to use `imalive` as a metrics exporter, this is the way: +If you want to use `imalive` as a Prometheus metrics exporter, this is the way: ```shell $ curl localhost:8080/v1/prom @@ -146,6 +144,20 @@ disk_total 56.096561431884766 # HELP imalive_imalive_http_reques ``` +Here's an example of Prometheus config for scraping the data: + +```yaml +global: + scrape_interval: 10s + +scrape_configs: + - job_name: 'imalive' + static_configs: + - targets: ['imalive-api:8080'] + metrics_path: '/v1/prom' + scheme: http +``` + ## Heartbit You can change the wait time between two heartbit with the `WAIT_TIME` environment variable (in seconds). 
@@ -162,6 +174,60 @@ You can change `anode` by your node name with the `IMALIVE_NODE_NAME` environmen You also can log only a json output by making the environment variable `LOG_FORMAT` equal "json". +## OpenTelemetry + +You can also configure an OTEL gRPC endpoint using the `OTEL_COLLECTOR_ENDPOINT` environment variable. + +Imalive sends metrics and traces through OTLP over gRPC; you'll be able to see your traces in Jaeger like this: + +![jaegger](./img/jaegger.png) + +And your metrics on Prometheus like this: + +![prometheus](./img/prometheus.png) + +Here's an example of Prometheus configuration for scraping the opentelemetry collector metrics: + +```yaml +global: + scrape_interval: 10s + +scrape_configs: + - job_name: 'opentelemetry' + static_configs: + - targets: ['otel-collector:8889'] +``` + +And the opentelemetry collector configuration as well for receiving the traces and metrics from imalive: + +```yaml +receivers: + otlp: + protocols: + grpc: + http: + +exporters: + debug: + prometheus: + endpoint: "0.0.0.0:8889" + const_labels: + otel: otel + otlp: + endpoint: "jaeger:4317" + tls: + insecure: true + +service: + pipelines: + metrics: + receivers: [otlp] + exporters: [prometheus] + traces: + receivers: [otlp] + exporters: [otlp] +``` + ## Development / contributions Go see this [documentation](./CONTRIBUTING.md) diff --git a/VERSION b/VERSION index 87ce492..40c341b 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.5.2 +3.6.0 diff --git a/docker-compose-local.yml b/docker-compose-local.yml index 0a4814a..6f06831 100644 --- a/docker-compose-local.yml +++ b/docker-compose-local.yml @@ -1,9 +1,10 @@ -version: "3.3" +version: "3.9" services: imalive-api: restart: always image: imalive-api:latest + container_name: imalive-api build: context: . 
dockerfile: ./Dockerfile @@ -12,8 +13,50 @@ services: - .env ports: - "8080:8080" + networks: + - imalive-net + otel-collector: + restart: always + image: otel/opentelemetry-collector:latest + container_name: otel-collector + command: ["--config=/etc/otel-collector-config.yaml"] + volumes: + - .docker/otel-collector-config.yaml:/etc/otel-collector-config.yaml + ports: + - "1888:1888" # pprof extension + - "8888:8888" # Prometheus metrics exposed by the collector + - "8889:8889" # Prometheus exporter metrics + - "13133:13133" # health_check extension + - "4317:4317" # OTLP gRPC receiver + - "4318:4318" # OTLP HTTP receiver + - "55679:55679" # zpages extension + depends_on: + - jaeger + networks: + - imalive-net + jaeger: + restart: always + image: jaegertracing/all-in-one:latest + container_name: jaeger + ports: + - "16686:16686" + networks: + - imalive-net + prometheus: + image: prom/prometheus + container_name: prometheus + ports: + - 9090:9090 + volumes: + - .docker/prometheus.yml:/etc/prometheus/prometheus.yml + networks: + - imalive-net + restart: always imalive-tests: build: context: . 
dockerfile: ./Dockerfile target: unit_tests + +networks: + imalive-net: diff --git a/img/jaegger.png b/img/jaegger.png new file mode 100644 index 0000000..2e60aea Binary files /dev/null and b/img/jaegger.png differ diff --git a/img/prometheus.png b/img/prometheus.png new file mode 100644 index 0000000..4551ccb Binary files /dev/null and b/img/prometheus.png differ diff --git a/requirements.txt b/requirements.txt index 564fe00..860ea6f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,8 @@ requests uvicorn[standard] fastapi-utils psutil -prometheus-fastapi-instrumentator \ No newline at end of file +prometheus-fastapi-instrumentator +opentelemetry-api +opentelemetry-sdk +opentelemetry-instrumentation-fastapi +opentelemetry-exporter-otlp diff --git a/src/main.py b/src/main.py index b180153..65944e7 100644 --- a/src/main.py +++ b/src/main.py @@ -1,13 +1,13 @@ -import asyncio - from fastapi import FastAPI from prometheus_fastapi_instrumentator import Instrumentator +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from restful_ressources import import_ressources from utils.common import is_not_empty from utils.manifests import get_manifest_as_dict from utils.heartbit import heartbit +from utils.otel import init_otel_tracer, init_otel_metrics version = "unkown" manifest = get_manifest_as_dict() @@ -24,10 +24,15 @@ instrumentator = Instrumentator() +init_otel_tracer() +init_otel_metrics() + heartbit() instrumentator.instrument(app, metric_namespace='imalive', metric_subsystem='imalive') instrumentator.expose(app, endpoint='/v1/prom') instrumentator.expose(app, endpoint='/prom') +FastAPIInstrumentor.instrument_app(app) + import_ressources(app) diff --git a/src/routes/api_health.py b/src/routes/api_health.py index 75e5a04..48ad1f4 100644 --- a/src/routes/api_health.py +++ b/src/routes/api_health.py @@ -1,12 +1,16 @@ -from utils.health import health from fastapi import APIRouter +from utils.otel import get_otel_tracer +from utils.health 
import health + router = APIRouter() @router.get("") def get_health(): - return health() + with get_otel_tracer().start_as_current_span("imalive-health-get-route"): + return health() @router.post("") def post_health(): - return health() + with get_otel_tracer().start_as_current_span("imalive-health-post-route"): + return health() diff --git a/src/routes/api_manifest.py b/src/routes/api_manifest.py index c3fa434..1745b5f 100644 --- a/src/routes/api_manifest.py +++ b/src/routes/api_manifest.py @@ -1,14 +1,17 @@ from fastapi import APIRouter from fastapi.responses import JSONResponse + +from utils.otel import get_otel_tracer from utils.manifests import get_manifest_as_dict router = APIRouter() @router.get("") def get_manifest(): - manifest = get_manifest_as_dict() + with get_otel_tracer().start_as_current_span("imalive-manifest-route"): + manifest = get_manifest_as_dict() - if manifest['status'] == 'error': - return JSONResponse(content=manifest, status_code=500) - else: - return manifest + if manifest['status'] == 'error': + return JSONResponse(content=manifest, status_code=500) + else: + return manifest diff --git a/src/routes/api_metrics.py b/src/routes/api_metrics.py index 61160cb..73897f8 100644 --- a/src/routes/api_metrics.py +++ b/src/routes/api_metrics.py @@ -1,8 +1,11 @@ -from utils.metrics import all_metrics from fastapi import APIRouter +from utils.otel import get_otel_tracer +from utils.metrics import all_metrics + router = APIRouter() @router.get("") def get_metrics(): - return all_metrics() + with get_otel_tracer().start_as_current_span("imalive-metrics-route"): + return all_metrics() diff --git a/src/routes/api_root.py b/src/routes/api_root.py index 0f8f345..72c1ca0 100644 --- a/src/routes/api_root.py +++ b/src/routes/api_root.py @@ -1,8 +1,11 @@ -from utils.health import health from fastapi import APIRouter +from utils.otel import get_otel_tracer +from utils.health import health + router = APIRouter() @router.get("/") def get_root(): - return health() 
+ with get_otel_tracer().start_as_current_span("imalive-root-route"): + return health() diff --git a/src/utils/gauge.py b/src/utils/gauge.py new file mode 100644 index 0000000..d966ba3 --- /dev/null +++ b/src/utils/gauge.py @@ -0,0 +1,37 @@ +import re + +from prometheus_client import Gauge +from opentelemetry.metrics import Observation + +from utils.otel import get_otel_meter + +_numeric_value_pattern = r"-?\d+\.\d+" +_current_gauge_values = {} + +def create_gauge(name, description): + _current_gauge_values[name] = { + 'val': 0.0, + 'desc': description + } + + def observable_gauge_func(_): + yield Observation(_current_gauge_values[name]['val']) + + get_otel_meter().create_observable_gauge( + name = name, + description = description, + callbacks=[observable_gauge_func] + ) + + return Gauge( + name, + description + ) + +def set_gauge(gauge, value): + match = re.search(_numeric_value_pattern, "{}".format(value)) + + if match: + val = float(match.group()) + gauge.set(val) + _current_gauge_values[gauge._name]['val'] = val diff --git a/src/utils/heartbit.py b/src/utils/heartbit.py index 0d3193e..17c48ea 100644 --- a/src/utils/heartbit.py +++ b/src/utils/heartbit.py @@ -3,9 +3,11 @@ import threading from time import sleep -from utils.common import is_enabled -from utils.prom import create_gauge, set_gauge + +from utils.common import is_enabled +from utils.otel import get_otel_tracer +from utils.gauge import create_gauge, set_gauge from utils.metrics import all_metrics, check_and_log_usage from utils.logger import log_msg @@ -59,16 +61,17 @@ def disc(metrics): def heartbit(): def loop_heartbit(): while True: - metrics = all_metrics() + with get_otel_tracer().start_as_current_span("imalive-heartbit"): + metrics = all_metrics() - cpu(metrics) - ram(metrics) - swap(metrics) - disc(metrics) + cpu(metrics) + ram(metrics) + swap(metrics) + disc(metrics) - log_msg("INFO", metrics if is_enabled(LOG_FORMAT) and LOG_FORMAT == "json" else "[metrics] I'm alive! 
metrics = {}".format(metrics)) + log_msg("INFO", metrics if is_enabled(LOG_FORMAT) and LOG_FORMAT == "json" else "[metrics] I'm alive! metrics = {}".format(metrics)) - sleep(WAIT_TIME) + sleep(WAIT_TIME) def start_heartbit(): loop = asyncio.new_event_loop() diff --git a/src/utils/otel.py b/src/utils/otel.py new file mode 100644 index 0000000..767cca5 --- /dev/null +++ b/src/utils/otel.py @@ -0,0 +1,40 @@ +import os + +from opentelemetry import trace +from opentelemetry.metrics import set_meter_provider, get_meter +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.resources import Resource +from opentelemetry.semconv.resource import ResourceAttributes +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter + +from utils.common import is_enabled + +_otel_tracer = trace.get_tracer(__name__) +_otel_collector_endpoint = os.getenv('OTEL_COLLECTOR_ENDPOINT') +_otel_service_name = "imalive-{}".format(os.getenv('IMALIVE_NODE_NAME', "anode")) +_otel_service_version = os.getenv('VERSION', '0.1') +_otel_meter = get_meter(_otel_service_name, version=_otel_service_version) +_otel_resource = Resource.create(attributes={ + ResourceAttributes.SERVICE_NAME: _otel_service_name, +}) + +def init_otel_tracer(): + trace.set_tracer_provider(TracerProvider(resource=_otel_resource)) + + if is_enabled(_otel_collector_endpoint): + trace.get_tracer_provider().add_span_processor(BatchSpanProcessor(OTLPSpanExporter(endpoint=_otel_collector_endpoint, insecure=True))) + +def init_otel_metrics(): + if is_enabled(_otel_collector_endpoint): + otlp_exporter = OTLPMetricExporter(endpoint=_otel_collector_endpoint, insecure=True) + 
set_meter_provider(MeterProvider(resource=_otel_resource, metric_readers=[PeriodicExportingMetricReader(otlp_exporter, export_interval_millis=5000)])) + +def get_otel_tracer(): + return _otel_tracer + +def get_otel_meter(): + return _otel_meter diff --git a/src/utils/prom.py b/src/utils/prom.py deleted file mode 100644 index 23ac980..0000000 --- a/src/utils/prom.py +++ /dev/null @@ -1,17 +0,0 @@ -import re - -from prometheus_client import Gauge - -_numeric_value_pattern = r"-?\d+\.\d+" - -def create_gauge(name, description): - return Gauge( - name, - description - ) - -def set_gauge(gauge, value): - match = re.search(_numeric_value_pattern, "{}".format(value)) - - if match: - gauge.set(float(match.group()))