Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: add initial support for prometheus metrics. #1465

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 25 additions & 13 deletions api/tacticalrmm/core/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,35 @@

def monitoring_view(function):
def wrap(request, *args, **kwargs):
if request.method != "POST":
return HttpResponse("Invalid request type\n", status=400)
if request.method == "POST":
try:
data = json.loads(request.body)
except:
return HttpResponse("Invalid json\n", status=400)

if "auth" not in data.keys():
return HttpResponse("Invalid payload\n", status=400)

token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)

try:
data = json.loads(request.body)
except:
return HttpResponse("Invalid json\n", status=400)
if data.get("auth") != token:
return HttpResponse("Not authenticated\n", status=401)

if "auth" not in data.keys():
return HttpResponse("Invalid payload\n", status=400)
elif request.method == "GET":
if "Authorization" not in request.headers:
return HttpResponse("Missing 'Authorization' header\n", status=400)

token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)
token = getattr(settings, "MON_TOKEN", "")
if not token:
return HttpResponse("Missing token\n", status=401)

if data.get("auth") != token:
return HttpResponse("Not authenticated\n", status=401)
if request.headers["Authorization"] != "Bearer " + token:
return HttpResponse("Not authenticated\n", status=401)

else:
return HttpResponse("Invalid request type\n", status=400)

return function(request, *args, **kwargs)

Expand Down
139 changes: 139 additions & 0 deletions api/tacticalrmm/core/tests.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from unittest.mock import patch

import tempfile
import requests
from channels.db import database_sync_to_async
from channels.testing import WebsocketCommunicator
Expand Down Expand Up @@ -500,3 +501,141 @@ def test_get_meshagent_url_docker(self):
r,
"http://tactical-meshcentral:4443/meshagents?id=4&meshid=abc123&installflags=0",
)


class TestMonitoring(TacticalTestCase):
    """Tests for the ``/core/status/`` monitoring endpoint.

    Two request styles are exercised:

    * ``GET`` with an ``Authorization: Bearer <token>`` header (prometheus).
    * ``POST`` with a JSON body carrying the token in its ``auth`` field.

    Any other HTTP method is expected to be rejected with status 400.
    """

    url = "/core/status/"

    def setUp(self):
        """Create sample clients/sites/agents and a throwaway certificate file."""
        self.setup_client()
        self.setup_coresettings()

        # sample data for generated metrics
        client1 = baker.make("clients.Client")
        client1_site1 = baker.make("clients.Site", client=client1)
        client1_site2 = baker.make("clients.Site", client=client1)
        baker.make("agents.agent", _quantity=10, site=client1_site1)
        baker.make("agents.agent", _quantity=13, site=client1_site2)
        client2 = baker.make("clients.Client")
        client2_site1 = baker.make("clients.Site", client=client2)
        baker.make("agents.agent", _quantity=13, site=client2_site1)

        # Generate snakeoil cert with `make-ssl-cert generate-default-snakeoil` on ubuntu
        # Cert will be in '/etc/ssl/certs/ssl-cert-snakeoil.pem'.
        # Cert is used only for expiration date, so it can be selfsigned, expired and no key is needed.
        #
        # delete=False keeps the file on disk after it is closed so that the
        # tests can hand its path to the settings; tearDown() removes it.
        # The `with` block guarantees the handle is closed even if the write fails.
        with tempfile.NamedTemporaryFile(delete=False) as cert_file:
            cert_file.write(
                """-----BEGIN CERTIFICATE-----
MIIC4jCCAcqgAwIBAgIUCFgTym78sGgRHwEmLyGgmr1JjSUwDQYJKoZIhvcNAQEL
BQAwFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMB4XDTIzMDMzMTA1MTgzOFoXDTMz
MDMyODA1MTgzOFowFzEVMBMGA1UEAwwMZjNjMTQzOWM0NzZjMIIBIjANBgkqhkiG
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAzFWItB4aM/aUWIhk0SS1XKHLHao9/OwbGHet
lnrlZD2YM/DdUzqdYeYdujyLvWUj1xU+YcFv+vo3Mmu8HQVOKNcEZ5ZilHW/87X8
6ZjtUzPYmCapxXNTX8yh2EES582uq64j0t3OwfaCJmpJLwjvCnrizfUFe76iy5Ge
wVviYtkaIfHEwNoJLmFb07rYhNuV4tiwHUhmZqqm5nxpjKbTsI4YHnpSxNktU32C
vNVnIRIAHDZ8n8wCaKTPZMui9X/IJx1pA3EkbD2givbH/0nYRcd5ZUDxLsTJThob
8k5kPd1zVXqaH/ufqkekqoiY+kIWsgVd0iWx3qihhydAhRY5SQIDAQABoyYwJDAJ
BgNVHRMEAjAAMBcGA1UdEQQQMA6CDGYzYzE0MzljNDc2YzANBgkqhkiG9w0BAQsF
AAOCAQEAH91bAuK3tKf1v4D+t48SWSE2uFjCe6o2CzMwAdM3rVa47X2cw5nKOH5L
8nQJhJjq/t93DJi4WOpN579NWtTkwXyCl7srSvj8aK4FDKxKcWQNT1PUAa+gh8IB
WJdEK4lMSatCtA/wsq6jmkTwINZ/ELZp4BRU2gUp8mFU9fVQDMlY+2qwUzzIp97A
WISWVxML58FDFnQLsaP1SfapVWTTXTh4xnhr7VxklUadcGRnx9+Ig4Ieq27eSCiV
DC/aSRIyi9HaVZPTMbqLC50auHr/dQIL4pGyxFTD8OJoeRkQgAb1wWuAPhab20Xu
XyFzZMiRlyNNSPoYVExb65s1bawqew==
-----END CERTIFICATE-----""".encode(
                    encoding="utf-8"
                )
            )
        # The (now closed) file object is kept only for its `.name` path.
        self.snakeoil_certificate = cert_file

    def tearDown(self):
        """Remove the temporary certificate created in setUp()."""
        from os import unlink

        unlink(self.snakeoil_certificate.name)

    def _cert_settings(self):
        """Return a settings override pointing CERT_FILE at the snakeoil cert."""
        return self.settings(
            CERT_FILE=self.snakeoil_certificate.name,
            KEY_FILE="/do/not/need/a/key/here",
        )

    # prometheus tests
    def test_prometheus_missing_auth_header_request(self):
        r = self.client.get(self.url)
        self.assertEqual(r.status_code, 400)

    # Pin MON_TOKEN to empty so the result does not depend on whatever the
    # ambient (local) settings happen to define.
    @override_settings(MON_TOKEN="")
    def test_prometheus_missing_token_config(self):
        r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_prometheus_incorrect_token_request(self):
        r = self.client.get(self.url, HTTP_Authorization="Bearer NotMySuperTestSecret")
        self.assertEqual(r.status_code, 401)

    @override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
    def test_prometheus_correct_docker_build_request(self):
        with self._cert_settings():
            r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
        self.assertEqual(r.status_code, 200)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_prometheus_correct_request(self):
        with self._cert_settings():
            r = self.client.get(self.url, HTTP_Authorization="Bearer MySuperTestSecret")
        self.assertEqual(r.status_code, 200)

    # invalid tests
    def test_invalid_request(self):
        r = self.client.put(self.url)
        self.assertEqual(r.status_code, 400)
        self.assertEqual(
            r.content,
            b"Invalid request type\n",
        )

    # json tests
    def test_json_invalid_json_request(self):
        r = self.client.post(
            self.url,
            data="I am not json!",
            content_type="application/json",
        )
        self.assertEqual(r.status_code, 400)

    def test_json_invalid_payload_request(self):
        r = self.client.post(
            self.url, data={"notauth": "NotMySuperTestSecret"}, format="json"
        )
        self.assertEqual(r.status_code, 400)

    # Pin MON_TOKEN to empty so the result does not depend on whatever the
    # ambient (local) settings happen to define.
    @override_settings(MON_TOKEN="")
    def test_json_missing_token_request(self):
        r = self.client.post(
            self.url, data={"auth": "MySuperTestSecret"}, format="json"
        )
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_json_incorrect_token_request(self):
        r = self.client.post(
            self.url, data={"auth": "NotMySuperTestSecret"}, format="json"
        )
        self.assertEqual(r.status_code, 401)

    @override_settings(MON_TOKEN="MySuperTestSecret")
    def test_json_correct_request(self):
        with self._cert_settings():
            r = self.client.post(
                self.url, data={"auth": "MySuperTestSecret"}, format="json"
            )
        self.assertEqual(r.status_code, 200)

    @override_settings(DOCKER_BUILD=True, MON_TOKEN="MySuperTestSecret")
    def test_json_correct_docker_build_request(self):
        with self._cert_settings():
            r = self.client.post(
                self.url, data={"auth": "MySuperTestSecret"}, format="json"
            )
        self.assertEqual(r.status_code, 200)
180 changes: 145 additions & 35 deletions api/tacticalrmm/core/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytz
from cryptography import x509
from django.conf import settings
from django.http import JsonResponse
from django.http import JsonResponse, HttpResponse
from django.shortcuts import get_object_or_404
from django.utils import timezone as djangotime
from django.views.decorators.csrf import csrf_exempt
Expand Down Expand Up @@ -409,43 +409,153 @@
from agents.models import Agent
from clients.models import Client, Site

disk_usage: int = round(psutil.disk_usage("/").percent)
mem_usage: int = round(psutil.virtual_memory().percent)

# common metrics bits
cert_file, _ = get_certs()
cert_bytes = Path(cert_file).read_bytes()

cert = x509.load_pem_x509_certificate(cert_bytes)
expires = pytz.utc.localize(cert.not_valid_after)
now = djangotime.now()
delta = expires - now

ret = {
"version": settings.TRMM_VERSION,
"latest_agent_version": settings.LATEST_AGENT_VER,
"agent_count": Agent.objects.count(),
"client_count": Client.objects.count(),
"site_count": Site.objects.count(),
"disk_usage_percent": disk_usage,
"mem_usage_percent": mem_usage,
"days_until_cert_expires": delta.days,
"cert_expired": delta.days < 0,

# common services
services = {
"django": "rmm.service",
"mesh": "meshcentral.service",
"daphne": "daphne.service",
"celery": "celery.service",
"celerybeat": "celerybeat.service",
"redis": "redis-server.service",
"postgres": "postgresql.service",
"mongo": "mongod.service",
"nats": "nats.service",
"nats-api": "nats-api.service",
"nginx": "nginx.service",
}

if settings.DOCKER_BUILD:
ret["services_running"] = "not available in docker"
else:
ret["services_running"] = {
"django": sysd_svc_is_running("rmm.service"),
"mesh": sysd_svc_is_running("meshcentral.service"),
"daphne": sysd_svc_is_running("daphne.service"),
"celery": sysd_svc_is_running("celery.service"),
"celerybeat": sysd_svc_is_running("celerybeat.service"),
"redis": sysd_svc_is_running("redis-server.service"),
"postgres": sysd_svc_is_running("postgresql.service"),
"mongo": sysd_svc_is_running("mongod.service"),
"nats": sysd_svc_is_running("nats.service"),
"nats-api": sysd_svc_is_running("nats-api.service"),
"nginx": sysd_svc_is_running("nginx.service"),
# TRMM json monitoring
if request.method == "POST":
disk_usage: int = round(psutil.disk_usage("/").percent)
mem_usage: int = round(psutil.virtual_memory().percent)

cert_expires = pytz.utc.localize(cert.not_valid_after)
now = djangotime.now()
delta = cert_expires - now

ret = {
"version": settings.TRMM_VERSION,
"latest_agent_version": settings.LATEST_AGENT_VER,
"agent_count": Agent.objects.count(),
"client_count": Client.objects.count(),
"site_count": Site.objects.count(),
"disk_usage_percent": disk_usage,
"mem_usage_percent": mem_usage,
"days_until_cert_expires": delta.days,
"cert_expired": delta.days < 0,
}
return JsonResponse(ret, json_dumps_params={"indent": 2})

if settings.DOCKER_BUILD:
ret["services_running"] = "not available in docker"
else:
ret["services_running"] = {}
for k, v in services.items():
ret["services_running"][k] = sysd_svc_is_running(v)
return JsonResponse(ret, json_dumps_params={"indent": 2})

# TRMM Prometheus monitoring
elif request.method == "GET":
# get agent counts
from clients.serializers import ClientSerializer
from django.db.models import Count, Prefetch

agent_counts = ClientSerializer(
Client.objects.order_by("name").prefetch_related(
Prefetch(
"sites",
queryset=Site.objects.order_by("name")
.select_related("client")
.annotate(agent_count=Count("agents")),
to_attr="filtered_sites",
)
),
many=True,
).data

# generate agent count metrics
agent_count_metrics = []
for client in agent_counts:
for site in client["sites"]:
agent_count_metrics.append(
(
{"client": client["name"], "site": site["name"]},
site["agent_count"],
)
)

# create base prometheus metric dataset
metrics = {
"trmm_buildinfo": {
"type": "gauge",
"help": "trmm version",
"entries": [({"version": settings.TRMM_VERSION}, 1)],
},
"trmm_meshinfo": {
"type": "gauge",
"help": "meshcentral version",
"entries": [({"version": settings.MESH_VER}, 1)],
},
"trmm_natsinfo": {
"type": "gauge",
"help": "nats version",
"entries": [({"version": settings.NATS_SERVER_VER}, 1)],
},
"trmm_appinfo": {
"type": "gauge",
"help": "vue version",
"entries": [({"version": settings.APP_VER}, 1)],
},
"trmm_agentinfo": {
"type": "gauge",
"help": "latest version of trmm agent",
"entries": [({"version": settings.LATEST_AGENT_VER}, 1)],
},
"trmm_agents": {
"type": "gauge",
"help": "number of registered agents in trmm",
"entries": agent_count_metrics,
},
"trmm_cert_expiry": {
"type": "gauge",
"help": "unix timestamp of certificate expiration",
"entries": [({}, cert.not_valid_after.timestamp())],
},
}

# add service metrics if this is not a docker build
if not settings.DOCKER_BUILD:
e = []
for k, v in services.items():
e.append(({"name": v, "service": k}, int(sysd_svc_is_running(v))))

metrics["trmm_systemd_unit_state"] = {
"type": "gauge",
"help": "trmm service status for non docker builds",
"entries": e,
}

# render prometheus metrics
payload = ""
for metric, data in metrics.items():
# create help and type hints
if "help" in data:
payload += "# HELP {} {}\n".format(metric, data["help"])
payload += "# TYPE {} {}\n".format(metric, data["type"])
# populate the metrics
for labels, value in data["entries"]:
label_string = ",".join(
['{}="{}"'.format(i[0], i[1]) for i in labels.items()]
)
if label_string != "":
label_string = "{{{}}}".format(label_string)
payload += "{}{} {}\n".format(metric, label_string, value)
return HttpResponse(payload, content_type="text/plain")

# The monitoring_view decorator should prevent this state from ever occurring.
else:
return HttpResponse("It should not be possible to be here.\n", status=500)

Check warning on line 561 in api/tacticalrmm/core/views.py

View check run for this annotation

Codecov / codecov/patch

api/tacticalrmm/core/views.py#L561

Added line #L561 was not covered by tests