App level metrics monitoring with Prometheus #186

Merged 21 commits on Feb 18, 2025
@@ -10,7 +10,7 @@ server {
ssl_trusted_certificate {{tls_cert_dir}}/{{inventory_hostname}}/chain.pem;

proxy_ssl_server_name on;
-location ~ /([a-zA-Z0-9_\.]+)/(.*) {
-    proxy_pass http://$1:9100/$2$is_args$args;
+location ~ /([a-zA-Z0-9_\.]+)/([0-9]+)/(.*) {
+    proxy_pass http://$1:$2/$3$is_args$args;
}
}
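Note: the rewritten location block now captures the target port from the request path as well, so a scrape path such as /some-host/9091/metrics is proxied to http://some-host:9091/metrics, whereas previously the port was hard-coded to 9100 (the node exporter port). The hostname and port in this example are illustrative.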
6 changes: 6 additions & 0 deletions ansible/roles/monitoring/defaults/main.yml
@@ -1 +1,7 @@
enable_log_ingestion: false

# ECS monitoring
ecs_aws_region: eu-central-1
monitoring_secret_key: "{{lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/secret_key', profile='oonidevops_user_dev')}}"
monitoring_access_key: "{{lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/access_key', profile='oonidevops_user_dev')}}"
ecs_targets_file: "/var/lib/prometheus/file_discovery/targets.json"
59 changes: 59 additions & 0 deletions ansible/roles/monitoring/tasks/main.yml
@@ -53,6 +53,64 @@
mode: 0644
owner: root

# Prometheus service discovery
- name: Create service discovery user
tags: monitoring, prometheus
user:
name: "ecs_discovery"
comment: "(Created by Ansible) System user that runs the service discovery script to find scrape targets in ECS"
shell: "/sbin/nologin"
create_home: no
system: true

- name: Create file-discovery directory for prometheus
tags: monitoring, prometheus
file:
path: /var/lib/prometheus/file_discovery
state: directory
owner: ecs_discovery
group: ecs_discovery

- name: Add script for ecs-discovery
tags: monitoring, prometheus
template:
src: templates/ecs-discovery.py
dest: /usr/bin/
mode: '555'
owner: ecs_discovery
group: ecs_discovery

- name: Add settings file for ecs-discovery
tags: monitoring, prometheus
template:
src: templates/ecs_discovery_settings.sh
dest: /etc/ooni/ecs_discovery_settings.sh
mode: '400'
owner: ecs_discovery
group: ecs_discovery

- name: Add the wrapper script for ecs-discovery
tags: monitoring, prometheus
template:
src: templates/ecs_discovery_wrapper.sh
dest: /usr/bin/ecs_discovery_wrapper.sh
mode: '500'
owner: ecs_discovery
group: ecs_discovery

- name: Add cronjob for running the ecs-discovery script (DEV environment)
tags: monitoring, prometheus, cron
cron:
name: "ECS Service Discovery"
user: ecs_discovery # TODO: discuss if it's a good idea to use the root user for this
minute: "*/5"
hour: "*"
day: "*"
month: "*"
weekday: "*"
job: "ecs_discovery_wrapper.sh 2>&1 | /usr/bin/logger -t [ECS_DISCOVERY]"
state: present

- name: Installs grafana
tags: monitoring, grafana
apt:
@@ -141,6 +199,7 @@
- python3-tables
- python3-tqdm
- python3-ujson
- python3-boto3

- name: Install jupyter.service
tags: jupyter
275 changes: 275 additions & 0 deletions ansible/roles/monitoring/templates/ecs-discovery.py
@@ -0,0 +1,275 @@
#! /usr/bin/python3
import argparse
import dataclasses
from typing import List, Dict, Any
import logging
import os
from pathlib import Path
from datetime import datetime, timezone
import json

import boto3

parser = argparse.ArgumentParser(
description="""
ECS Discovery: Find ECS-deployed services by pulling the
data from AWS using the boto3 library.

This command line tool will list all currently running
ECS services with their host and port.

You can print them to stdout or write them to a file
compatible with Prometheus file-based discovery.
"""
)

parser.add_argument(
"--show", action="store_true", help="Display discovered services in STDOUT"
)
parser.add_argument(
"--region",
type=str,
help="AWS region. If not provided, will be pulled from environment: AWS_REGION",
default=None,
)
parser.add_argument(
"--secret-key",
type=str,
help="Secret Access key. If not provided, will be pulled from environment: AWS_SECRET_KEY",
default=None,
)
parser.add_argument(
"--access-key",
type=str,
help="Access key ID. If not provided, will be pulled from environment: AWS_ACCESS_KEY_ID",
default=None,
)
parser.add_argument(
"--output-file",
type=str,
help="Where to write json file with targets. If set to 'None', it won't write to disk. Defaults to ./targets.json",
default="targets.json",
)


@dataclasses.dataclass
class ECSService:
private_ip: str
port: int
container_name: str
task_arn: str
ec2_instance_id: str
cluster: str
date_discovered : datetime


class ECSDiscovery:

def __init__(self, region: str, secret_key: str, access_key: str) -> None:
self.region = region
self.secret_key = secret_key
self.access_key = access_key
self.ecs_client = boto3.client(
"ecs",
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
region_name=self.region,
)

def list_services(self) -> List[ECSService]:
"""
List all ECS services in every cluster
"""

logging.info("Retrieving clusters...")
clusters = self._list_clusters()
results: List[ECSService] = []

for cluster_desc in clusters:
cluster_name = cluster_desc["clusterName"]

logging.info(f"Retrieving tasks for cluster {cluster_name}...")
tasks = self._list_tasks(cluster_desc)

logging.info(
f"Retrieving container instance information for found tasks..."
)
# map from container instance ARN to Instance description
container_instance_descriptions = (
self._list_container_instance_descriptions(tasks, cluster_name)
)

for task_description in tasks:
task_arn = task_description["taskArn"]

# Describe container instance
container_instance_arn = task_description["containerInstanceArn"]
container_instance_description = container_instance_descriptions[
container_instance_arn
]
instance_id = container_instance_description["ec2InstanceId"]

for container in task_description["containers"]:

container_name = container["name"]
for binding in container["networkBindings"]:
# Get the task port
logging.info(
f"Found port: {binding['hostPort']} for container {container_name} in instance {instance_id}",
)

# The private IP of the EC2 instance is not known yet; it is filled in by _set_ec2_private_ips below
results.append(
ECSService(
"",
port=binding["hostPort"],
container_name=container_name,
task_arn=task_arn,
ec2_instance_id=instance_id,
cluster=cluster_name,
date_discovered=datetime.now(timezone.utc)
)
)

self._set_ec2_private_ips(results)

return results

def _list_clusters(self) -> List[Dict[str, Any]]:
"""
List all clusters, including descriptions
"""
clusters = self.ecs_client.list_clusters()
clusters_arns = clusters["clusterArns"]
cluster_descriptions = self.ecs_client.describe_clusters(
clusters=clusters_arns
)["clusters"]

return cluster_descriptions

def _list_tasks(self, cluster: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
List all tasks within a cluster, including descriptions
"""
cluster_name = cluster["clusterName"]
cluster_arn = cluster["clusterArn"]
tasks = self.ecs_client.list_tasks(cluster=cluster_arn)
task_arns = tasks["taskArns"]
task_descriptions = self.ecs_client.describe_tasks(
cluster=cluster_name, tasks=task_arns
)["tasks"]
return task_descriptions

def _list_container_instance_descriptions(
self, tasks: List[Dict[str, Any]], cluster_name: str
) -> Dict[str, Dict[str, Any]]:
"""
List all container instances, including descriptions.

The resulting dict has the shape: Container ARN -> Container Description
"""

container_instances_ids = []
for task_description in tasks:
container_instance_arn = task_description["containerInstanceArn"]
container_instance_id = container_instance_arn.split("/")[-1]
container_instances_ids.append(container_instance_id)

# This will map from container instance arn to its description
container_arn_to_description = {}
container_instances_descriptions = self.ecs_client.describe_container_instances(
cluster=cluster_name, containerInstances=container_instances_ids
)
for container_inst_desc in container_instances_descriptions[
"containerInstances"
]:
container_arn_to_description[
container_inst_desc["containerInstanceArn"]
] = container_inst_desc

return container_arn_to_description

def _set_ec2_private_ips(self, services: List[ECSService]):
"""
Set up the private IP for the given list of services
"""

ec2_client = boto3.client(
"ec2",
aws_access_key_id=self.access_key,
aws_secret_access_key=self.secret_key,
region_name=self.region,
)
instance_ids = [service.ec2_instance_id for service in services]
instance_description = ec2_client.describe_instances(InstanceIds=instance_ids)

instances = {}
for reservation in instance_description["Reservations"]:
for instance in reservation["Instances"]:
instance_id = instance["InstanceId"]
instances[instance_id] = instance

for service in services:
private_ip = instances[service.ec2_instance_id]["PrivateIpAddress"]
service.private_ip = private_ip

def to_prom_json(services : List[ECSService]) -> List[Dict[str, Any]]:
"""
Convert a list of service objects into a prometheus-compatible list of dict
"""
services_json = []

for service in services:
services_json.append({
"targets" : [
f"{service.private_ip}:{service.port}"
],
"labels": {
"job" : service.container_name,
"instance" : service.ec2_instance_id,
"task" : service.task_arn,
"date_discovered" : service.date_discovered.isoformat()
}
})

return services_json


def main(args : argparse.Namespace):

logging.basicConfig(level=logging.INFO)

# Collect arguments
secret_key = args.secret_key or os.environ.get("AWS_SECRET_KEY")
access_key = args.access_key or os.environ.get("AWS_ACCESS_KEY_ID")
region = args.region or os.environ.get("AWS_REGION")

# Check that all arguments are passed
mandatory_args = [('secret key', secret_key), ('access key', access_key), ('region', region)]
for (arg_name, arg_val) in mandatory_args:
if arg_val is None:
logging.error(f"Missing argument: {arg_name}. You can specify it by command line arguments or environment variables, see --help")
exit(1)

# Nothing to do if neither --show nor an output file was requested
if not args.show and args.output_file == "None":
return

discovery = ECSDiscovery(region, secret_key, access_key) # type: ignore
services = discovery.list_services()

if args.show:
for service in services:
logging.info(f"[Cluster {service.cluster}] ({service.container_name}) {service.private_ip}:{service.port}")

# Save file to disk
if args.output_file != "None":
services_json = to_prom_json(services)
path = Path(args.output_file)
with path.open("w") as f:
json.dump(services_json, f)


if __name__ == "__main__":
args = parser.parse_args()
main(args)
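For reference, a minimal sketch of the structure that to_prom_json() returns and json.dump() writes to the targets file, for a single made-up container (every value below is illustrative, not taken from a real cluster, and this snippet is not part of the PR):

example_targets = [
    {
        "targets": ["10.0.12.34:32768"],
        "labels": {
            "job": "example-container",
            "instance": "i-0abc123def4567890",
            "task": "arn:aws:ecs:eu-central-1:123456789012:task/example-task-id",
            "date_discovered": "2025-02-18T12:00:00+00:00",
        },
    },
]

This matches the list-of-targets-plus-labels shape that Prometheus file-based service discovery expects; presumably a file_sd_configs entry pointing at {{ecs_targets_file}} is configured on the Prometheus side, which is not shown in this excerpt.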
8 changes: 8 additions & 0 deletions ansible/roles/monitoring/templates/ecs_discovery_settings.sh
@@ -0,0 +1,8 @@
#! /bin/bash

# This is a configuration file for the ECS discovery cronjob that discovers ECS tasks
# to be monitored by Prometheus

export AWS_REGION={{ecs_aws_region}}
export AWS_SECRET_KEY={{monitoring_secret_key}}
export AWS_ACCESS_KEY_ID={{monitoring_access_key}}
8 changes: 8 additions & 0 deletions ansible/roles/monitoring/templates/ecs_discovery_wrapper.sh
@@ -0,0 +1,8 @@
#! /bin/bash

# This script is a wrapper around the ECS discovery script; it sets the required environment variables

# We read the environment variables from /etc/ooni/ecs_discovery_settings.sh

source /etc/ooni/ecs_discovery_settings.sh
ecs-discovery.py --output-file {{ecs_targets_file}}
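For a manual check, the underlying script can also be run with --show (and --output-file None) to log the discovered services to stdout without writing the targets file, per the command-line flags defined in ecs-discovery.py above.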