# Reverse-proxy scrape requests of the form /<host>/<port>/<path> to
# http://<host>:<port>/<path>, forwarding the original query string.
#
# The pattern is anchored at both ends so that the upstream host and port are
# always taken from the first two path segments; an unanchored pattern could
# match further into the URI and pick them from a later segment.
location ~ ^/([a-zA-Z0-9_\.]+)/([0-9]+)/(.*)$ {
    proxy_pass http://$1:$2/$3$is_args$args;
}
#!/usr/bin/python3
"""
ECS discovery for Prometheus file-based service discovery.

Walks every ECS cluster visible to the given credentials, collects the host
port bindings of the containers in the listed tasks, resolves the private IP
of the EC2 instance each task runs on, and optionally writes the result as a
Prometheus file-discovery targets file.

NOTE: this file is installed through the Ansible ``template`` module, so it is
rendered by Jinja2 before it reaches the host. Keep doubled curly braces and
brace-percent / brace-hash sequences out of the source.
"""
import argparse
import dataclasses
from typing import List, Dict, Any, Iterator
import logging
import os
import sys
from pathlib import Path
from datetime import datetime, timezone
import json

try:
    import boto3
except ImportError:
    # Deferred failure: `--help` and the pure helpers keep working without
    # boto3; a clear error is raised as soon as an AWS client is requested.
    boto3 = None

# The ECS Describe* calls used below accept at most 100 identifiers per call.
_DESCRIBE_BATCH_SIZE = 100

parser = argparse.ArgumentParser(
    description="""
    ECS Discovery: Find ECS-deployed services by pulling the
    data from AWS using the boto3 library.

    This command line tool will list all currently running
    ECS services with their host and port.

    You can list them into stdout or write them in a file
    compatible with prometheus file-based discovery
"""
)

parser.add_argument(
    "--show", action="store_true", help="Display discovered services in STDOUT"
)
parser.add_argument(
    "--region",
    type=str,
    help="AWS region. If not provided, will be pulled from environment: AWS_REGION",
    default=None,
)
parser.add_argument(
    "--secret-key",
    type=str,
    help="Secret Access key. If not provided, will be pulled from environment: AWS_SECRET_KEY",
    default=None,
)
parser.add_argument(
    "--access-key",
    type=str,
    help="Access key ID. If not provided, will be pulled from environment: AWS_ACCESS_KEY_ID",
    default=None,
)
parser.add_argument(
    "--output-file",
    type=str,
    # The help text used to claim nothing is written when the flag is omitted,
    # which contradicted the default below.
    help="Where to write json file with targets (default: %(default)s)",
    default="targets.json",
)


@dataclasses.dataclass
class ECSService:
    """One scrape target: a host port exposed by a container of an ECS task."""

    # Private IP of the EC2 instance; empty until _set_ec2_private_ips runs
    private_ip: str
    # Host port taken from the container's network binding
    port: int
    container_name: str
    task_arn: str
    ec2_instance_id: str
    # Name of the ECS cluster the task belongs to
    cluster: str
    # Timezone-aware (UTC) timestamp of the discovery run
    date_discovered: datetime


def _chunked(
    items: List[Any], size: int = _DESCRIBE_BATCH_SIZE
) -> Iterator[List[Any]]:
    """Yield consecutive slices of *items* holding at most *size* elements."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


class ECSDiscovery:
    """Queries ECS and EC2 through boto3 to enumerate reachable services."""

    def __init__(self, region: str, secret_key: str, access_key: str) -> None:
        """Store the credentials and create the ECS client."""
        self.region = region
        self.secret_key = secret_key
        self.access_key = access_key
        self.ecs_client = self._client("ecs")

    def _client(self, service_name: str) -> Any:
        """
        Build a boto3 client for *service_name* with the stored credentials.

        Raises RuntimeError when boto3 could not be imported.
        """
        if boto3 is None:
            raise RuntimeError(
                "boto3 is required to query AWS but could not be imported"
            )
        return boto3.client(
            service_name,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            region_name=self.region,
        )

    def list_services(self) -> List[ECSService]:
        """
        List all ECS services in every cluster

        Services whose EC2 private IP cannot be resolved are left out, since
        they would produce an unusable target.
        """
        logging.info("Retrieving clusters...")
        results: List[ECSService] = []
        for cluster_desc in self._list_clusters():
            results.extend(self._list_cluster_services(cluster_desc))

        self._set_ec2_private_ips(results)

        return [service for service in results if service.private_ip]

    def _list_cluster_services(
        self, cluster_desc: Dict[str, Any]
    ) -> List[ECSService]:
        """
        Collect one ECSService per host port binding found in a cluster.

        The private IP is left empty here; it is resolved afterwards for all
        clusters at once.
        """
        cluster_name = cluster_desc["clusterName"]

        logging.info("Retrieving tasks for cluster %s...", cluster_name)
        tasks = self._list_tasks(cluster_desc)

        logging.info("Retrieving container instance information for found tasks...")
        # map from container instance ARN to instance description
        instances_by_arn = self._list_container_instance_descriptions(
            tasks, cluster_name
        )

        discovered_at = datetime.now(timezone.utc)
        services: List[ECSService] = []
        for task in tasks:
            task_arn = task["taskArn"]

            # Tasks without a container instance (or whose instance could not
            # be described) have no EC2 host to scrape through.
            instance_desc = instances_by_arn.get(task.get("containerInstanceArn"))
            if instance_desc is None:
                logging.warning(
                    "Skipping task %s: no container instance description available",
                    task_arn,
                )
                continue
            instance_id = instance_desc["ec2InstanceId"]

            for container in task.get("containers", []):
                container_name = container["name"]
                # networkBindings may be absent when no host port is mapped
                for binding in container.get("networkBindings", []):
                    logging.info(
                        "Found port: %s for container %s in instance %s",
                        binding["hostPort"],
                        container_name,
                        instance_id,
                    )
                    services.append(
                        ECSService(
                            "",
                            port=binding["hostPort"],
                            container_name=container_name,
                            task_arn=task_arn,
                            ec2_instance_id=instance_id,
                            cluster=cluster_name,
                            date_discovered=discovered_at,
                        )
                    )
        return services

    def _list_clusters(self) -> List[Dict[str, Any]]:
        """
        List all clusters, including descriptions
        """
        cluster_arns: List[str] = []
        for page in self.ecs_client.get_paginator("list_clusters").paginate():
            cluster_arns.extend(page["clusterArns"])

        # No describe call is issued when there is nothing to describe
        descriptions: List[Dict[str, Any]] = []
        for batch in _chunked(cluster_arns):
            descriptions.extend(
                self.ecs_client.describe_clusters(clusters=batch)["clusters"]
            )
        return descriptions

    def _list_tasks(self, cluster: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        List all tasks within a cluster, including descriptions
        """
        cluster_name = cluster["clusterName"]
        cluster_arn = cluster["clusterArn"]

        task_arns: List[str] = []
        paginator = self.ecs_client.get_paginator("list_tasks")
        for page in paginator.paginate(cluster=cluster_arn):
            task_arns.extend(page["taskArns"])

        # describe_tasks is only called with a non-empty batch: an empty
        # cluster simply yields no descriptions.
        task_descriptions: List[Dict[str, Any]] = []
        for batch in _chunked(task_arns):
            task_descriptions.extend(
                self.ecs_client.describe_tasks(cluster=cluster_name, tasks=batch)[
                    "tasks"
                ]
            )
        return task_descriptions

    def _list_container_instance_descriptions(
        self, tasks: List[Dict[str, Any]], cluster_name: str
    ) -> Dict[str, Dict[str, Any]]:
        """
        List all container instances, including descriptions.

        The resulting dict has the shape: Container ARN -> Container Description
        """
        # De-duplicate while keeping order: several tasks usually share a host
        instance_ids = list(
            dict.fromkeys(
                task["containerInstanceArn"].split("/")[-1]
                for task in tasks
                if "containerInstanceArn" in task
            )
        )

        by_arn: Dict[str, Dict[str, Any]] = {}
        for batch in _chunked(instance_ids):
            response = self.ecs_client.describe_container_instances(
                cluster=cluster_name, containerInstances=batch
            )
            for description in response["containerInstances"]:
                by_arn[description["containerInstanceArn"]] = description
        return by_arn

    def _set_ec2_private_ips(self, services: List[ECSService]) -> None:
        """
        Set up the private IP for the given list of services

        Services whose instance is unknown or has no private IP keep an empty
        ``private_ip`` and a warning is logged.
        """
        instance_ids = sorted({service.ec2_instance_id for service in services})
        if not instance_ids:
            # Nothing to resolve; also avoids an unfiltered describe_instances
            return

        ec2_client = self._client("ec2")
        instances: Dict[str, Dict[str, Any]] = {}
        for batch in _chunked(instance_ids):
            response = ec2_client.describe_instances(InstanceIds=batch)
            for reservation in response["Reservations"]:
                for instance in reservation["Instances"]:
                    instances[instance["InstanceId"]] = instance

        for service in services:
            instance = instances.get(service.ec2_instance_id, {})
            private_ip = instance.get("PrivateIpAddress")
            if private_ip is None:
                logging.warning(
                    "No private IP found for instance %s (container %s)",
                    service.ec2_instance_id,
                    service.container_name,
                )
                continue
            service.private_ip = private_ip


def to_prom_json(services: List[ECSService]) -> List[Dict[str, Any]]:
    """
    Convert a list of service objects into a prometheus-compatible list of dict
    """
    # NOTE(review): date_discovered differs on every run, so the label set of
    # each target changes each time the file is rewritten; confirm that this
    # label churn is intended on the Prometheus side.
    return [
        {
            "targets": [f"{service.private_ip}:{service.port}"],
            "labels": {
                "job": service.container_name,
                "instance": service.ec2_instance_id,
                "task": service.task_arn,
                "date_discovered": service.date_discovered.isoformat(),
            },
        }
        for service in services
    ]


def _write_targets(path: Path, payload: List[Dict[str, Any]]) -> None:
    """
    Serialize *payload* as JSON into *path* through a temporary sibling file.

    The rename makes the update atomic, so a reader never observes a
    half-written file. The temporary name does not end in ``.json``.
    """
    tmp_path = path.with_name(path.name + ".tmp")
    with tmp_path.open("w", encoding="utf-8") as f:
        json.dump(payload, f)
    os.replace(tmp_path, path)


def main(args: argparse.Namespace) -> None:
    """
    Run the discovery described by the parsed command line *args*.

    Exits with status 1 when the region or one of the credentials is missing
    from both the command line and the environment.
    """
    logging.basicConfig(level=logging.INFO)

    # Collect arguments
    secret_key = args.secret_key or os.environ.get("AWS_SECRET_KEY")
    access_key = args.access_key or os.environ.get("AWS_ACCESS_KEY_ID")
    region = args.region or os.environ.get("AWS_REGION")

    # Check that all arguments are passed (an empty value counts as missing)
    mandatory_args = [
        ("secret key", secret_key),
        ("access key", access_key),
        ("region", region),
    ]
    missing = [arg_name for arg_name, arg_val in mandatory_args if not arg_val]
    for arg_name in missing:
        logging.error(
            "Missing argument: %s. You can specify it by command line arguments or environment variables, see --help",
            arg_name,
        )
    if missing:
        sys.exit(1)

    # If no show and no output file, do nothing
    if not args.show and args.output_file is None:
        return

    discovery = ECSDiscovery(region, secret_key, access_key)  # type: ignore
    services = discovery.list_services()

    if args.show:
        for service in services:
            logging.info(
                "[Cluster %s] (%s) %s:%s",
                service.cluster,
                service.container_name,
                service.private_ip,
                service.port,
            )

    # Save file to disk
    if args.output_file is not None:
        _write_targets(Path(args.output_file), to_prom_json(services))


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
#!/bin/bash
#
# Wrapper over the ECS discovery script: loads the environment variables
# exported by /etc/ooni/ecs_discovery_settings.sh and then runs the discovery,
# writing the targets file.

# Stop at the first failure (for example a missing or unreadable settings
# file) instead of going on to run the discovery without its settings.
set -euo pipefail

source /etc/ooni/ecs_discovery_settings.sh

# Quoted so that a path containing spaces or glob characters is passed as a
# single argument.
exec ecs-discovery.py --output-file "{{ecs_targets_file}}"
# Extra security group IDs (presumably those of the monitoring hosts; confirm
# with the callers of this module). In main.tf they are concatenated with the
# web security group in the container_host ingress rule that covers ports
# 32768-61000. Defaults to an empty list, which adds no extra groups.
variable "monitoring_sg_ids" {
  default = []
  type = list(string)
}
"ECS:DescribeContainerInstances", + "ECS:DescribeTasks", + "ECS:DescribeTaskDefinition", + "ECS:DescribeClusters" ] Effect = "Allow" Resource = "*"