Skip to content

Commit

Permalink
Prometheus ec2 monitoring (#182)
Browse files Browse the repository at this point in the history
Originally this PR was about adding prometheus monitoring to services
and nodes in the ECS cluster, but then we realized that the application
level metrics are trickier to implement because ECS deployed tasks have
a random port assigned, but [the standard ec2 discovery settings in
Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config)
require you to provide the port in advance. For this reason, in this
PR we only set up node level metrics, since nodes can have a node
exporter process running on a fixed port (see #179).

Since the ECS nodes are not reachable through the Internet, we added a
proxy server to forward scrape requests from the monitoring server to
the actual nodes.

To achieve node level metrics scraping, we: 

- Add an Nginx proxy configuration to the clickhouse proxy server to
direct traffic from the monitoring server to the actual nodes. To identify
which node each scrape request is meant for, we send the private IP
address of the node as a path parameter, and nginx rules extract it to
pick the host the metrics request is forwarded to
- Add a Prometheus relabeling configuration that takes what the
standard ec2 discovery settings provide and uses it to rewrite the
address to point to the proxy server, adding the private IP as a
path parameter
- Add the permission configurations required to allow traffic from the
monitoring server to the proxy server, and from the proxy server to the
cluster nodes

 This PR solves #171 and #172 and depends on #179
  • Loading branch information
LDiazN authored Feb 14, 2025
1 parent 12158c1 commit 0fab83c
Show file tree
Hide file tree
Showing 12 changed files with 169 additions and 23 deletions.
5 changes: 5 additions & 0 deletions ansible/deploy-clickhouse-proxy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,17 @@
become: true
roles:
- role: bootstrap
- role: dehydrated
vars:
ssl_domains:
- clickhouseproxy.dev.ooni.io
- role: nginx
tags: nginx
- role: clickhouse_proxy
vars:
clickhouse_url: "clickhouse3.prod.ooni.io"
clickhouse_port: 9000
clickhouse_proxy_public_fqdn: "clickhouseproxy.dev.ooni.io"
- role: dehydrated
vars:
ssl_domains: "clickhouseproxy.dev.ooni.io"
Expand Down
1 change: 1 addition & 0 deletions ansible/roles/clickhouse_proxy/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Base directory for TLS certificates. The prometheus proxy nginx template
# reads fullchain.pem, privkey.pem and chain.pem from
# <tls_cert_dir>/<inventory_hostname>/.
# NOTE(review): presumably populated by the dehydrated role - confirm.
tls_cert_dir: /var/lib/dehydrated/certs
22 changes: 22 additions & 0 deletions ansible/roles/clickhouse_proxy/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,17 @@
notify:
- reload nftables

# For prometheus scrape requests:
# writes an nftables rule file that accepts inbound TCP on port 9200 (the
# port the prometheus proxy nginx server listens on) and reloads nftables.
- name: Allow traffic on port 9200
  tags: prometheus-proxy
  ansible.builtin.blockinfile:
    path: /etc/ooni/nftables/tcp/9200.nft
    # Use a canonical boolean instead of the YAML 1.1 truthy `yes`.
    create: true
    block: |
      add rule inet filter input tcp dport 9200 counter accept comment "prometheus"
  notify:
    - reload nftables

- name: Create the modules-enabled directory if not exists
tags: webserv
ansible.builtin.file:
Expand All @@ -28,3 +39,14 @@
notify:
- reload nginx
- restart nginx

# Render the nginx server block that proxies Prometheus scrape requests
# to the cluster nodes, then reload/restart nginx to pick it up.
- name: Add prometheus proxy nginx config
  tags: webserv
  ansible.builtin.template:
    src: templates/prometheus-proxy.conf
    dest: /etc/nginx/conf.d/prometheus-proxy.conf
    # Quoted so the mode is never re-typed by the YAML parser; a config file
    # only needs to be readable, not executable.
    mode: "0644"
    owner: root
  notify:
    - reload nginx
    - restart nginx
16 changes: 16 additions & 0 deletions ansible/roles/clickhouse_proxy/templates/prometheus-proxy.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Reverse proxy for Prometheus node-level scrapes.
#
# Requests arrive over TLS on port 9200 as /<node-ip>/<path> and are
# forwarded over plain HTTP to port 9100 on <node-ip>, keeping <path> and
# the original query string.
server {
    listen 9200 ssl;

    server_name {{ clickhouse_proxy_public_fqdn }};

    include /etc/nginx/ssl_intermediate.conf;

    # NOTE(review): the certificate directory is keyed by inventory_hostname
    # while server_name uses clickhouse_proxy_public_fqdn - confirm both
    # resolve to the same name on every host this role is applied to.
    ssl_certificate {{tls_cert_dir}}/{{inventory_hostname}}/fullchain.pem;
    ssl_certificate_key {{tls_cert_dir}}/{{inventory_hostname}}/privkey.pem;
    ssl_trusted_certificate {{tls_cert_dir}}/{{inventory_hostname}}/chain.pem;

    proxy_ssl_server_name on;

    # The first path segment selects the upstream host, so it must be
    # validated: the pattern is anchored and only accepts a dotted IPv4
    # address, which is what the Prometheus relabeling sends. Anything else
    # (host names, nested paths) no longer matches and is not proxied.
    location ~ ^/([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/(.*)$ {
        proxy_pass http://$1:9100/$2$is_args$args;
    }
}
34 changes: 34 additions & 0 deletions ansible/roles/prometheus/templates/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -214,4 +214,38 @@ scrape_configs:
static_configs:
- targets:
- backend-hel.ooni.org:444

# EC2 instances monitoring:
# Targets come from EC2 service discovery but are not scraped directly. The
# relabel rules below rewrite every discovered <ip>:<port> so the scrape is
# sent over https to the proxy host on port 9200, with the discovered IP as
# the first path segment (/<ip>/metrics).
- job_name: 'ooni-aws-ec2'
scrape_interval: 5s
scheme: https
metrics_path: "/metrics"

# Node level metrics for cluster nodes
ec2_sd_configs:
- access_key: "{{prometheus_aws_access_key_dev}}"
secret_key: "{{prometheus_aws_secret_key_dev}}"
region: "eu-central-1"
port: 9100 # port of the discovered <ip>:<port>; the scrape itself goes to the proxy on 9200 after relabeling
relabel_configs: # Change the host to the proxy host with relabeling
# 1. Keep the discovered IP in the "ec2_host" label.
- source_labels: [__address__]
regex: "([0-9\\.]+):([0-9]+)" # <ip>:<port>
replacement: "$1"
target_label: "ec2_host"
action: "replace"
# 2. Build "<proxy>:9200/<ip>/metrics" in the helper label "proxy_host".
#    Must run before __address__ is overwritten in step 3.
- source_labels: [__address__]
regex: "([0-9\\.]+):([0-9]+)" # <ip>:<port>
replacement: "{{clickhouse_proxy_host_dev}}:9200/${1}/metrics"
target_label: "proxy_host"
action: "replace"
# 3. Everything before the first "/" becomes the scrape address.
# NOTE(review): every target ends up with the same __address__, so the
# default "instance" label is presumably identical for all nodes and only
# "ec2_host" tells them apart - confirm dashboards/alerts rely on ec2_host.
- source_labels: [proxy_host]
regex: "([^/]*)/(.*)"
replacement: "$1"
target_label: "__address__"
action: "replace"
# 4. Everything after the first "/" becomes the metrics path (/<ip>/metrics).
- source_labels: [proxy_host]
regex: "([^/]*)/(.*)"
replacement: "/$2"
target_label: "__metrics_path__"
action: "replace"
...
9 changes: 9 additions & 0 deletions ansible/roles/prometheus/vars/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,12 @@ blackbox_jobs:
- name: icmp
module: icmp
targets: "{{ dom0_hosts | list }}"

# AWS credentials used by the Prometheus EC2 service discovery (dev account).
# Both values are read from SSM parameters under
# /oonidevops/secrets/ooni_monitoring/ using the oonidevops_user_dev profile.
prometheus_aws_access_key_dev: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/access_key', profile='oonidevops_user_dev') }}"
prometheus_aws_secret_key_dev: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/secret_key', profile='oonidevops_user_dev') }}"

# Same SSM parameter paths, read with the oonidevops_user_prod profile.
prometheus_aws_access_key_prod: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/access_key', profile='oonidevops_user_prod') }}"
prometheus_aws_secret_key_prod: "{{ lookup('amazon.aws.aws_ssm', '/oonidevops/secrets/ooni_monitoring/secret_key', profile='oonidevops_user_prod') }}"

# Host that proxies Prometheus scrape requests to the EC2 nodes.
# The prod entry currently points at the dev proxy (see TODO).
clickhouse_proxy_host_dev: "clickhouseproxy.dev.ooni.io"
clickhouse_proxy_host_prod: "clickhouseproxy.dev.ooni.io" # TODO Change for prod
62 changes: 41 additions & 21 deletions tf/environments/dev/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,13 @@ module "ooniapi_cluster" {

instance_type = "t3a.micro"

monitoring_sg_ids = [
# The clickhouse proxy has an nginx configuration
# to proxy requests from the monitoring server
# to the cluster instances
module.ooni_clickhouse_proxy.ec2_sg_id
]

tags = merge(
local.tags,
{ Name = "ooni-tier0-api-ecs-cluster" }
Expand Down Expand Up @@ -411,6 +418,10 @@ module "ooniapi_reverseproxy" {
)
}

# Looks up the A records of the monitoring server. The resulting addresses
# are turned into /32 CIDR blocks for the port 9200 ingress rule of the
# clickhouse proxy, so only the monitoring server can reach the proxy.
# NOTE(review): presumably resolved at plan/apply time only - an IP change
# of monitoring.ooni.org would need a new apply; confirm.
data "dns_a_record_set" "monitoring_host" {
host = "monitoring.ooni.org"
}

module "ooni_clickhouse_proxy" {
source = "../../modules/ec2"

Expand All @@ -426,31 +437,37 @@ module "ooni_clickhouse_proxy" {

name = "oonickprx"
ingress_rules = [{
from_port = 22,
to_port = 22,
protocol = "tcp",
from_port = 22,
to_port = 22,
protocol = "tcp",
cidr_blocks = ["0.0.0.0/0"],
}, {
from_port = 80,
to_port = 80,
protocol = "tcp",
}, {
from_port = 80,
to_port = 80,
protocol = "tcp",
cidr_blocks = ["0.0.0.0/0"],
}, {
from_port = 9000,
to_port = 9000,
protocol = "tcp",
}, {
from_port = 9000,
to_port = 9000,
protocol = "tcp",
cidr_blocks = module.network.vpc_subnet_private[*].cidr_block,
}, {
// For the prometheus proxy:
from_port = 9200,
to_port = 9200,
protocol = "tcp"
cidr_blocks = [for ip in flatten(data.dns_a_record_set.monitoring_host.*.addrs) : "${tostring(ip)}/32"]
}]

egress_rules = [{
from_port = 0,
to_port = 0,
protocol = "-1",
from_port = 0,
to_port = 0,
protocol = "-1",
cidr_blocks = ["0.0.0.0/0"],
}, {
from_port = 0,
to_port = 0,
protocol = "-1",
}, {
from_port = 0,
to_port = 0,
protocol = "-1",
ipv6_cidr_blocks = ["::/0"]
}]

Expand Down Expand Up @@ -792,6 +809,9 @@ resource "aws_acm_certificate_validation" "ooniapi_frontend" {
### Ooni monitoring

module "ooni_monitoring" {
source = "../../modules/ooni_monitoring"
tags = local.tags
}
source = "../../modules/ooni_monitoring"
environment = local.environment
aws_region = var.aws_region

tags = local.tags
}
4 changes: 4 additions & 0 deletions tf/modules/ec2/outputs.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ output "aws_instance_id" {
output "aws_instance_public_dns" {
value = aws_instance.ooni_ec2.public_dns
}

# ID of the security group attached to this EC2 instance, exported so other
# modules can reference it in their own security group rules.
output "ec2_sg_id" {
value = aws_security_group.ec2_sg.id
}
8 changes: 8 additions & 0 deletions tf/modules/ecs_cluster/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ resource "aws_security_group" "container_host" {
]
}

# Let the monitoring security groups reach the node exporter on the
# container hosts. The port comes from var.node_exporter_port (default
# "9100") instead of a second hard-coded copy, so the rule follows the
# exporter port if it is ever changed.
ingress {
  protocol  = "tcp"
  from_port = var.node_exporter_port
  to_port   = var.node_exporter_port

  security_groups = var.monitoring_sg_ids
}

egress {
from_port = 0
to_port = 0
Expand Down
4 changes: 4 additions & 0 deletions tf/modules/ecs_cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ variable "instance_volume_size" {
default = "5"
}

# Security groups allowed to scrape the node exporter on the container
# hosts. Empty by default, which grants access to no additional group.
variable "monitoring_sg_ids" {
  description = "IDs of the security groups allowed to reach the node exporter port on the cluster instances"
  type        = list(string)
  default     = []
}

variable "node_exporter_port" {
default = "9100"
}
10 changes: 9 additions & 1 deletion tf/modules/ooni_monitoring/main.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Shared naming for this module: a per-environment name and the tag map
# (Name / Environment) derived from it.
# NOTE(review): no use of local.name / local.tags is visible in this hunk -
# confirm they are referenced elsewhere in the module.
locals {
name = "ecs-service-discovery-${var.environment}"

tags = {
Name = local.name
Environment = var.environment
}
}
resource "aws_iam_user" "ooni_monitoring" {
name = "oonidevops-monitoring"
}
Expand Down Expand Up @@ -34,4 +42,4 @@ resource "aws_ssm_parameter" "ooni_monitoring_secret_key" {
name = "/oonidevops/secrets/ooni_monitoring/secret_key"
type = "SecureString"
value = aws_iam_access_key.ooni_monitoring.secret
}
}
17 changes: 16 additions & 1 deletion tf/modules/ooni_monitoring/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,19 @@ variable "tags" {
description = "tags to apply to the resources"
default = {}
type = map(string)
}
}

# Required: no default, so every caller has to state the environment.
variable "environment" {
  description = "Name of the deployment environment; used in the module's resource name and Environment tag"
  type        = string
}

# Memory allocation for the task; defaults to 64.
# NOTE(review): the unit is not stated here (presumably MiB) and no use of
# this variable is visible in this hunk - confirm both.
variable "task_memory" {
description = "How much memory to allocate for this task"
type = number
default = 64
}

# Required: the region must be supplied by the calling environment.
variable "aws_region" {
description = "AWS region"
type = string
}

0 comments on commit 0fab83c

Please sign in to comment.