Skip to content

Commit 0fab83c

Browse files
authored
Prometheus ec2 monitoring (#182)
Originally this PR was about adding Prometheus monitoring to both services and nodes in the ECS cluster. We then realized that application-level metrics are trickier to implement: tasks deployed on ECS are assigned a random port, whereas [the standard ec2 discovery settings in Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#ec2_sd_config) require you to provide the port in advance. For this reason, this PR only sets up node-level metrics, since nodes can run a node exporter process on a fixed port (see #179). Because the ECS nodes are not reachable from the Internet, we added a proxy server that forwards scrape requests from the monitoring server to the actual nodes.

To achieve node-level metrics scraping, we:
- Add an Nginx proxy configuration to the clickhouse proxy server to direct traffic from the monitoring server to the actual nodes. To know which node each scrape request is meant for, we send the node's private IP address as a path parameter, and nginx rules extract from it the host to forward the metrics request to.
- Add a Prometheus relabeling configuration that takes what the standard ec2 discovery settings provide and rewrites the address to point to the proxy server, adding the private IP as a path parameter.
- Add the permission configuration required to allow traffic from the monitoring server to the proxy server, and from the proxy server to the cluster nodes.

This PR solves #171 and #172 and depends on #179.
1 parent 12158c1 commit 0fab83c

File tree

12 files changed

+169
-23
lines changed

12 files changed

+169
-23
lines changed

ansible/deploy-clickhouse-proxy.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,17 @@
55
become: true
66
roles:
77
- role: bootstrap
8+
- role: dehydrated
9+
vars:
10+
ssl_domains:
11+
- clickhouseproxy.dev.ooni.io
812
- role: nginx
913
tags: nginx
1014
- role: clickhouse_proxy
1115
vars:
1216
clickhouse_url: "clickhouse3.prod.ooni.io"
1317
clickhouse_port: 9000
18+
clickhouse_proxy_public_fqdn: "clickhouseproxy.dev.ooni.io"
1419
- role: dehydrated
1520
vars:
1621
ssl_domains: "clickhouseproxy.dev.ooni.io"
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
---
# Base directory for TLS certificates. The prometheus proxy nginx template
# reads <tls_cert_dir>/<inventory_hostname>/{fullchain,privkey,chain}.pem.
tls_cert_dir: '/var/lib/dehydrated/certs'

ansible/roles/clickhouse_proxy/tasks/main.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,17 @@
99
notify:
1010
- reload nftables
1111

# For prometheus scrape requests: accept inbound TCP on port 9200 by dropping
# an nftables rule file in place and reloading nftables when it changes.
- name: Allow traffic on port 9200
  tags: prometheus-proxy
  blockinfile:
    path: /etc/ooni/nftables/tcp/9200.nft
    # Create the rule file when it does not exist yet.
    create: true
    block: |
      add rule inet filter input tcp dport 9200 counter accept comment "prometheus"
  notify:
    - reload nftables
22+
1223
- name: Create the modules-enabled directory if not exists
1324
tags: webserv
1425
ansible.builtin.file:
@@ -28,3 +39,14 @@
2839
notify:
2940
- reload nginx
3041
- restart nginx
42+
# Render the nginx server block that proxies Prometheus scrape requests and
# reload/restart nginx when the rendered file changes.
- name: Add prometheus proxy nginx config
  tags: webserv
  template:
    src: templates/prometheus-proxy.conf
    dest: /etc/nginx/conf.d/prometheus-proxy.conf
    # Plain configuration file: it does not need the execute bit. The mode is
    # quoted so the YAML parser cannot reinterpret the octal literal.
    mode: "0644"
    owner: root
  notify:
    - reload nginx
    - restart nginx
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
# TLS endpoint on port 9200 that forwards metrics requests to port 9100 of the
# host named in the first path segment:
#   https://<this host>:9200/<ipv4>/<path>  ->  http://<ipv4>:9100/<path>
server {
    listen 9200 ssl;

    server_name {{ clickhouse_proxy_public_fqdn }};

    include /etc/nginx/ssl_intermediate.conf;

    ssl_certificate         {{tls_cert_dir}}/{{inventory_hostname}}/fullchain.pem;
    ssl_certificate_key     {{tls_cert_dir}}/{{inventory_hostname}}/privkey.pem;
    ssl_trusted_certificate {{tls_cert_dir}}/{{inventory_hostname}}/chain.pem;

    proxy_ssl_server_name on;

    # The pattern is anchored and only accepts a dotted IPv4 address as the
    # first segment, so the proxy cannot be pointed at arbitrary host names.
    # $1 is the target address, $2 the remaining path; the query string is
    # passed through unchanged.
    location ~ ^/([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+)/(.*)$ {
        proxy_pass http://$1:9100/$2$is_args$args;
    }
}

ansible/roles/prometheus/templates/prometheus.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,4 +214,38 @@ scrape_configs:
214214
static_configs:
215215
- targets:
216216
- backend-hel.ooni.org:444
217+
# EC2 instances monitoring.
# Targets come from EC2 service discovery as "<ip>:<port>" and are rewritten
# below so that every scrape is sent to the proxy host on port 9200 with the
# discovered IP as the first path segment.
- job_name: 'ooni-aws-ec2'
  scheme: https
  metrics_path: '/metrics'
  scrape_interval: 5s

  # Node level metrics for cluster nodes
  ec2_sd_configs:
    - region: 'eu-central-1'
      access_key: "{{prometheus_aws_access_key_dev}}"
      secret_key: "{{prometheus_aws_secret_key_dev}}"
      # Port used in the discovered "<ip>:<port>" address; the address itself
      # is replaced by the proxy address in relabel_configs.
      port: 9100

  # Change the host to the proxy host with relabeling
  relabel_configs:
    # Keep the discovered IP (from "<ip>:<port>") in the ec2_host label.
    - action: 'replace'
      source_labels: [__address__]
      regex: '([0-9\.]+):([0-9]+)'
      target_label: 'ec2_host'
      replacement: '$1'
    # Build "<proxy>:9200/<ip>/metrics" in a helper label.
    - action: 'replace'
      source_labels: [__address__]
      regex: '([0-9\.]+):([0-9]+)'
      target_label: 'proxy_host'
      replacement: "{{clickhouse_proxy_host_dev}}:9200/${1}/metrics"
    # Everything before the first "/" becomes the scrape address.
    - action: 'replace'
      source_labels: [proxy_host]
      regex: '([^/]*)/(.*)'
      target_label: '__address__'
      replacement: '$1'
    # Everything after the first "/" becomes the metrics path.
    - action: 'replace'
      source_labels: [proxy_host]
      regex: '([^/]*)/(.*)'
      target_label: '__metrics_path__'
      replacement: '/$2'
217251
...

ansible/roles/prometheus/vars/main.yml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,3 +150,12 @@ blackbox_jobs:
150150
- name: icmp
151151
module: icmp
152152
targets: "{{ dom0_hosts | list }}"
153+
# AWS credentials for the monitoring user, read from SSM with the dev profile.
prometheus_aws_access_key_dev: >-
  {{ lookup('amazon.aws.aws_ssm',
  '/oonidevops/secrets/ooni_monitoring/access_key',
  profile='oonidevops_user_dev') }}
prometheus_aws_secret_key_dev: >-
  {{ lookup('amazon.aws.aws_ssm',
  '/oonidevops/secrets/ooni_monitoring/secret_key',
  profile='oonidevops_user_dev') }}

# Same parameters, read with the prod profile.
prometheus_aws_access_key_prod: >-
  {{ lookup('amazon.aws.aws_ssm',
  '/oonidevops/secrets/ooni_monitoring/access_key',
  profile='oonidevops_user_prod') }}
prometheus_aws_secret_key_prod: >-
  {{ lookup('amazon.aws.aws_ssm',
  '/oonidevops/secrets/ooni_monitoring/secret_key',
  profile='oonidevops_user_prod') }}

# Host names of the proxy that forwards scrape requests.
clickhouse_proxy_host_dev: 'clickhouseproxy.dev.ooni.io'
clickhouse_proxy_host_prod: 'clickhouseproxy.dev.ooni.io'  # TODO Change for prod

tf/environments/dev/main.tf

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,13 @@ module "ooniapi_cluster" {
301301

302302
instance_type = "t3a.micro"
303303

304+
monitoring_sg_ids = [
305+
# The clickhouse proxy has an nginx configuration
306+
# to proxy requests from the monitoring server
307+
# to the cluster instances
308+
module.ooni_clickhouse_proxy.ec2_sg_id
309+
]
310+
304311
tags = merge(
305312
local.tags,
306313
{ Name = "ooni-tier0-api-ecs-cluster" }
@@ -411,6 +418,10 @@ module "ooniapi_reverseproxy" {
411418
)
412419
}
413420

# Look up the A records of the monitoring server. The resulting addresses are
# turned into /32 CIDR blocks for the port 9200 ingress rule of the
# clickhouse proxy.
data "dns_a_record_set" "monitoring_host" {
  host = "monitoring.ooni.org"
}
424+
414425
module "ooni_clickhouse_proxy" {
415426
source = "../../modules/ec2"
416427

@@ -426,31 +437,37 @@ module "ooni_clickhouse_proxy" {
426437

427438
name = "oonickprx"
428439
ingress_rules = [{
429-
from_port = 22,
430-
to_port = 22,
431-
protocol = "tcp",
440+
from_port = 22,
441+
to_port = 22,
442+
protocol = "tcp",
432443
cidr_blocks = ["0.0.0.0/0"],
433-
}, {
434-
from_port = 80,
435-
to_port = 80,
436-
protocol = "tcp",
444+
}, {
445+
from_port = 80,
446+
to_port = 80,
447+
protocol = "tcp",
437448
cidr_blocks = ["0.0.0.0/0"],
438-
}, {
439-
from_port = 9000,
440-
to_port = 9000,
441-
protocol = "tcp",
449+
}, {
450+
from_port = 9000,
451+
to_port = 9000,
452+
protocol = "tcp",
442453
cidr_blocks = module.network.vpc_subnet_private[*].cidr_block,
454+
}, {
455+
// For the prometheus proxy:
456+
from_port = 9200,
457+
to_port = 9200,
458+
protocol = "tcp"
459+
cidr_blocks = [for ip in flatten(data.dns_a_record_set.monitoring_host.*.addrs) : "${tostring(ip)}/32"]
443460
}]
444461

445462
egress_rules = [{
446-
from_port = 0,
447-
to_port = 0,
448-
protocol = "-1",
463+
from_port = 0,
464+
to_port = 0,
465+
protocol = "-1",
449466
cidr_blocks = ["0.0.0.0/0"],
450-
}, {
451-
from_port = 0,
452-
to_port = 0,
453-
protocol = "-1",
467+
}, {
468+
from_port = 0,
469+
to_port = 0,
470+
protocol = "-1",
454471
ipv6_cidr_blocks = ["::/0"]
455472
}]
456473

@@ -792,6 +809,9 @@ resource "aws_acm_certificate_validation" "ooniapi_frontend" {
792809
### Ooni monitoring
793810

794811
module "ooni_monitoring" {
795-
source = "../../modules/ooni_monitoring"
796-
tags = local.tags
797-
}
812+
source = "../../modules/ooni_monitoring"
813+
environment = local.environment
814+
aws_region = var.aws_region
815+
816+
tags = local.tags
817+
}

tf/modules/ec2/outputs.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,7 @@ output "aws_instance_id" {
55
output "aws_instance_public_dns" {
66
value = aws_instance.ooni_ec2.public_dns
77
}
8+
# Exposes the security group so other modules can reference it in their own
# security group rules.
output "ec2_sg_id" {
  description = "ID of this module's ec2_sg security group"
  value       = aws_security_group.ec2_sg.id
}

tf/modules/ecs_cluster/main.tf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,14 @@ resource "aws_security_group" "container_host" {
120120
]
121121
}
122122

# Allow the security groups listed in monitoring_sg_ids to reach the node
# exporter port. The port comes from var.node_exporter_port (default "9100")
# so this rule stays in sync with the module's port setting.
ingress {
  protocol  = "tcp"
  from_port = var.node_exporter_port
  to_port   = var.node_exporter_port

  security_groups = var.monitoring_sg_ids
}
130+
123131
egress {
124132
from_port = 0
125133
to_port = 0

tf/modules/ecs_cluster/variables.tf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ variable "instance_volume_size" {
6262
default = "5"
6363
}
6464

variable "monitoring_sg_ids" {
  description = "IDs of the security groups allowed to reach the node exporter port on the container hosts"
  type        = list(string)
  default     = []
}
68+
6569
variable "node_exporter_port" {
6670
default = "9100"
6771
}

tf/modules/ooni_monitoring/main.tf

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
# Name and tag values derived from the environment.
locals {
  name = format("ecs-service-discovery-%s", var.environment)

  tags = {
    Environment = var.environment
    Name        = local.name
  }
}
19
resource "aws_iam_user" "ooni_monitoring" {
210
name = "oonidevops-monitoring"
311
}
@@ -34,4 +42,4 @@ resource "aws_ssm_parameter" "ooni_monitoring_secret_key" {
3442
name = "/oonidevops/secrets/ooni_monitoring/secret_key"
3543
type = "SecureString"
3644
value = aws_iam_access_key.ooni_monitoring.secret
37-
}
45+
}

tf/modules/ooni_monitoring/variables.tf

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,19 @@ variable "tags" {
22
description = "tags to apply to the resources"
33
default = {}
44
type = map(string)
5-
}
5+
}
6+
variable "environment" {
  description = "Environment name; used in the module's local name and Environment tag"
  type        = string
}

variable "task_memory" {
  description = "How much memory to allocate for this task"
  type        = number
  default     = 64
}

variable "aws_region" {
  description = "AWS region"
  type        = string
}

0 commit comments

Comments
 (0)