diff --git a/README.md b/README.md index 148e39ce..e15c7a58 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,7 @@ The operator's workflow can be described in two different architectural models: ModSecret[Include: modify_alertmanager_secret] Reencode[Re-encode Alertmanager Content] PatchSecret[Patch alertmanager-main Secret] + AddPrometheusRule[Add PrometheusRule] UpdateCR[Update CR Status to ConfigUpdated] Init --> GetClusterName GetClusterName --> CheckIntegration @@ -129,8 +130,8 @@ The operator's workflow can be described in two different architectural models: GO -->|Return: Endpoint| ConfigureSlack ConfigureSlack --> ModSecret ModSecret --> PatchSecret - PatchSecret --> UpdateCR - CheckIntegration -- Integration exists --> UpdateCR + PatchSecret --> AddPrometheusRule + AddPrometheusRule --> UpdateCR ``` *Operator Workflow in Standalone Cluster:* @@ -142,7 +143,7 @@ The operator's workflow can be described in two different architectural models: *In-Cluster Configuration Management:* The operator directly applies configuration changes within the cluster, bypassing the need for `Syncsets`. - It ensures the Alertmanager's alert forwarding settings are correctly configured for seamless communication with Grafana On Call. + It ensures the Alertmanager's alert forwarding settings are correctly configured for seamless communication with Grafana On Call. Additionally, it adds an On Call heartbeat, which acts as monitoring for the monitoring system itself. It also creates a `PrometheusRule` whose always-firing `vector(1)` alert serves as the heartbeat generator. *Local Secret Management:* Managing the `alertmanager-main-generated` secret locally, the operator updates its configurations. 
diff --git a/bundle/manifests/grafana-cloud-ansible-operator.clusterserviceversion.yaml b/bundle/manifests/grafana-cloud-ansible-operator.clusterserviceversion.yaml index 4847a4c9..6146a0f5 100644 --- a/bundle/manifests/grafana-cloud-ansible-operator.clusterserviceversion.yaml +++ b/bundle/manifests/grafana-cloud-ansible-operator.clusterserviceversion.yaml @@ -157,6 +157,12 @@ spec: - patch - update - watch + - apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + verbs: + - '*' - apiGroups: - "" resources: diff --git a/charts/grafana-oncall/templates/role.yaml b/charts/grafana-oncall/templates/role.yaml index e67bfd78..bcf0160d 100644 --- a/charts/grafana-oncall/templates/role.yaml +++ b/charts/grafana-oncall/templates/role.yaml @@ -76,6 +76,9 @@ rules: - apiGroups: ["slack.stakater.com"] resources: ["channels"] verbs: ["get", "list", "watch"] + - apiGroups: ["monitoring.coreos.com"] + resources: ["prometheusrules"] + verbs: ["*"] - apiGroups: ["hive.openshift.io"] resources: ["syncsets"] verbs: ["*"] diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index e3ad5b85..df588490 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -78,6 +78,9 @@ rules: - apiGroups: ["slack.stakater.com"] resources: ["channels"] verbs: ["create", "get", "list", "patch", "update", "watch"] + - apiGroups: ["monitoring.coreos.com"] + resources: ["prometheusrules"] + verbs: ["*"] - apiGroups: [""] resources: ["secrets"] verbs: ["*"] diff --git a/roles/grafana_cloud_operator/tasks/grafana_oncall_standalone.yml b/roles/grafana_cloud_operator/tasks/grafana_oncall_standalone.yml index 98929df7..ba0bd0fb 100644 --- a/roles/grafana_cloud_operator/tasks/grafana_oncall_standalone.yml +++ b/roles/grafana_cloud_operator/tasks/grafana_oncall_standalone.yml @@ -103,6 +103,28 @@ alertmanager.yaml: "{{ encoded_alertmanager_secret_content }}" when: not integration_exists_for_cluster +- name: Add prometheus rule for cluster + kubernetes.core.k8s: + state: present + 
namespace: "openshift-monitoring" + definition: + apiVersion: monitoring.coreos.com/v1 + kind: PrometheusRule + metadata: + name: "heartbeat-grafana-oncall" + namespace: "openshift-monitoring" + spec: + groups: + - name: meta + rules: + - alert: heartbeat + annotations: + description: This is a heartbeat alert for Grafana OnCall + summary: Heartbeat for Grafana OnCall + expr: vector(1) + labels: + severity: none + - name: Update CR status to ConfigUpdated kubernetes.core.k8s: state: present diff --git a/roles/grafana_cloud_operator/tasks/modify_alertmanager_secret.yml b/roles/grafana_cloud_operator/tasks/modify_alertmanager_secret.yml index dec0509b..98d16129 100644 --- a/roles/grafana_cloud_operator/tasks/modify_alertmanager_secret.yml +++ b/roles/grafana_cloud_operator/tasks/modify_alertmanager_secret.yml @@ -4,10 +4,21 @@ {{ (fetched_alertmanager_secret.resources[0].data['alertmanager.yaml'] | b64decode | from_yaml) | combine({ 'receivers': [ { - 'name': receiver_name, - 'webhook_configs': [{ - 'url': receiver_url - }] + "name": receiver_name, + "webhook_configs": [ + { + "url": receiver_url + } + ] + }, + { + "name": "grafana-oncall-heartbeat", + "webhook_configs": [ + { + "url": receiver_url + "heartbeat/", + "send_resolved": false + } + ] } ], 'route': { @@ -18,6 +29,15 @@ 'match': { 'severity': 'info | warning | critical' } + }, + { + "match": { + "alertname": "heartbeat" + }, + "receiver": "grafana-oncall-heartbeat", + "group_wait": "0s", + "group_interval": "1m", + "repeat_interval": "50s" } ] }