Skip to content

Commit

Permalink
metrics: Add node label to sriov_* metrics
Browse files Browse the repository at this point in the history
It might happen that two SR-IOV pods, deployed on different node, are using devices
with the same PCI address. In such cases, the query suggested [1] by the sriov-network-metrics-exporter produces the error:

```

Error loading values found duplicate series for the match group {pciAddr="0000:3b:02.4"} on the right hand-side of the operation:
    [
        {
            __name__="sriov_kubepoddevice",
            container="test",
            dev_type="openshift.io/intelnetdevice",
            endpoint="sriov-network-metrics",
            instance="10.1.98.60:9110",
            job="sriov-network-metrics-exporter-service",
            namespace="cnf-4916",
            pciAddr="0000:3b:02.4",
            pod="pod-cnfdr22.telco5g.eng.rdu2.redhat.com",
            prometheus="openshift-monitoring/k8s",
            service="sriov-network-metrics-exporter-service"
        }, {
            __name__="sriov_kubepoddevice",
            container="test",
            dev_type="openshift.io/intelnetdevice",
            endpoint="sriov-network-metrics",
            instance="10.1.98.230:9110",
            job="sriov-network-metrics-exporter-service",
            namespace="cnf-4916",
            pciAddr="0000:3b:02.4",
            pod="pod-dhcp-98-230.telco5g.eng.rdu2.redhat.com",
            prometheus="openshift-monitoring/k8s",
            service="sriov-network-metrics-exporter-service"
        }
    ];many-to-many matching not allowed: matching labels must be unique on one side
```

Configure the ServiceMonitor resource to add a `node` label to all metrics.
The right query to get metrics, as updated in the PrometheusRule, will be:

```
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
```

Also remove `pod`,  `namespace` and `container` label from the `sriov_vf_*` metrics, as they were
wrongly set to `sriov-network-metrics-exporter-zj2n9`, `openshift-sriov-network-operator`, `kube-rbac-proxy`

[1] https://github.com/k8snetworkplumbingwg/sriov-network-metrics-exporter/blob/0f6a784f377ede87b95f31e569116ceb9775b5b9/README.md?plain=1#L38

Signed-off-by: Andrea Panattoni <[email protected]>
  • Loading branch information
zeeke committed Sep 12, 2024
1 parent 504ce7d commit 95737cb
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 30 deletions.
16 changes: 8 additions & 8 deletions bindata/manifests/metrics-exporter/metrics-prometheus-rule.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,28 @@ spec:
interval: 30s
rules:
- expr: |
sriov_vf_tx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_packets
- expr: |
sriov_vf_rx_packets * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_packets * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_packets
- expr: |
sriov_vf_tx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_bytes
- expr: |
sriov_vf_rx_bytes * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_bytes * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_bytes
- expr: |
sriov_vf_tx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_tx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_tx_dropped
- expr: |
sriov_vf_rx_dropped * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_dropped * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_dropped
- expr: |
sriov_vf_rx_broadcast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_broadcast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_broadcast
- expr: |
sriov_vf_rx_multicast * on (pciAddr) group_left(pod,namespace,dev_type) sriov_kubepoddevice
sriov_vf_rx_multicast * on (pciAddr,node) group_left(pod,namespace,dev_type) sriov_kubepoddevice
record: network:sriov_vf_rx_multicast
{{ end }}

11 changes: 11 additions & 0 deletions bindata/manifests/metrics-exporter/metrics-prometheus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@ spec:
bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token"
scheme: "https"
honorLabels: true
relabelings:
- action: replace
sourceLabels:
- __meta_kubernetes_endpoint_node_name
targetLabel: node
- action: labeldrop
regex: pod
- action: labeldrop
regex: container
- action: labeldrop
regex: namespace
tlsConfig:
serverName: sriov-network-metrics-exporter-service.{{.Namespace}}.svc
caFile: /etc/prometheus/configmaps/serving-certs-ca-bundle/service-ca.crt
Expand Down
93 changes: 71 additions & 22 deletions test/conformance/tests/test_exporter_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/prometheus/common/model"

corev1 "k8s.io/api/core/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

. "github.com/onsi/ginkgo/v2"
Expand All @@ -30,10 +31,6 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
var nic *sriovv1.InterfaceExt

BeforeAll(func() {
if cluster.VirtualCluster() {
Skip("IGB driver does not support VF statistics")
}

err := namespaces.Create(namespaces.Test, clients)
Expect(err).ToNot(HaveOccurred())

Expand Down Expand Up @@ -73,6 +70,9 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
})

It("collects metrics regarding receiving traffic via VF", func() {
if cluster.VirtualCluster() {
Skip("IGB driver does not support VF statistics")
}

pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
Expand All @@ -98,27 +98,76 @@ var _ = Describe("[sriov] Metrics Exporter", Ordered, func() {
Expect(finalRxPackets).Should(BeNumerically(">", initialRxPackets))
})

It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)
Context("When Prometheus operator is available", func() {
BeforeEach(func() {
_, err := clients.ServiceMonitors(operatorNamespace).List(context.Background(), metav1.ListOptions{})
if k8serrors.IsNotFound(err) {
Skip("Prometheus operator not available in the cluster")
}
})

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}
It("PrometheusRule should provide namespaced metrics", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

namespacedMetricNames := []string{
"network:sriov_vf_rx_bytes",
"network:sriov_vf_tx_bytes",
"network:sriov_vf_rx_packets",
"network:sriov_vf_tx_packets",
"network:sriov_vf_rx_dropped",
"network:sriov_vf_tx_dropped",
"network:sriov_vf_rx_broadcast",
"network:sriov_vf_rx_multicast",
}

Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
Eventually(func(g Gomega) {
for _, metricName := range namespacedMetricNames {
values := runPromQLQuery(fmt.Sprintf(`%s{namespace="%s",pod="%s"}`, metricName, pod.Namespace, pod.Name))
g.Expect(values).ToNot(BeEmpty(), "no value for metric %s", metricName)
}
}, "90s", "1s").Should(Succeed())
})

It("Metrics should have the correct labels", func() {
pod := createTestPod(node, []string{"test-me-network"})
DeferCleanup(namespaces.CleanPods, namespaces.Test, clients)

metricsName := []string{
"sriov_vf_rx_bytes",
"sriov_vf_tx_bytes",
"sriov_vf_rx_packets",
"sriov_vf_tx_packets",
"sriov_vf_rx_dropped",
"sriov_vf_tx_dropped",
"sriov_vf_rx_broadcast",
"sriov_vf_rx_multicast",
}
}, "40s", "1s").Should(Succeed())

Eventually(func(g Gomega) {
for _, metricName := range metricsName {
samples := runPromQLQuery(metricName)
g.Expect(samples).ToNot(BeEmpty(), "no value for metric %s", metricName)
g.Expect(samples[0].Metric).To(And(
HaveKey("node"),
HaveKey("pciAddress"),
HaveKey("pf"),
HaveKey("vf"),
))
}
}, "90s", "1s").Should(Succeed())

// sriov_kubepoddevice has a different sets of label than statistics metrics
samples := runPromQLQuery(fmt.Sprintf(`sriov_kubepoddevice{namespace="%s",pod="%s"}`, pod.Namespace, pod.Name))
Expect(samples).ToNot(BeEmpty(), "no value for metric sriov_kubepoddevice")
Expect(samples[0].Metric).To(And(
HaveKeyWithValue("node", pod.Spec.NodeName),
HaveKey("pciAddress"),
HaveKeyWithValue("dev_type", "openshift.io/test-me-network"),
HaveKeyWithValue("namespace", pod.Namespace),
HaveKeyWithValue("pod", pod.Name),
))
})
})
})

Expand Down

0 comments on commit 95737cb

Please sign in to comment.