Skip to content

Commit eac1257

Browse files
committed
Optimize network metrics collection
Apparently, we collect network stats for *all* containers, and then discard most (or some) of these statistics: - for both crio and containerd, we collect and discard stats for non-infra containers containers (i.e. most of containers); - for docker, we collect and discard stats for containers which share netns with another container (which is rare I guess); Instead of reading and parsing a bunch of files in /proc/PID/net and when discarding the just-gathered stats, let's set things up in a way so we don't collect useless stats in the first place. This should improve performance, memory usage, and ease the load on garbage collection. Signed-off-by: Kir Kolyshkin <[email protected]>
1 parent 29e8539 commit eac1257

File tree

6 files changed

+119
-60
lines changed

6 files changed

+119
-60
lines changed

container/common/helpers.go

+17
Original file line numberDiff line numberDiff line change
@@ -437,3 +437,20 @@ func (m deviceIdentifierMap) Find(major, minor uint64, namer DeviceNamer) string
437437
m[d] = s
438438
return s
439439
}
440+
441+
// RemoveNetMetrics is used to remove any network metrics from the given MetricSet.
442+
// It returns the original set as is if remove is false, or if there are no metrics
443+
// to remove.
444+
func RemoveNetMetrics(metrics container.MetricSet, remove bool) container.MetricSet {
445+
if !remove {
446+
return metrics
447+
}
448+
449+
// Check if there is anything we can remove, to avoid useless copying.
450+
if !metrics.HasAny(container.AllNetworkMetrics) {
451+
return metrics
452+
}
453+
454+
// A copy of all metrics except for network ones.
455+
return metrics.Difference(container.AllNetworkMetrics)
456+
}

container/common/helpers_test.go

+54
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,16 @@ package common
1616

1717
import (
1818
"errors"
19+
"fmt"
1920
"math"
2021
"os"
2122
"path/filepath"
23+
"reflect"
2224
"testing"
2325

2426
"github.com/stretchr/testify/assert"
2527

28+
"github.com/google/cadvisor/container"
2629
info "github.com/google/cadvisor/info/v1"
2730
v2 "github.com/google/cadvisor/info/v2"
2831
)
@@ -193,3 +196,54 @@ func TestGetSpecCgroupV2Max(t *testing.T) {
193196

194197
assert.EqualValues(t, spec.Processes.Limit, max)
195198
}
199+
200+
func TestRemoveNetMetrics(t *testing.T) {
201+
for _, ts := range []struct {
202+
desc string
203+
in, out container.MetricSet
204+
}{
205+
{
206+
desc: "nil set",
207+
in: nil,
208+
},
209+
{
210+
desc: "empty set",
211+
in: container.MetricSet{},
212+
},
213+
{
214+
desc: "nothing to remove",
215+
in: container.MetricSet{container.MemoryUsageMetrics: struct{}{}, container.PerfMetrics: struct{}{}},
216+
},
217+
{
218+
desc: "also nothing to remove",
219+
in: container.AllMetrics.Difference(container.AllNetworkMetrics),
220+
},
221+
{
222+
desc: "remove net from all",
223+
in: container.AllMetrics,
224+
out: container.AllMetrics.Difference(container.AllNetworkMetrics),
225+
},
226+
{
227+
desc: "remove net from some",
228+
in: container.MetricSet{container.MemoryUsageMetrics: struct{}{}, container.NetworkTcpUsageMetrics: struct{}{}},
229+
out: container.MetricSet{container.MemoryUsageMetrics: struct{}{}},
230+
},
231+
} {
232+
for _, remove := range []bool{true, false} {
233+
ts, remove := ts, remove
234+
desc := fmt.Sprintf("%s, remove: %v", ts.desc, remove)
235+
t.Run(desc, func(t *testing.T) {
236+
out := RemoveNetMetrics(ts.in, remove)
237+
if !remove || ts.out == nil {
238+
// Compare the actual underlying pointers. Can't use assert.Same
239+
// because it checks for pointer type, and these are maps.
240+
if reflect.ValueOf(ts.in) != reflect.ValueOf(out) {
241+
t.Errorf("expected original map %p, got %p", ts.in, out)
242+
}
243+
} else {
244+
assert.Equal(t, ts.out, out)
245+
}
246+
})
247+
}
248+
}
249+
}

container/containerd/handler.go

+11-21
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,22 @@ func newContainerdContainerHandler(
126126
Aliases: []string{id, name},
127127
}
128128

129-
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), includedMetrics)
129+
// Containers that don't have their own network -- this includes
130+
// containers running in Kubernetes pods that use the network of the
131+
// infrastructure container -- does not need their stats to be
132+
// reported. This stops metrics being reported multiple times for each
133+
// container in a pod.
134+
metrics := common.RemoveNetMetrics(includedMetrics, cntr.Labels["io.cri-containerd.kind"] != "sandbox")
135+
136+
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), metrics)
130137

131138
handler := &containerdContainerHandler{
132139
machineInfoFactory: machineInfoFactory,
133140
cgroupPaths: cgroupPaths,
134141
fsInfo: fsInfo,
135142
envs: make(map[string]string),
136143
labels: cntr.Labels,
137-
includedMetrics: includedMetrics,
144+
includedMetrics: metrics,
138145
reference: containerReference,
139146
libcontainerHandler: libcontainerHandler,
140147
}
@@ -164,22 +171,12 @@ func (h *containerdContainerHandler) ContainerReference() (info.ContainerReferen
164171
return h.reference, nil
165172
}
166173

167-
func (h *containerdContainerHandler) needNet() bool {
168-
// Since containerd does not handle networking ideally we need to return based
169-
// on includedMetrics list. Here the assumption is the presence of cri-containerd
170-
// label
171-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
172-
//TODO change it to exported cri-containerd constants
173-
return h.labels["io.cri-containerd.kind"] == "sandbox"
174-
}
175-
return false
176-
}
177-
178174
func (h *containerdContainerHandler) GetSpec() (info.ContainerSpec, error) {
179175
// TODO: Since we dont collect disk usage stats for containerd, we set hasFilesystem
180176
// to false. Revisit when we support disk usage stats for containerd
181177
hasFilesystem := false
182-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
178+
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
179+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)
183180
spec.Labels = h.labels
184181
spec.Envs = h.envs
185182
spec.Image = h.image
@@ -204,13 +201,6 @@ func (h *containerdContainerHandler) GetStats() (*info.ContainerStats, error) {
204201
if err != nil {
205202
return stats, err
206203
}
207-
// Clean up stats for containers that don't have their own network - this
208-
// includes containers running in Kubernetes pods that use the network of the
209-
// infrastructure container. This stops metrics being reported multiple times
210-
// for each container in a pod.
211-
if !h.needNet() {
212-
stats.Network = info.NetworkStats{}
213-
}
214204

215205
// Get filesystem stats.
216206
err = h.getFsStats(stats)

container/crio/handler.go

+13-17
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,15 @@ func newCrioContainerHandler(
148148
Namespace: CrioNamespace,
149149
}
150150

151-
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, includedMetrics)
151+
// Find out if we need network metrics reported for this container.
152+
// Containers that don't have their own network -- this includes
153+
// containers running in Kubernetes pods that use the network of the
154+
// infrastructure container -- does not need their stats to be
155+
// reported. This stops metrics being reported multiple times for each
156+
// container in a pod.
157+
metrics := common.RemoveNetMetrics(includedMetrics, cInfo.Labels["io.kubernetes.container.name"] != "POD")
158+
159+
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, metrics)
152160

153161
// TODO: extract object mother method
154162
handler := &crioContainerHandler{
@@ -161,7 +169,7 @@ func newCrioContainerHandler(
161169
rootfsStorageDir: rootfsStorageDir,
162170
envs: make(map[string]string),
163171
labels: cInfo.Labels,
164-
includedMetrics: includedMetrics,
172+
includedMetrics: metrics,
165173
reference: containerReference,
166174
libcontainerHandler: libcontainerHandler,
167175
cgroupManager: cgroupManager,
@@ -210,16 +218,10 @@ func (h *crioContainerHandler) ContainerReference() (info.ContainerReference, er
210218
return h.reference, nil
211219
}
212220

213-
func (h *crioContainerHandler) needNet() bool {
214-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
215-
return h.labels["io.kubernetes.container.name"] == "POD"
216-
}
217-
return false
218-
}
219-
220221
func (h *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
221222
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
222-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
223+
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
224+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)
223225

224226
spec.Labels = h.labels
225227
spec.Envs = h.envs
@@ -306,13 +308,7 @@ func (h *crioContainerHandler) GetStats() (*info.ContainerStats, error) {
306308
return stats, err
307309
}
308310

309-
if !h.needNet() {
310-
// Clean up stats for containers that don't have their own network - this
311-
// includes containers running in Kubernetes pods that use the network of the
312-
// infrastructure container. This stops metrics being reported multiple times
313-
// for each container in a pod.
314-
stats.Network = info.NetworkStats{}
315-
} else if len(stats.Network.Interfaces) == 0 {
311+
if h.includedMetrics.Has(container.NetworkUsageMetrics) && len(stats.Network.Interfaces) == 0 {
316312
// No network related information indicates that the pid of the
317313
// container is not longer valid and we need to ask crio to
318314
// provide the pid of another container from that pod

container/docker/handler.go

+7-22
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ import (
3333
"github.com/google/cadvisor/zfs"
3434
"github.com/opencontainers/runc/libcontainer/cgroups"
3535

36-
dockercontainer "github.com/docker/docker/api/types/container"
3736
docker "github.com/docker/docker/client"
3837
"golang.org/x/net/context"
3938
"k8s.io/klog/v2"
@@ -72,9 +71,6 @@ type dockerContainerHandler struct {
7271
// Image name used for this container.
7372
image string
7473

75-
// The network mode of the container
76-
networkMode dockercontainer.NetworkMode
77-
7874
// Filesystem handler.
7975
fsHandler common.FsHandler
8076

@@ -188,6 +184,9 @@ func newDockerContainerHandler(
188184
return nil, fmt.Errorf("failed to inspect container %q: %v", id, err)
189185
}
190186

187+
// Do not report network metrics for containers that share netns with another container.
188+
metrics := common.RemoveNetMetrics(includedMetrics, ctnr.HostConfig.NetworkMode.IsContainer())
189+
191190
// TODO: extract object mother method
192191
handler := &dockerContainerHandler{
193192
machineInfoFactory: machineInfoFactory,
@@ -198,7 +197,7 @@ func newDockerContainerHandler(
198197
rootfsStorageDir: rootfsStorageDir,
199198
envs: make(map[string]string),
200199
labels: ctnr.Config.Labels,
201-
includedMetrics: includedMetrics,
200+
includedMetrics: metrics,
202201
zfsParent: zfsParent,
203202
}
204203
// Timestamp returned by Docker is in time.RFC3339Nano format.
@@ -207,7 +206,7 @@ func newDockerContainerHandler(
207206
// This should not happen, report the error just in case
208207
return nil, fmt.Errorf("failed to parse the create timestamp %q for container %q: %v", ctnr.Created, id, err)
209208
}
210-
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, includedMetrics)
209+
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, metrics)
211210

212211
// Add the name and bare ID as aliases of the container.
213212
handler.reference = info.ContainerReference{
@@ -217,7 +216,6 @@ func newDockerContainerHandler(
217216
Namespace: DockerNamespace,
218217
}
219218
handler.image = ctnr.Config.Image
220-
handler.networkMode = ctnr.HostConfig.NetworkMode
221219
// Only adds restartcount label if it's greater than 0
222220
if ctnr.RestartCount > 0 {
223221
handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
@@ -344,16 +342,10 @@ func (h *dockerContainerHandler) ContainerReference() (info.ContainerReference,
344342
return h.reference, nil
345343
}
346344

347-
func (h *dockerContainerHandler) needNet() bool {
348-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
349-
return !h.networkMode.IsContainer()
350-
}
351-
return false
352-
}
353-
354345
func (h *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
355346
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
356-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
347+
hasNetwork := h.includedMetrics.Has(container.NetworkUsageMetrics)
348+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNetwork, hasFilesystem)
357349

358350
spec.Labels = h.labels
359351
spec.Envs = h.envs
@@ -462,13 +454,6 @@ func (h *dockerContainerHandler) GetStats() (*info.ContainerStats, error) {
462454
if err != nil {
463455
return stats, err
464456
}
465-
// Clean up stats for containers that don't have their own network - this
466-
// includes containers running in Kubernetes pods that use the network of the
467-
// infrastructure container. This stops metrics being reported multiple times
468-
// for each container in a pod.
469-
if !h.needNet() {
470-
stats.Network = info.NetworkStats{}
471-
}
472457

473458
// Get filesystem stats.
474459
err = h.getFsStats(stats)

container/factory.go

+17
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ var AllMetrics = MetricSet{
9393
OOMMetrics: struct{}{},
9494
}
9595

96+
// AllNetworkMetrics represents all network metrics that cAdvisor supports.
97+
var AllNetworkMetrics = MetricSet{
98+
NetworkUsageMetrics: struct{}{},
99+
NetworkTcpUsageMetrics: struct{}{},
100+
NetworkAdvancedTcpUsageMetrics: struct{}{},
101+
NetworkUdpUsageMetrics: struct{}{},
102+
}
103+
96104
func (mk MetricKind) String() string {
97105
return string(mk)
98106
}
@@ -104,6 +112,15 @@ func (ms MetricSet) Has(mk MetricKind) bool {
104112
return exists
105113
}
106114

115+
func (ms MetricSet) HasAny(ms1 MetricSet) bool {
116+
for m := range ms1 {
117+
if _, ok := ms[m]; ok {
118+
return true
119+
}
120+
}
121+
return false
122+
}
123+
107124
func (ms MetricSet) add(mk MetricKind) {
108125
ms[mk] = struct{}{}
109126
}

0 commit comments

Comments
 (0)