Skip to content

Commit 493a5b4

Browse files
committed
Optimize network metrics collection
Apparently, we collect network stats for all containers, and then discarding some or most of it: - for docker, we collect and discard stats for containers which share netns with another container (which is rare I guess); - for both crio and containerd, we collect and discard stats for containers that are not infra (sandbox, pod, pause) containers (which is very common). Instead of reading and parsing a bunch of files in /proc/PID/net and when removing those, let's set things up in a way so we don't collect useless stats in the first place. This should improve performance, memory usage, and ease the load on garbage collection. Signed-off-by: Kir Kolyshkin <[email protected]>
1 parent 29e8539 commit 493a5b4

File tree

4 files changed

+89
-41
lines changed

4 files changed

+89
-41
lines changed

container/containerd/handler.go

+28-16
Original file line numberDiff line numberDiff line change
@@ -126,15 +126,23 @@ func newContainerdContainerHandler(
126126
Aliases: []string{id, name},
127127
}
128128

129-
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), includedMetrics)
129+
// Find out if we need network metrics reported for this container.
130+
// Containers that don't have their own network -- this includes
131+
// containers running in Kubernetes pods that use the network of the
132+
// infrastructure container -- does not need their stats to be
133+
// reported. This stops metrics being reported multiple times for each
134+
// container in a pod.
135+
metrics := maybeRemoveNet(includedMetrics, cntr.Labels)
136+
137+
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootfs, int(taskPid), metrics)
130138

131139
handler := &containerdContainerHandler{
132140
machineInfoFactory: machineInfoFactory,
133141
cgroupPaths: cgroupPaths,
134142
fsInfo: fsInfo,
135143
envs: make(map[string]string),
136144
labels: cntr.Labels,
137-
includedMetrics: includedMetrics,
145+
includedMetrics: metrics,
138146
reference: containerReference,
139147
libcontainerHandler: libcontainerHandler,
140148
}
@@ -164,22 +172,33 @@ func (h *containerdContainerHandler) ContainerReference() (info.ContainerReferen
164172
return h.reference, nil
165173
}
166174

167-
func (h *containerdContainerHandler) needNet() bool {
175+
func maybeRemoveNet(metrics container.MetricSet, labels map[string]string) container.MetricSet {
168176
// Since containerd does not handle networking ideally we need to return based
169177
// on includedMetrics list. Here the assumption is the presence of cri-containerd
170-
// label
171-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
172-
//TODO change it to exported cri-containerd constants
173-
return h.labels["io.cri-containerd.kind"] == "sandbox"
178+
// label.
179+
180+
// TODO change it to exported cri-containerd constants.
181+
if labels["io.cri-containerd.kind"] == "sandbox" {
182+
return metrics
174183
}
175-
return false
184+
185+
// This is not a pod/sandbox container; remove network metrics.
186+
187+
// Check if there is anything we can remove.
188+
if !metrics.HasAny(container.AllNetworkMetrics) {
189+
return metrics
190+
}
191+
192+
// A copy with all metrics set except for network ones.
193+
return metrics.Difference(container.AllNetworkMetrics)
176194
}
177195

178196
func (h *containerdContainerHandler) GetSpec() (info.ContainerSpec, error) {
179197
// TODO: Since we dont collect disk usage stats for containerd, we set hasFilesystem
180198
// to false. Revisit when we support disk usage stats for containerd
181199
hasFilesystem := false
182-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
200+
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
201+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)
183202
spec.Labels = h.labels
184203
spec.Envs = h.envs
185204
spec.Image = h.image
@@ -204,13 +223,6 @@ func (h *containerdContainerHandler) GetStats() (*info.ContainerStats, error) {
204223
if err != nil {
205224
return stats, err
206225
}
207-
// Clean up stats for containers that don't have their own network - this
208-
// includes containers running in Kubernetes pods that use the network of the
209-
// infrastructure container. This stops metrics being reported multiple times
210-
// for each container in a pod.
211-
if !h.needNet() {
212-
stats.Network = info.NetworkStats{}
213-
}
214226

215227
// Get filesystem stats.
216228
err = h.getFsStats(stats)

container/crio/handler.go

+25-7
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,15 @@ func newCrioContainerHandler(
148148
Namespace: CrioNamespace,
149149
}
150150

151-
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, includedMetrics)
151+
// Find out if we need network metrics reported for this container.
152+
// Containers that don't have their own network -- this includes
153+
// containers running in Kubernetes pods that use the network of the
154+
// infrastructure container -- does not need their stats to be
155+
// reported. This stops metrics being reported multiple times for each
156+
// container in a pod.
157+
metrics := maybeRemoveNet(includedMetrics, cInfo.Labels)
158+
159+
libcontainerHandler := containerlibcontainer.NewHandler(cgroupManager, rootFs, cInfo.Pid, metrics)
152160

153161
// TODO: extract object mother method
154162
handler := &crioContainerHandler{
@@ -161,7 +169,7 @@ func newCrioContainerHandler(
161169
rootfsStorageDir: rootfsStorageDir,
162170
envs: make(map[string]string),
163171
labels: cInfo.Labels,
164-
includedMetrics: includedMetrics,
172+
includedMetrics: metrics,
165173
reference: containerReference,
166174
libcontainerHandler: libcontainerHandler,
167175
cgroupManager: cgroupManager,
@@ -210,16 +218,26 @@ func (h *crioContainerHandler) ContainerReference() (info.ContainerReference, er
210218
return h.reference, nil
211219
}
212220

213-
func (h *crioContainerHandler) needNet() bool {
214-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
215-
return h.labels["io.kubernetes.container.name"] == "POD"
221+
func maybeRemoveNet(metrics container.MetricSet, labels map[string]string) container.MetricSet {
222+
if labels["io.kubernetes.container.name"] == "POD" {
223+
return metrics
216224
}
217-
return false
225+
226+
// This is not a pod/sandbox container; remove network metrics.
227+
228+
// Check if there is anything we can remove.
229+
if !metrics.HasAny(container.AllNetworkMetrics) {
230+
return metrics
231+
}
232+
233+
// A copy of all metrics except for network ones.
234+
return metrics.Difference(container.AllNetworkMetrics)
218235
}
219236

220237
func (h *crioContainerHandler) GetSpec() (info.ContainerSpec, error) {
221238
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
222-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
239+
hasNet := h.includedMetrics.Has(container.NetworkUsageMetrics)
240+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNet, hasFilesystem)
223241

224242
spec.Labels = h.labels
225243
spec.Envs = h.envs

container/docker/handler.go

+19-18
Original file line numberDiff line numberDiff line change
@@ -72,9 +72,6 @@ type dockerContainerHandler struct {
7272
// Image name used for this container.
7373
image string
7474

75-
// The network mode of the container
76-
networkMode dockercontainer.NetworkMode
77-
7875
// Filesystem handler.
7976
fsHandler common.FsHandler
8077

@@ -188,6 +185,8 @@ func newDockerContainerHandler(
188185
return nil, fmt.Errorf("failed to inspect container %q: %v", id, err)
189186
}
190187

188+
metrics := maybeRemoveNet(includedMetrics, ctnr.HostConfig.NetworkMode)
189+
191190
// TODO: extract object mother method
192191
handler := &dockerContainerHandler{
193192
machineInfoFactory: machineInfoFactory,
@@ -198,7 +197,7 @@ func newDockerContainerHandler(
198197
rootfsStorageDir: rootfsStorageDir,
199198
envs: make(map[string]string),
200199
labels: ctnr.Config.Labels,
201-
includedMetrics: includedMetrics,
200+
includedMetrics: metrics,
202201
zfsParent: zfsParent,
203202
}
204203
// Timestamp returned by Docker is in time.RFC3339Nano format.
@@ -207,7 +206,7 @@ func newDockerContainerHandler(
207206
// This should not happen, report the error just in case
208207
return nil, fmt.Errorf("failed to parse the create timestamp %q for container %q: %v", ctnr.Created, id, err)
209208
}
210-
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, includedMetrics)
209+
handler.libcontainerHandler = containerlibcontainer.NewHandler(cgroupManager, rootFs, ctnr.State.Pid, metrics)
211210

212211
// Add the name and bare ID as aliases of the container.
213212
handler.reference = info.ContainerReference{
@@ -217,7 +216,6 @@ func newDockerContainerHandler(
217216
Namespace: DockerNamespace,
218217
}
219218
handler.image = ctnr.Config.Image
220-
handler.networkMode = ctnr.HostConfig.NetworkMode
221219
// Only adds restartcount label if it's greater than 0
222220
if ctnr.RestartCount > 0 {
223221
handler.labels["restartcount"] = strconv.Itoa(ctnr.RestartCount)
@@ -344,16 +342,26 @@ func (h *dockerContainerHandler) ContainerReference() (info.ContainerReference,
344342
return h.reference, nil
345343
}
346344

347-
func (h *dockerContainerHandler) needNet() bool {
348-
if h.includedMetrics.Has(container.NetworkUsageMetrics) {
349-
return !h.networkMode.IsContainer()
345+
func maybeRemoveNet(metrics container.MetricSet, networkMode dockercontainer.NetworkMode) container.MetricSet {
346+
if !networkMode.IsContainer() {
347+
return metrics
350348
}
351-
return false
349+
350+
// This container shares netns with another one; remove network metrics.
351+
352+
// Check if there is anything we can remove.
353+
if !metrics.HasAny(container.AllNetworkMetrics) {
354+
return metrics
355+
}
356+
357+
// A copy with all metrics set except for network ones.
358+
return metrics.Difference(container.AllNetworkMetrics)
352359
}
353360

354361
func (h *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {
355362
hasFilesystem := h.includedMetrics.Has(container.DiskUsageMetrics)
356-
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, h.needNet(), hasFilesystem)
363+
hasNetwork := h.includedMetrics.Has(container.NetworkUsageMetrics)
364+
spec, err := common.GetSpec(h.cgroupPaths, h.machineInfoFactory, hasNetwork, hasFilesystem)
357365

358366
spec.Labels = h.labels
359367
spec.Envs = h.envs
@@ -462,13 +470,6 @@ func (h *dockerContainerHandler) GetStats() (*info.ContainerStats, error) {
462470
if err != nil {
463471
return stats, err
464472
}
465-
// Clean up stats for containers that don't have their own network - this
466-
// includes containers running in Kubernetes pods that use the network of the
467-
// infrastructure container. This stops metrics being reported multiple times
468-
// for each container in a pod.
469-
if !h.needNet() {
470-
stats.Network = info.NetworkStats{}
471-
}
472473

473474
// Get filesystem stats.
474475
err = h.getFsStats(stats)

container/factory.go

+17
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,14 @@ var AllMetrics = MetricSet{
9393
OOMMetrics: struct{}{},
9494
}
9595

96+
// AllNetworkMetrics represents all network metrics that cAdvisor supports.
97+
var AllNetworkMetrics = MetricSet{
98+
NetworkUsageMetrics: struct{}{},
99+
NetworkTcpUsageMetrics: struct{}{},
100+
NetworkAdvancedTcpUsageMetrics: struct{}{},
101+
NetworkUdpUsageMetrics: struct{}{},
102+
}
103+
96104
func (mk MetricKind) String() string {
97105
return string(mk)
98106
}
@@ -104,6 +112,15 @@ func (ms MetricSet) Has(mk MetricKind) bool {
104112
return exists
105113
}
106114

115+
func (ms MetricSet) HasAny(ms1 MetricSet) bool {
116+
for m := range ms1 {
117+
if _, ok := ms[m]; ok {
118+
return true
119+
}
120+
}
121+
return false
122+
}
123+
107124
func (ms MetricSet) add(mk MetricKind) {
108125
ms[mk] = struct{}{}
109126
}

0 commit comments

Comments
 (0)