Skip to content

Commit

Permalink
koordlet: cold memory fix usageWithHotPage and add usageWithPageCache (
Browse files Browse the repository at this point in the history
…#1699)

Signed-off-by: BUPT-wxq <[email protected]>
  • Loading branch information
BUPT-wxq authored Oct 24, 2023
1 parent fdc3b63 commit cae8705
Show file tree
Hide file tree
Showing 20 changed files with 427 additions and 68 deletions.
28 changes: 16 additions & 12 deletions pkg/koordlet/metriccache/metric_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,32 @@ var (
defaultMetricFactory = NewMetricFactory()

// define all kinds of MetricResource
NodeCPUUsageMetric = defaultMetricFactory.New(NodeMetricCPUUsage)
NodeMemoryUsageMetric = defaultMetricFactory.New(NodeMetricMemoryUsage)
NodeGPUCoreUsageMetric = defaultMetricFactory.New(NodeMetricGPUCoreUsage).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
NodeGPUMemUsageMetric = defaultMetricFactory.New(NodeMetricGPUMemUsage).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
NodeGPUMemTotalMetric = defaultMetricFactory.New(NodeMetricGPUMemTotal).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
NodeCPUUsageMetric = defaultMetricFactory.New(NodeMetricCPUUsage)
NodeMemoryUsageMetric = defaultMetricFactory.New(NodeMetricMemoryUsage)
NodeMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(NodeMemoryWithPageCacheUsage)
NodeGPUCoreUsageMetric = defaultMetricFactory.New(NodeMetricGPUCoreUsage).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
NodeGPUMemUsageMetric = defaultMetricFactory.New(NodeMetricGPUMemUsage).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
NodeGPUMemTotalMetric = defaultMetricFactory.New(NodeMetricGPUMemTotal).withPropertySchema(MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)

// define system resource usage as an independent metric; although it can be calculated by node-sum(pod), the time series are
// unaligned across different types of metrics, which makes them hard to aggregate.
SystemCPUUsageMetric = defaultMetricFactory.New(SysMetricCPUUsage)
SystemMemoryUsageMetric = defaultMetricFactory.New(SysMetricMemoryUsage)

PodCPUUsageMetric = defaultMetricFactory.New(PodMetricCPUUsage).withPropertySchema(MetricPropertyPodUID)
PodMemUsageMetric = defaultMetricFactory.New(PodMetricMemoryUsage).withPropertySchema(MetricPropertyPodUID)
PodCPUUsageMetric = defaultMetricFactory.New(PodMetricCPUUsage).withPropertySchema(MetricPropertyPodUID)
PodMemUsageMetric = defaultMetricFactory.New(PodMetricMemoryUsage).withPropertySchema(MetricPropertyPodUID)
PodMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(PodMemoryWithPageCacheUsage).withPropertySchema(MetricPropertyPodUID)

PodCPUThrottledMetric = defaultMetricFactory.New(PodMetricCPUThrottled).withPropertySchema(MetricPropertyPodUID)
PodGPUCoreUsageMetric = defaultMetricFactory.New(PodMetricGPUCoreUsage).withPropertySchema(MetricPropertyPodUID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
PodGPUMemUsageMetric = defaultMetricFactory.New(PodMetricGPUMemUsage).withPropertySchema(MetricPropertyPodUID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)

ContainerCPUUsageMetric = defaultMetricFactory.New(ContainerMetricCPUUsage).withPropertySchema(MetricPropertyContainerID)
ContainerMemUsageMetric = defaultMetricFactory.New(ContainerMetricMemoryUsage).withPropertySchema(MetricPropertyContainerID)
ContainerGPUCoreUsageMetric = defaultMetricFactory.New(ContainerMetricGPUCoreUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
ContainerGPUMemUsageMetric = defaultMetricFactory.New(ContainerMetricGPUMemUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
ContainerCPUThrottledMetric = defaultMetricFactory.New(ContainerMetricCPUThrottled).withPropertySchema(MetricPropertyContainerID)
ContainerCPUUsageMetric = defaultMetricFactory.New(ContainerMetricCPUUsage).withPropertySchema(MetricPropertyContainerID)
ContainerMemUsageMetric = defaultMetricFactory.New(ContainerMetricMemoryUsage).withPropertySchema(MetricPropertyContainerID)
ContainerMemoryUsageWithPageCacheMetric = defaultMetricFactory.New(ContainerMemoryWithPageCacheUsage).withPropertySchema(MetricPropertyContainerID)
ContainerGPUCoreUsageMetric = defaultMetricFactory.New(ContainerMetricGPUCoreUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
ContainerGPUMemUsageMetric = defaultMetricFactory.New(ContainerMetricGPUMemUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID)
ContainerCPUThrottledMetric = defaultMetricFactory.New(ContainerMetricCPUThrottled).withPropertySchema(MetricPropertyContainerID)
// cold memory metrics
NodeMemoryWithHotPageUsageMetric = defaultMetricFactory.New(NodeMemoryWithHotPageUsage)
PodMemoryWithHotPageUsageMetric = defaultMetricFactory.New(PodMemoryWithHotPageUsage).withPropertySchema(MetricPropertyPodUID)
Expand Down
29 changes: 16 additions & 13 deletions pkg/koordlet/metriccache/metric_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ const (
type MetricKind string

const (
NodeMetricCPUUsage MetricKind = "node_cpu_usage"
NodeMetricMemoryUsage MetricKind = "node_memory_usage"
NodeMetricGPUCoreUsage MetricKind = "node_gpu_core_usage"
NodeMetricGPUMemUsage MetricKind = "node_gpu_memory_usage"
NodeMetricGPUMemTotal MetricKind = "node_gpu_memory_total"
NodeMetricCPUUsage MetricKind = "node_cpu_usage"
NodeMetricMemoryUsage MetricKind = "node_memory_usage"
NodeMemoryWithPageCacheUsage MetricKind = "node_memory_usage_with_page_cache"
NodeMetricGPUCoreUsage MetricKind = "node_gpu_core_usage"
NodeMetricGPUMemUsage MetricKind = "node_gpu_memory_usage"
NodeMetricGPUMemTotal MetricKind = "node_gpu_memory_total"

SysMetricCPUUsage MetricKind = "sys_cpu_usage"
SysMetricMemoryUsage MetricKind = "sys_memory_usage"
Expand All @@ -52,16 +53,18 @@ const (
PriorityMetricCPURealLimit MetricKind = "priority_cpu_real_limit"
PriorityMetricCPURequest MetricKind = "priority_cpu_request"

PodMetricCPUUsage MetricKind = "pod_cpu_usage"
PodMetricMemoryUsage MetricKind = "pod_memory_usage"
PodMetricGPUCoreUsage MetricKind = "pod_gpu_core_usage"
PodMetricGPUMemUsage MetricKind = "pod_gpu_memory_usage"
PodMetricCPUUsage MetricKind = "pod_cpu_usage"
PodMetricMemoryUsage MetricKind = "pod_memory_usage"
PodMemoryWithPageCacheUsage MetricKind = "pod_memory_usage_with_page_cache"
PodMetricGPUCoreUsage MetricKind = "pod_gpu_core_usage"
PodMetricGPUMemUsage MetricKind = "pod_gpu_memory_usage"
// PodMetricGPUMemTotal MetricKind = "pod_gpu_memory_total"

ContainerMetricCPUUsage MetricKind = "container_cpu_usage"
ContainerMetricMemoryUsage MetricKind = "container_memory_usage"
ContainerMetricGPUCoreUsage MetricKind = "container_gpu_core_usage"
ContainerMetricGPUMemUsage MetricKind = "container_gpu_memory_usage"
ContainerMetricCPUUsage MetricKind = "container_cpu_usage"
ContainerMetricMemoryUsage MetricKind = "container_memory_usage"
ContainerMemoryWithPageCacheUsage MetricKind = "container_memory_usage_with_page_cache"
ContainerMetricGPUCoreUsage MetricKind = "container_gpu_core_usage"
ContainerMetricGPUMemUsage MetricKind = "container_gpu_memory_usage"
// ContainerMetricGPUMemTotal MetricKind = "container_gpu_memory_total"

PodMetricCPUThrottled MetricKind = "pod_cpu_throttled"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ type nonColdPageCollector struct {
func New(opt *framework.Options) framework.Collector {
// check whether the kidled cold page info collector is supported
if system.IsKidledSupport() {
kidledConfig := system.NewDefaultKidledConfig()
return &kidledcoldPageCollector{
collectInterval: opt.Config.ColdPageCollectorInterval,
cgroupReader: opt.CgroupReader,
Expand All @@ -42,6 +43,7 @@ func New(opt *framework.Options) framework.Collector {
appendableDB: opt.MetricCache,
metricDB: opt.MetricCache,
started: atomic.NewBool(false),
coldBoundary: kidledConfig.KidledColdBoundary,
}
}
// TODO(BUPT-wxq): check kstaled cold page collector
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ func Test_NewColdPageCollector(t *testing.T) {
appendableDB: opt.MetricCache,
metricDB: opt.MetricCache,
started: atomic.NewBool(false),
coldBoundary: 3,
},
},
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ type kidledcoldPageCollector struct {
podFilter framework.PodFilter
appendableDB metriccache.Appendable
metricDB metriccache.MetricCache
coldBoundary int
}

func (k *kidledcoldPageCollector) Run(stopCh <-chan struct{}) {
Expand All @@ -68,6 +69,7 @@ func (k *kidledcoldPageCollector) Enabled() bool {
return false
}
system.SetIsStartColdMemory(true)
system.SetKidledColdBoundary(k.coldBoundary)
return true
}
return false
Expand Down Expand Up @@ -121,7 +123,7 @@ func (k *kidledcoldPageCollector) collectNodeColdPageInfo() ([]metriccache.Metri
}
coldPageMetrics = append(coldPageMetrics, nodeColdPageMetrics)

memUsageWithHotPageBytes, err := koordletutil.GetNodeMemUsageWithHotPage(nodeColdPageBytes)
memUsageWithHotPageBytes, err := koordletutil.GetNodeMemUsageWithHotPageCache(nodeColdPageBytes)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -167,7 +169,7 @@ func (k *kidledcoldPageCollector) collectPodsColdPageInfo() ([]metriccache.Metri
}
coldMetrics = append(coldMetrics, podColdPageMetrics)

podMemUsageWithHotPageBytes, err := koordletutil.GetPodMemUsageWithHotPage(k.cgroupReader, podCgroupDir, podColdPageBytes)
podMemUsageWithHotPageBytes, err := koordletutil.GetPodMemUsageWithHotPageCache(k.cgroupReader, podCgroupDir, podColdPageBytes)
if err != nil {
klog.Warningf("failed to collect pod usage for Memory err: %s pod: %s/%s", err, pod.Namespace, pod.Name)
continue
Expand Down Expand Up @@ -220,7 +222,7 @@ func (k *kidledcoldPageCollector) collectContainersColdPageInfo(meta *statesinfo
}
coldMetrics = append(coldMetrics, containerColdPageMetrics)

containerMemUsageWithHotPageBytes, err := koordletutil.GetContainerMemUsageWithHotPage(k.cgroupReader, containerCgroupDir, containerColdPageBytes)
containerMemUsageWithHotPageBytes, err := koordletutil.GetContainerMemUsageWithHotPageCache(k.cgroupReader, containerCgroupDir, containerColdPageBytes)
if err != nil {
return nil, err
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,7 @@ func Test_collectNodeColdPageInfo(t *testing.T) {
helper.SetResourcesSupported(true, system.MemoryIdlePageStats)
helper.WriteCgroupFileContents("", system.MemoryIdlePageStats, idleInfoContentStr)
helper.WriteProcSubFileContents(system.ProcMemInfoName, meminfo)
kidledConfig := system.NewDefaultKidledConfig()
c := &kidledcoldPageCollector{
collectInterval: 5 * time.Second,
cgroupReader: resourceexecutor.NewCgroupReader(),
Expand All @@ -493,6 +494,7 @@ func Test_collectNodeColdPageInfo(t *testing.T) {
appendableDB: metricCache,
metricDB: metricCache,
started: atomic.NewBool(false),
coldBoundary: kidledConfig.KidledColdBoundary,
}
testNow := time.Now()
metrics, err := c.collectNodeColdPageInfo()
Expand All @@ -509,8 +511,8 @@ func Test_collectNodeColdPageInfo(t *testing.T) {
}
assert.NoError(t, err)
got1, got2 := testGetNodeMetrics(t, c.metricDB, testNow, 5*time.Second)
assert.Equal(t, float64(18446744073419457000), got1)
assert.Equal(t, float64(1363836928), got2)
assert.Equal(t, float64(7.33569024e+08), got1)
assert.Equal(t, float64(3.401728e+08), got2)
// test collect failed
helper.WriteCgroupFileContents("", system.MemoryIdlePageStats, ``)
helper.WriteProcSubFileContents(system.ProcMemInfoName, ``)
Expand Down
Loading

0 comments on commit cae8705

Please sign in to comment.