From 773eadd610c39087e927a0e305da3a575e72fdae Mon Sep 17 00:00:00 2001 From: BUPT-wxq <58139533+BUPT-wxq@users.noreply.github.com> Date: Thu, 21 Sep 2023 14:44:51 +0800 Subject: [PATCH] koordlet: add cold memory collection and reporting (#1574) Signed-off-by: BUPT-wxq <1725712048@qq.com> --- pkg/features/koordlet_features.go | 7 + pkg/koordlet/metriccache/metric_resources.go | 7 + pkg/koordlet/metriccache/metric_types.go | 8 + .../coldmemoryresource/cold_page_collector.go | 62 ++ .../cold_page_collector_test.go | 102 +++ .../coldmemoryresource/cold_page_kidled.go | 243 ++++++ .../cold_page_kidled_test.go | 768 ++++++++++++++++++ .../metricsadvisor/framework/config.go | 3 + .../metricsadvisor/framework/config_test.go | 5 + .../metricsadvisor/plugins_profile.go | 18 +- pkg/koordlet/resourceexecutor/reader.go | 22 + pkg/koordlet/resourceexecutor/reader_test.go | 105 +++ .../statesinformer/impl/states_nodemetric.go | 41 +- .../impl/states_nodemetric_test.go | 128 ++- pkg/koordlet/util/cold_page.go | 44 + pkg/koordlet/util/cold_page_test.go | 240 ++++++ pkg/koordlet/util/system/cgroup_resource.go | 3 + pkg/koordlet/util/system/kidled_util.go | 206 +++++ pkg/koordlet/util/system/kidled_util_test.go | 236 ++++++ pkg/koordlet/util/system/system_resource.go | 26 +- .../util/system/system_resource_test.go | 52 ++ 21 files changed, 2294 insertions(+), 32 deletions(-) create mode 100644 pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector.go create mode 100644 pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector_test.go create mode 100644 pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled.go create mode 100644 pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled_test.go create mode 100644 pkg/koordlet/util/cold_page.go create mode 100644 pkg/koordlet/util/cold_page_test.go create mode 100644 pkg/koordlet/util/system/kidled_util.go create mode 100644 
pkg/koordlet/util/system/kidled_util_test.go diff --git a/pkg/features/koordlet_features.go b/pkg/features/koordlet_features.go index c575c8d59..cb14d7cd6 100644 --- a/pkg/features/koordlet_features.go +++ b/pkg/features/koordlet_features.go @@ -127,6 +127,12 @@ const ( // // BlkIOReconcile enables block I/O QoS feature of koordlet. BlkIOReconcile featuregate.Feature = "BlkIOReconcile" + + // owner: @BUPT-wxq + // alpha v1.4 + // + // ColdPageCollector enables coldPageCollector feature of koordlet. + ColdPageCollector featuregate.Feature = "ColdPageCollector" ) func init() { @@ -154,6 +160,7 @@ var ( Libpfm4: {Default: false, PreRelease: featuregate.Alpha}, PSICollector: {Default: false, PreRelease: featuregate.Alpha}, BlkIOReconcile: {Default: false, PreRelease: featuregate.Alpha}, + ColdPageCollector: {Default: false, PreRelease: featuregate.Alpha}, } ) diff --git a/pkg/koordlet/metriccache/metric_resources.go b/pkg/koordlet/metriccache/metric_resources.go index f7202bc26..72e4d0c25 100644 --- a/pkg/koordlet/metriccache/metric_resources.go +++ b/pkg/koordlet/metriccache/metric_resources.go @@ -42,6 +42,13 @@ var ( ContainerGPUCoreUsageMetric = defaultMetricFactory.New(ContainerMetricGPUCoreUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID) ContainerGPUMemUsageMetric = defaultMetricFactory.New(ContainerMetricGPUMemUsage).withPropertySchema(MetricPropertyContainerID, MetricPropertyGPUMinor, MetricPropertyGPUDeviceUUID) ContainerCPUThrottledMetric = defaultMetricFactory.New(ContainerMetricCPUThrottled).withPropertySchema(MetricPropertyContainerID) + // cold memory metrics + NodeMemoryWithHotPageUsageMetric = defaultMetricFactory.New(NodeMemoryWithHotPageUsage) + PodMemoryWithHotPageUsageMetric = defaultMetricFactory.New(PodMemoryWithHotPageUsage).withPropertySchema(MetricPropertyPodUID) + ContainerMemoryWithHotPageUsageMetric = 
defaultMetricFactory.New(ContainerMemoryWithHotPageUsage).withPropertySchema(MetricPropertyContainerID) + NodeMemoryColdPageSizeMetric = defaultMetricFactory.New(NodeMemoryColdPageSize) + PodMemoryColdPageSizeMetric = defaultMetricFactory.New(PodMemoryColdPageSize).withPropertySchema(MetricPropertyPodUID) + ContainerMemoryColdPageSizeMetric = defaultMetricFactory.New(ContainerMemoryColdPageSize).withPropertySchema(MetricPropertyContainerID) // CPI ContainerCPI = defaultMetricFactory.New(ContainerMetricCPI).withPropertySchema(MetricPropertyPodUID, MetricPropertyContainerID, MetricPropertyCPIResource) diff --git a/pkg/koordlet/metriccache/metric_types.go b/pkg/koordlet/metriccache/metric_types.go index 1e1945984..0428be8b9 100644 --- a/pkg/koordlet/metriccache/metric_types.go +++ b/pkg/koordlet/metriccache/metric_types.go @@ -75,6 +75,14 @@ const ( ContainerMetricPSICPUFullSupported MetricKind = "container_psi_cpu_full_supported" PodMetricPSI MetricKind = "pod_psi" PodMetricPSICPUFullSupported MetricKind = "pod_psi_cpu_full_supported" + + //cold memory metrics + NodeMemoryWithHotPageUsage MetricKind = "node_memory_with_hot_page_usage" + PodMemoryWithHotPageUsage MetricKind = "pod_memory_with_hot_page_usage" + ContainerMemoryWithHotPageUsage MetricKind = "container_memory_with_hot_page_usage" + NodeMemoryColdPageSize MetricKind = "node_memory_cold_page_size" + PodMemoryColdPageSize MetricKind = "pod_memory_cold_page_size" + ContainerMemoryColdPageSize MetricKind = "container_memory_cold_page_size" ) // MetricProperty is the property of metric diff --git a/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector.go b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector.go new file mode 100644 index 000000000..3cec12e4b --- /dev/null +++ b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector.go @@ -0,0 +1,62 @@ +/* +Copyright 2022 The Koordinator Authors. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package coldmemoryresource
+
+import (
+	"go.uber.org/atomic"
+
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
+)
+
+// CollectorName is the name of the cold page collector.
+const CollectorName = "ColdPageCollector"
+
+// nonColdPageCollector is the no-op collector returned when kidled is unsupported.
+type nonColdPageCollector struct{}
+
+// New picks the collector implementation: the kidled-based one when
+// system.IsKidledSupport reports true, otherwise a collector that does nothing.
+func New(opt *framework.Options) framework.Collector {
+	// TODO(BUPT-wxq): check kstaled cold page collector
+	if !system.IsKidledSupport() {
+		return &nonColdPageCollector{}
+	}
+	collector := &kidledcoldPageCollector{
+		collectInterval: opt.Config.ColdPageCollectorInterval,
+		cgroupReader:    opt.CgroupReader,
+		statesInformer:  opt.StatesInformer,
+		// TODO(BUPT-wxq): implement podFilter for the VM-based pods and containers
+		podFilter:    framework.DefaultPodFilter,
+		appendableDB: opt.MetricCache,
+		metricDB:     opt.MetricCache,
+		started:      atomic.NewBool(false),
+	}
+	return collector
+}
+
+// Run is a no-op.
+func (n *nonColdPageCollector) Run(stopCh <-chan struct{}) {}
+
+// Started always reports false.
+func (n *nonColdPageCollector) Started() bool { return false }
+
+// Enabled always reports false.
+func (n *nonColdPageCollector) Enabled() bool { return false }
+
+// Setup is a no-op.
+func (n *nonColdPageCollector) Setup(c1 *framework.Context) {}
diff --git a/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector_test.go b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector_test.go
new file mode 100644
index 
000000000..2f97a741e
--- /dev/null
+++ b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_collector_test.go
@@ -0,0 +1,106 @@
+/*
+Copyright 2022 The Koordinator Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package coldmemoryresource
+
+import (
+	"testing"
+	"time"
+
+	"github.com/golang/mock/gomock"
+	"github.com/stretchr/testify/assert"
+	"go.uber.org/atomic"
+
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor"
+	mock_statesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/mockstatesinformer"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
+)
+
+// Test_NewColdPageCollector verifies that New returns the no-op collector when
+// kidled is not supported and the kidled-based collector when it is.
+func Test_NewColdPageCollector(t *testing.T) {
+	helper := system.NewFileTestUtil(t)
+	defer helper.Cleanup()
+	metricCache, err := metriccache.NewMetricCache(&metriccache.Config{
+		TSDBPath:              t.TempDir(),
+		TSDBEnablePromMetrics: false,
+	})
+	// Stop here on failure: metricCache must not be closed or used when creation failed.
+	if !assert.NoError(t, err) {
+		return
+	}
+	defer func() {
+		assert.NoError(t, metricCache.Close())
+	}()
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+	statesInformer := mock_statesinformer.NewMockStatesInformer(ctrl)
+	opt := &framework.Options{
+		Config: &framework.Config{
+			ColdPageCollectorInterval: 1 * time.Second,
+		},
+		CgroupReader:   resourceexecutor.NewCgroupReader(),
+		StatesInformer: statesInformer,
+		MetricCache:    metricCache,
+	}
+	type fields struct {
+		SetSysUtil func(helper *system.FileTestUtil)
+	}
+	tests := []struct {
+		name   string
+		fields fields
+		want   framework.Collector
+	}{
+		{
+			name: "os doesn't support cold page collector and return nonCollector",
+			want: &nonColdPageCollector{},
+		},
+		{
+			name: "os support kidled cold page collector but cold page collector feature-gate false",
+			fields: fields{
+				SetSysUtil: func(helper *system.FileTestUtil) {
+					helper.SetResourcesSupported(true, system.KidledScanPeriodInSeconds)
+					helper.SetResourcesSupported(true, system.KidledUseHierarchy)
+				},
+			},
+			want: &kidledcoldPageCollector{
+				collectInterval: opt.Config.ColdPageCollectorInterval,
+				cgroupReader:    opt.CgroupReader,
+				statesInformer:  opt.StatesInformer,
+				podFilter:       framework.DefaultPodFilter,
+				appendableDB:    opt.MetricCache,
+				metricDB:        opt.MetricCache,
+				started:         atomic.NewBool(false),
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			helper := system.NewFileTestUtil(t)
+			defer helper.Cleanup()
+			if tt.fields.SetSysUtil != nil {
+				tt.fields.SetSysUtil(helper)
+			}
+			got := New(opt)
+			assert.Equal(t, tt.want, got)
+			assert.NotPanics(t, func() {
+				got.Setup(&framework.Context{})
+			})
+		})
+	}
+}
diff --git a/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled.go b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled.go
new file mode 100644
index 000000000..6d1aa02cf
--- /dev/null
+++ b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled.go
@@ -0,0 +1,271 @@
+/*
+Copyright 2022 The Koordinator Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package coldmemoryresource
+
+import (
+	"fmt"
+	"time"
+
+	"go.uber.org/atomic"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/klog/v2"
+
+	"github.com/koordinator-sh/koordinator/pkg/features"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer"
+	koordletutil "github.com/koordinator-sh/koordinator/pkg/koordlet/util"
+	"github.com/koordinator-sh/koordinator/pkg/koordlet/util/system"
+	"github.com/koordinator-sh/koordinator/pkg/util"
+)
+
+// kidledcoldPageCollector collects node, pod and container cold page metrics
+// through the cgroup reader and appends them to the metric cache.
+type kidledcoldPageCollector struct {
+	collectInterval time.Duration
+	started         *atomic.Bool
+	cgroupReader    resourceexecutor.CgroupReader
+	statesInformer  statesinformer.StatesInformer
+	podFilter       framework.PodFilter
+	appendableDB    metriccache.Appendable
+	metricDB        metriccache.MetricCache
+}
+
+// Run starts a goroutine that calls collectColdPageInfo every collectInterval
+// until stopCh is closed.
+func (k *kidledcoldPageCollector) Run(stopCh <-chan struct{}) {
+	go wait.Until(k.collectColdPageInfo, k.collectInterval, stopCh)
+}
+
+// Started reports whether at least one collection round has been committed.
+func (k *kidledcoldPageCollector) Started() bool {
+	return k.started.Load()
+}
+
+// Enabled reports whether the ColdPageCollector feature gate is on and kidled
+// could be configured. As a side effect it writes the default kidled scan
+// period and use_hierarchy settings and marks cold memory as started.
+func (k *kidledcoldPageCollector) Enabled() bool {
+	if !features.DefaultKoordletFeatureGate.Enabled(features.ColdPageCollector) {
+		return false
+	}
+	// start kidled
+	kidledConfig := system.NewDefaultKidledConfig()
+	if err := system.SetKidledScanPeriodInSeconds(kidledConfig.ScanPeriodInseconds); err != nil {
+		klog.V(4).Infof("cold page collector start kidled err: %v", err)
+		return false
+	}
+	if err := system.SetKidledUseHierarchy(kidledConfig.UseHierarchy); err != nil {
+		klog.V(4).Infof("cold page collector start kidled err: %v", err)
+		return false
+	}
+	system.SetIsStartColdMemory(true)
+	return true
+}
+
+// Setup is a no-op; the collector takes nothing from the framework context.
+func (k *kidledcoldPageCollector) Setup(c1 *framework.Context) {}
+
+// collectColdPageInfo runs one collection round: it gathers node-level and
+// pod/container-level samples, appends and commits them to the metric cache,
+// and marks the collector as started once the commit succeeds. A failure in
+// one of the two gathering steps is logged and the other step's samples are
+// still stored.
+func (k *kidledcoldPageCollector) collectColdPageInfo() {
+	if k.statesInformer == nil {
+		return
+	}
+	coldPageMetrics := make([]metriccache.MetricSample, 0)
+
+	nodeColdPageInfoMetric, err := k.collectNodeColdPageInfo()
+	if err != nil {
+		klog.Warningf("generate node cold page info metrics failed, err %v", err)
+	}
+	coldPageMetrics = append(coldPageMetrics, nodeColdPageInfoMetric...)
+
+	podsColdPageInfoMetric, err := k.collectPodsColdPageInfo()
+	if err != nil {
+		klog.Warningf("generate pods or container cold page info metrics failed, err %v", err)
+	}
+	coldPageMetrics = append(coldPageMetrics, podsColdPageInfoMetric...)
+
+	appender := k.appendableDB.Appender()
+	if err := appender.Append(coldPageMetrics); err != nil {
+		klog.ErrorS(err, "Append cold page metrics error")
+		return
+	}
+
+	if err := appender.Commit(); err != nil {
+		klog.Warningf("Commit cold page metrics failed, reason: %v", err)
+		return
+	}
+
+	k.started.Store(true)
+}
+
+// collectNodeColdPageInfo reads the cold page size with an empty cgroup parent
+// dir and returns two samples: the node cold page size and the node memory
+// usage with hot pages derived from it. Any error aborts the node collection.
+func (k *kidledcoldPageCollector) collectNodeColdPageInfo() ([]metriccache.MetricSample, error) {
+	coldPageMetrics := make([]metriccache.MetricSample, 0)
+	collectTime := time.Now()
+	nodeColdPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage("")
+	if err != nil {
+		return nil, err
+	}
+	nodeColdPageBytesValue := float64(nodeColdPageBytes)
+	nodeColdPageMetrics, err := metriccache.NodeMemoryColdPageSizeMetric.GenerateSample(nil, collectTime, nodeColdPageBytesValue)
+	if err != nil {
+		return nil, err
+	}
+	coldPageMetrics = append(coldPageMetrics, nodeColdPageMetrics)
+
+	memUsageWithHotPageBytes, err := koordletutil.GetNodeMemUsageWithHotPage(nodeColdPageBytes)
+	if err != nil {
+		return nil, err
+	}
+	memUsageWithHotPageValue := float64(memUsageWithHotPageBytes)
+	memUsageWithHotPageMetrics, err := metriccache.NodeMemoryWithHotPageUsageMetric.GenerateSample(nil, collectTime, memUsageWithHotPageValue)
+	if err != nil {
+		return nil, err
+	}
+	coldPageMetrics = append(coldPageMetrics, memUsageWithHotPageMetrics)
+	klog.V(4).Infof("collectNodeColdPageInfo finished, count %v, memUsageWithHotPage[%v], coldPageSize[%v]",
+		len(coldPageMetrics), memUsageWithHotPageValue, nodeColdPageBytes)
+	return coldPageMetrics, nil
+}
+
+// collectPodsColdPageInfo walks all pods known to the states informer and
+// returns, for each pod that passes the filter, its cold page size and memory
+// usage with hot pages, followed by the samples of its containers. Pods whose
+// cgroup files cannot be read are logged and skipped; a sample generation
+// error aborts the round.
+func (k *kidledcoldPageCollector) collectPodsColdPageInfo() ([]metriccache.MetricSample, error) {
+	podMetas := k.statesInformer.GetAllPods()
+	count := 0
+	coldMetrics := make([]metriccache.MetricSample, 0)
+	for _, meta := range podMetas {
+		pod := meta.Pod
+		uid := string(pod.UID) // types.UID
+		podKey := util.GetPodKey(pod)
+		if filtered, msg := k.FilterPod(meta); filtered {
+			klog.V(5).Infof("skip collect pod %s, reason: %s", podKey, msg)
+			continue
+		}
+		collectTime := time.Now()
+		podCgroupDir := meta.CgroupDir
+		podColdPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage(podCgroupDir)
+		if err != nil {
+			if pod.Status.Phase != corev1.PodRunning && pod.Status.Phase != corev1.PodPending {
+				klog.V(6).Infof("failed to collect non-running pod cold page usage for %s, cold page err: %s",
+					podKey, err)
+			} else {
+				klog.Warningf("can not get cold page info from memory.idle_page_stats file for pod %s/%s, err: %v", pod.Namespace, pod.Name, err)
+			}
+			continue
+		}
+		podColdPageBytesValue := float64(podColdPageBytes)
+		podColdPageMetrics, err := metriccache.PodMemoryColdPageSizeMetric.GenerateSample(metriccache.MetricPropertiesFunc.Pod(uid), collectTime, podColdPageBytesValue)
+		if err != nil {
+			return nil, err
+		}
+		coldMetrics = append(coldMetrics, podColdPageMetrics)
+
+		podMemUsageWithHotPageBytes, err := koordletutil.GetPodMemUsageWithHotPage(k.cgroupReader, podCgroupDir, podColdPageBytes)
+		if err != nil {
+			klog.Warningf("failed to collect pod usage for Memory err: %s pod: %s/%s", err, pod.Namespace, pod.Name)
+			continue
+		}
+
+		podMemUsageWithHotPageValue := float64(podMemUsageWithHotPageBytes)
+		podMemUsageWithHotPageMetrics, err := metriccache.PodMemoryWithHotPageUsageMetric.GenerateSample(metriccache.MetricPropertiesFunc.Pod(uid), collectTime, podMemUsageWithHotPageValue)
+		if err != nil {
+			return nil, err
+		}
+		coldMetrics = append(coldMetrics, podMemUsageWithHotPageMetrics)
+		count++
+		containerColdPageMetrics, err := k.collectContainersColdPageInfo(meta)
+		if err != nil {
+			return nil, err
+		}
+		coldMetrics = append(coldMetrics, containerColdPageMetrics...)
+	}
+	klog.V(4).Infof("collectPodsColdPageInfo finished, pod num %d, collected %d", len(podMetas), count)
+	return coldMetrics, nil
+}
+
+// collectContainersColdPageInfo returns the cold page size and memory usage
+// with hot pages samples for every container of the pod that has a container
+// ID. Containers whose cgroup cannot be resolved or read are logged and
+// skipped so that one unreadable container does not discard the samples of the
+// others; only a sample generation error is returned.
+func (k *kidledcoldPageCollector) collectContainersColdPageInfo(meta *statesinformer.PodMeta) ([]metriccache.MetricSample, error) {
+	pod := meta.Pod
+	count := 0
+	coldMetrics := make([]metriccache.MetricSample, 0)
+	for i := range pod.Status.ContainerStatuses {
+		containerStat := &pod.Status.ContainerStatuses[i]
+		containerKey := fmt.Sprintf("%s/%s/%s", pod.Namespace, pod.Name, containerStat.Name)
+		collectTime := time.Now()
+		if len(containerStat.ContainerID) == 0 {
+			klog.Warningf("container %s id is empty, maybe not ready, skip this round", containerKey)
+			continue
+		}
+		containerCgroupDir, err := koordletutil.GetContainerCgroupParentDir(meta.CgroupDir, containerStat)
+		if err != nil {
+			klog.Warningf("failed to collect container usage for %s, cannot get container cgroup, err: %s",
+				containerKey, err)
+			continue
+		}
+		containerColdPageBytes, err := k.cgroupReader.ReadMemoryColdPageUsage(containerCgroupDir)
+		if err != nil {
+			klog.Warningf("can not get cold page info from memory.idle_page_stats file for container %s, err: %v", containerKey, err)
+			continue
+		}
+		containerColdPageBytesValue := float64(containerColdPageBytes)
+		containerColdPageMetrics, err := metriccache.ContainerMemoryColdPageSizeMetric.GenerateSample(metriccache.MetricPropertiesFunc.Container(containerStat.ContainerID), collectTime, containerColdPageBytesValue)
+		if err != nil {
+			return nil, err
+		}
+		coldMetrics = append(coldMetrics, containerColdPageMetrics)
+
+		containerMemUsageWithHotPageBytes, err := koordletutil.GetContainerMemUsageWithHotPage(k.cgroupReader, containerCgroupDir, containerColdPageBytes)
+		if err != nil {
+			// Same best-effort handling as the pod-level read: skip this container only.
+			klog.Warningf("failed to collect container usage for Memory err: %s container: %s", err, containerKey)
+			continue
+		}
+		containerMemUsageWithHotPageValue := float64(containerMemUsageWithHotPageBytes)
+		containerMemUsageWithHotPageMetrics, err := metriccache.ContainerMemoryWithHotPageUsageMetric.GenerateSample(metriccache.MetricPropertiesFunc.Container(containerStat.ContainerID), collectTime, containerMemUsageWithHotPageValue)
+		if err != nil {
+			return nil, err
+		}
+		coldMetrics = append(coldMetrics, containerMemUsageWithHotPageMetrics)
+		count++
+		klog.V(6).Infof("collect container %s, id %s finished, metric %+v", containerKey, containerStat.ContainerID, coldMetrics)
+	}
+	klog.V(6).Infof("collect Container ColdPageInfo for pod %s/%s finished, container num %d, collected %d",
+		pod.Namespace, pod.Name, len(pod.Status.ContainerStatuses), count)
+	return coldMetrics, nil
+}
+
+// FilterPod reports whether the pod should be skipped, delegating to podFilter.
+func (k *kidledcoldPageCollector) FilterPod(meta *statesinformer.PodMeta) (bool, string) {
+	return k.podFilter.FilterPod(meta)
+}
diff --git a/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled_test.go b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled_test.go
new file mode 100644
index 000000000..396cb932e
--- /dev/null
+++ b/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource/cold_page_kidled_test.go
@@ -0,0 +1,768 @@
+/*
+Copyright 2022 The Koordinator Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package coldmemoryresource + +import ( + "testing" + "time" + + "github.com/golang/mock/gomock" + gocache "github.com/patrickmn/go-cache" + "github.com/stretchr/testify/assert" + "go.uber.org/atomic" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + "github.com/koordinator-sh/koordinator/pkg/features" + "github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache" + "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/framework" + "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" + "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" + mock_statesinformer "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer/mockstatesinformer" + "github.com/koordinator-sh/koordinator/pkg/koordlet/util/system" +) + +func Test_kideldEnable(t *testing.T) { + type fields struct { + SetSysUtil func(helper *system.FileTestUtil) + fg map[string]bool + } + tests := []struct { + name string + fields fields + wantsupport bool + wantenable bool + }{ + { + name: "os doesn't support kidled and koordlet feature-gate doesn't support kidled", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.SetResourcesSupported(false, system.KidledScanPeriodInSeconds) + helper.SetResourcesSupported(false, system.KidledUseHierarchy) + }, + fg: map[string]bool{ + string(features.ColdPageCollector): false, + }, + }, + wantsupport: false, + wantenable: false, + }, + { + name: "os doesn't support kidled and koordlet feature-gate supports kidled", + fields: fields{ + 
SetSysUtil: func(helper *system.FileTestUtil) { + helper.SetResourcesSupported(false, system.KidledScanPeriodInSeconds) + helper.SetResourcesSupported(false, system.KidledUseHierarchy) + }, + fg: map[string]bool{ + string(features.ColdPageCollector): true, + }, + }, + wantsupport: false, + wantenable: false, + }, + { + name: "os supports kidled and koordlet feature-gate doesn't support kidled", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.SetResourcesSupported(true, system.KidledScanPeriodInSeconds) + helper.SetResourcesSupported(true, system.KidledUseHierarchy) + helper.CreateCgroupFile("", system.KidledScanPeriodInSeconds) + helper.CreateCgroupFile("", system.KidledUseHierarchy) + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + }, + fg: map[string]bool{ + string(features.ColdPageCollector): false, + }, + }, + wantsupport: true, + wantenable: false, + }, + { + name: "os supports kidled and koordlet feature-gate supports kidled", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.SetResourcesSupported(true, system.KidledScanPeriodInSeconds) + helper.SetResourcesSupported(true, system.KidledUseHierarchy) + helper.CreateCgroupFile("", system.KidledScanPeriodInSeconds) + helper.CreateCgroupFile("", system.KidledUseHierarchy) + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + }, + fg: map[string]bool{ + string(features.ColdPageCollector): true, + }, + }, + wantsupport: true, + wantenable: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + features.DefaultMutableKoordletFeatureGate.SetFromMap(tt.fields.fg) + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + ctrl := gomock.NewController(t) + defer ctrl.Finish() 
+ + metricCache, err := metriccache.NewMetricCache(&metriccache.Config{ + TSDBPath: t.TempDir(), + TSDBEnablePromMetrics: false, + }) + assert.NoError(t, err) + defer func() { + metricCache.Close() + }() + statesInformer := mock_statesinformer.NewMockStatesInformer(ctrl) + collector := New(&framework.Options{ + Config: &framework.Config{ + ColdPageCollectorInterval: 1 * time.Second, + }, + StatesInformer: statesInformer, + MetricCache: metricCache, + CgroupReader: resourceexecutor.NewCgroupReader(), + }) + assert.Equal(t, tt.wantsupport, system.IsKidledSupport()) + assert.Equal(t, tt.wantenable, collector.Enabled()) + }) + } +} + +func Test_collectColdPageInfo(t *testing.T) { + testNow := time.Now() + testContainerID := "containerd://123abc" + testPodMetaDir := "kubepods.slice/kubepods-podxxxxxxxx.slice" + testPodParentDir := "/kubepods.slice/kubepods-podxxxxxxxx.slice" + testContainerParentDir := "/kubepods.slice/kubepods-podxxxxxxxx.slice/cri-containerd-123abc.scope" + testMemoryIdlePageStatsContent := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0` + testMemStat := ` + total_cache 104857600 + total_rss 104857600 + 
total_inactive_anon 104857600 + total_active_anon 0 + total_inactive_file 104857600 + total_active_file 0 + total_unevictable 0 + ` + meminfo := `MemTotal: 1048576 kB + MemFree: 262144 kB + MemAvailable: 524288 kB + Buffers: 0 kB + Cached: 262144 kB + SwapCached: 0 kB + Active: 524288 kB + Inactive: 262144 kB + Active(anon): 262144 kB + Inactive(anon): 262144 kB + Active(file): 0 kB + Inactive(file): 262144 kB + Unevictable: 0 kB + Mlocked: 0 kB + SwapTotal: 0 kB + SwapFree: 0 kB + Dirty: 0 kB + Writeback: 0 kB + AnonPages: 0 kB + Mapped: 0 kB + Shmem: 0 kB + Slab: 0 kB + SReclaimable: 0 kB + SUnreclaim: 0 kB + KernelStack: 0 kB + PageTables: 0 kB + NFS_Unstable: 0 kB + Bounce: 0 kB + WritebackTmp: 0 kB + CommitLimit: 0 kB + Committed_AS: 0 kB + VmallocTotal: 0 kB + VmallocUsed: 0 kB + VmallocChunk: 0 kB + HardwareCorrupted: 0 kB + AnonHugePages: 0 kB + ShmemHugePages: 0 kB + ShmemPmdMapped: 0 kB + CmaTotal: 0 kB + CmaFree: 0 kB + HugePages_Total: 0 + HugePages_Free: 0 + HugePages_Rsvd: 0 + HugePages_Surp: 0 + Hugepagesize: 0 kB + DirectMap4k: 0 kB + DirectMap2M: 0 kB + DirectMap1G: 0 kB` + testPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test", + UID: "xxxxxxxx", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + ContainerID: testContainerID, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{}, + }, + }, + }, + }, + } + type fields struct { + podFilterOption framework.PodFilter + getPodMetas []*statesinformer.PodMeta + initPodLastStat func(lastState *gocache.Cache) + initContainerLastStat func(lastState *gocache.Cache) + SetSysUtil func(helper *system.FileTestUtil) + } + tests := []struct { + name string + fields fields + wantstrated bool + }{ + { + name: "success collect node, pod and container cold page info for cgroup v1", + fields: fields{ + podFilterOption: framework.DefaultPodFilter, + getPodMetas: 
[]*statesinformer.PodMeta{ + { + CgroupDir: testPodMetaDir, + Pod: testPod, + }, + }, + initPodLastStat: func(lastState *gocache.Cache) { + lastState.Set(string(testPod.UID), framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + initContainerLastStat: func(lastState *gocache.Cache) { + lastState.Set(testContainerID, framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + helper.SetResourcesSupported(true, system.MemoryIdlePageStats) + helper.WriteProcSubFileContents(system.ProcMemInfoName, meminfo) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + }, + }, + wantstrated: true, + }, + { + name: "states informer nil", + fields: fields{ + podFilterOption: framework.DefaultPodFilter, + getPodMetas: []*statesinformer.PodMeta{ + { + CgroupDir: testPodMetaDir, + Pod: testPod, + }, + }, + initPodLastStat: func(lastState *gocache.Cache) { + lastState.Set(string(testPod.UID), framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + initContainerLastStat: func(lastState *gocache.Cache) { + lastState.Set(testContainerID, framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + 
helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + helper.SetResourcesSupported(true, system.MemoryIdlePageStats) + helper.WriteProcSubFileContents(system.ProcMemInfoName, meminfo) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + }, + }, + wantstrated: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + metricCache, err := metriccache.NewMetricCache(&metriccache.Config{ + TSDBPath: t.TempDir(), + TSDBEnablePromMetrics: false, + }) + assert.NoError(t, err) + defer func() { + metricCache.Close() + }() + statesInformer := mock_statesinformer.NewMockStatesInformer(ctrl) + if tt.name != "states informer nil" { + statesInformer.EXPECT().HasSynced().Return(true).AnyTimes() + statesInformer.EXPECT().GetAllPods().Return(tt.fields.getPodMetas).Times(1) + } + c := &kidledcoldPageCollector{ + collectInterval: 1 * time.Second, + cgroupReader: resourceexecutor.NewCgroupReader(), + statesInformer: statesInformer, + podFilter: framework.DefaultPodFilter, + appendableDB: metricCache, + metricDB: metricCache, + started: atomic.NewBool(false), + } + if tt.name == "states informer nil" { + c.statesInformer = nil + } + assert.NotPanics(t, func() { + c.collectColdPageInfo() + }) + assert.Equal(t, tt.wantstrated, c.Started()) + }) + } +} + +func Test_collectNodeColdPageInfo(t *testing.T) { + // test collect success + idleInfoContentStr := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # 
scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0` + meminfo := `MemTotal: 1048576 kB + MemFree: 262144 kB + MemAvailable: 524288 kB + Buffers: 0 kB + Cached: 262144 kB + SwapCached: 0 kB + Active: 524288 kB + Inactive: 262144 kB + Active(anon): 262144 kB + Inactive(anon): 262144 kB + Active(file): 0 kB + Inactive(file): 262144 kB + Unevictable: 0 kB + Mlocked: 0 kB + SwapTotal: 0 kB + SwapFree: 0 kB + Dirty: 0 kB + Writeback: 0 kB + AnonPages: 0 kB + Mapped: 0 kB + Shmem: 0 kB + Slab: 0 kB + SReclaimable: 0 kB + SUnreclaim: 0 kB + KernelStack: 0 kB + PageTables: 0 kB + NFS_Unstable: 0 kB + Bounce: 0 kB + WritebackTmp: 0 kB + CommitLimit: 0 kB + Committed_AS: 0 kB + VmallocTotal: 0 kB + VmallocUsed: 0 kB + VmallocChunk: 0 kB + HardwareCorrupted: 0 kB + AnonHugePages: 0 kB + ShmemHugePages: 0 kB + ShmemPmdMapped: 0 kB + CmaTotal: 0 kB + CmaFree: 0 kB + HugePages_Total: 0 + HugePages_Free: 0 + HugePages_Rsvd: 0 + HugePages_Surp: 0 + Hugepagesize: 0 kB + DirectMap4k: 0 kB + DirectMap2M: 0 kB + DirectMap1G: 0 kB` + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + ctrl := gomock.NewController(t) + defer ctrl.Finish() + metricCache, err := 
metriccache.NewMetricCache(&metriccache.Config{ + TSDBPath: t.TempDir(), + TSDBEnablePromMetrics: false, + }) + assert.NoError(t, err) + defer func() { + metricCache.Close() + }() + statesInformer := mock_statesinformer.NewMockStatesInformer(ctrl) + helper.CreateCgroupFile("", system.MemoryIdlePageStats) + helper.SetResourcesSupported(true, system.MemoryIdlePageStats) + helper.WriteCgroupFileContents("", system.MemoryIdlePageStats, idleInfoContentStr) + helper.WriteProcSubFileContents(system.ProcMemInfoName, meminfo) + c := &kidledcoldPageCollector{ + collectInterval: 5 * time.Second, + cgroupReader: resourceexecutor.NewCgroupReader(), + statesInformer: statesInformer, + podFilter: framework.DefaultPodFilter, + appendableDB: metricCache, + metricDB: metricCache, + started: atomic.NewBool(false), + } + testNow := time.Now() + metrics, err := c.collectNodeColdPageInfo() + assert.NoError(t, err) + appender := c.appendableDB.Appender() + if err := appender.Append(metrics); err != nil { + klog.ErrorS(err, "Append node metrics error") + return + } + + if err := appender.Commit(); err != nil { + klog.Warningf("Commit node metrics failed, reason: %v", err) + return + } + assert.NoError(t, err) + got1, got2 := testGetNodeMetrics(t, c.metricDB, testNow, 5*time.Second) + assert.Equal(t, float64(18446744073419457000), got1) + assert.Equal(t, float64(1363836928), got2) + // test collect failed + helper.WriteCgroupFileContents("", system.MemoryIdlePageStats, ``) + helper.WriteProcSubFileContents(system.ProcMemInfoName, ``) + t.Log(helper.ReadProcSubFileContents(system.ProcMemInfoName)) + assert.NotPanics(t, func() { + c.collectNodeColdPageInfo() + }) +} + +func Test_collectPodColdPageInfo(t *testing.T) { + testNow := time.Now() + testContainerID := "containerd://123abc" + testPodMetaDir := "kubepods.slice/kubepods-podxxxxxxxx.slice" + testPodParentDir := "/kubepods.slice/kubepods-podxxxxxxxx.slice" + testContainerParentDir := 
"/kubepods.slice/kubepods-podxxxxxxxx.slice/cri-containerd-123abc.scope" + testMemoryIdlePageStatsContent := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0` + testMemStat := ` + total_cache 104857600 + total_rss 104857600 + total_inactive_anon 104857600 + total_active_anon 0 + total_inactive_file 104857600 + total_active_file 0 + total_unevictable 0 + ` + testPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "test", + UID: "xxxxxxxx", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + ContainerID: testContainerID, + State: corev1.ContainerState{ + Running: &corev1.ContainerStateRunning{}, + }, + }, + }, + }, + } + testFailedPod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-failed-pod", + Namespace: "test", + UID: "yyyyyy", + }, + Status: corev1.PodStatus{ + Phase: corev1.PodFailed, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "test-container", + ContainerID: testContainerID, + }, + }, + }, + } + type fields struct { + podFilterOption framework.PodFilter + 
getPodMetas []*statesinformer.PodMeta + initPodLastStat func(lastState *gocache.Cache) + initContainerLastStat func(lastState *gocache.Cache) + SetSysUtil func(helper *system.FileTestUtil) + } + type wantFields struct { + podResourceMetric bool + containerResourceMetric bool + } + tests := []struct { + name string + fields fields + want wantFields + }{ + { + name: "success collect pod cold page info for cgroup v1", + fields: fields{ + podFilterOption: framework.DefaultPodFilter, + getPodMetas: []*statesinformer.PodMeta{ + { + CgroupDir: testPodMetaDir, + Pod: testPod, + }, + }, + initPodLastStat: func(lastState *gocache.Cache) { + lastState.Set(string(testPod.UID), framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + initContainerLastStat: func(lastState *gocache.Cache) { + lastState.Set(testContainerID, framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + helper.SetResourcesSupported(true, system.MemoryIdlePageStats) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + }, + }, + want: wantFields{ + podResourceMetric: true, + containerResourceMetric: true, + }, + }, + { + name: "cgroups v1, filter non-running pods", + fields: fields{ + podFilterOption: &framework.TerminatedPodFilter{}, + getPodMetas: []*statesinformer.PodMeta{ + { + CgroupDir: testPodMetaDir, + Pod: testPod, + }, + { + Pod: testFailedPod, + }, + 
}, + initPodLastStat: func(lastState *gocache.Cache) { + lastState.Set(string(testPod.UID), framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + initContainerLastStat: func(lastState *gocache.Cache) { + lastState.Set(testContainerID, framework.CPUStat{ + CPUUsage: 0, + Timestamp: testNow.Add(-time.Second), + }, gocache.DefaultExpiration) + }, + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteFileContents(system.KidledScanPeriodInSeconds.Path(""), `120`) + helper.WriteFileContents(system.KidledUseHierarchy.Path(""), `1`) + helper.SetResourcesSupported(true, system.MemoryIdlePageStats) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, testMemStat) + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryIdlePageStats, testMemoryIdlePageStatsContent) + }, + }, + want: wantFields{ + podResourceMetric: true, + containerResourceMetric: true, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + + ctrl := gomock.NewController(t) + defer ctrl.Finish() + + metricCache, err := metriccache.NewMetricCache(&metriccache.Config{ + TSDBPath: t.TempDir(), + TSDBEnablePromMetrics: false, + }) + assert.NoError(t, err) + defer func() { + metricCache.Close() + }() + statesInformer := mock_statesinformer.NewMockStatesInformer(ctrl) + statesInformer.EXPECT().HasSynced().Return(true).AnyTimes() + statesInformer.EXPECT().GetAllPods().Return(tt.fields.getPodMetas).Times(1) + c := &kidledcoldPageCollector{ + collectInterval: 1 * time.Second, + cgroupReader: resourceexecutor.NewCgroupReader(), + statesInformer: statesInformer, + 
podFilter: framework.DefaultPodFilter, + appendableDB: metricCache, + metricDB: metricCache, + started: atomic.NewBool(false), + } + assert.NotPanics(t, func() { + c.collectPodsColdPageInfo() + }) + }) + } + +} + +func testGetNodeMetrics(t *testing.T, metricCache metriccache.TSDBStorage, testNow time.Time, d time.Duration) (float64, float64) { + testStart := testNow.Add(-d) + testEnd := testNow.Add(d) + queryParam := metriccache.QueryParam{ + Start: &testStart, + End: &testEnd, + Aggregate: metriccache.AggregationTypeAVG, + } + querier, err := metricCache.Querier(*queryParam.Start, *queryParam.End) + assert.NoError(t, err) + memWithHotPageCacheAggregateResult, err := testQuery(querier, metriccache.NodeMemoryWithHotPageUsageMetric, nil) + assert.NoError(t, err) + memWithHotPageCacheUsed, err := memWithHotPageCacheAggregateResult.Value(queryParam.Aggregate) + assert.NoError(t, err) + coldPageSizeAggregateResult, err := testQuery(querier, metriccache.NodeMemoryColdPageSizeMetric, nil) + assert.NoError(t, err) + coldPageSize, err := coldPageSizeAggregateResult.Value(queryParam.Aggregate) + assert.NoError(t, err) + return memWithHotPageCacheUsed, coldPageSize +} + +func testQuery(querier metriccache.Querier, resource metriccache.MetricResource, properties map[metriccache.MetricProperty]string) (metriccache.AggregateResult, error) { + queryMeta, err := resource.BuildQueryMeta(properties) + if err != nil { + return nil, err + } + aggregateResult := metriccache.DefaultAggregateResultFactory.New(queryMeta) + if err = querier.Query(queryMeta, nil, aggregateResult); err != nil { + return nil, err + } + return aggregateResult, nil +} diff --git a/pkg/koordlet/metricsadvisor/framework/config.go b/pkg/koordlet/metricsadvisor/framework/config.go index 65bc4b739..ab28e4e46 100644 --- a/pkg/koordlet/metricsadvisor/framework/config.go +++ b/pkg/koordlet/metricsadvisor/framework/config.go @@ -34,6 +34,7 @@ type Config struct { CPICollectorInterval time.Duration PSICollectorInterval 
time.Duration CPICollectorTimeWindow time.Duration + ColdPageCollectorInterval time.Duration } func NewDefaultConfig() *Config { @@ -45,6 +46,7 @@ func NewDefaultConfig() *Config { CPICollectorInterval: 60 * time.Second, PSICollectorInterval: 10 * time.Second, CPICollectorTimeWindow: 10 * time.Second, + ColdPageCollectorInterval: 5 * time.Second, } } @@ -56,4 +58,5 @@ func (c *Config) InitFlags(fs *flag.FlagSet) { fs.DurationVar(&c.CPICollectorInterval, "cpi-collector-interval", c.CPICollectorInterval, "Collect cpi interval. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h).") fs.DurationVar(&c.PSICollectorInterval, "psi-collector-interval", c.PSICollectorInterval, "Collect psi interval. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h).") fs.DurationVar(&c.CPICollectorTimeWindow, "collect-cpi-timewindow", c.CPICollectorTimeWindow, "Collect cpi time window. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h).") + fs.DurationVar(&c.ColdPageCollectorInterval, "coldpage-collector-interval", c.ColdPageCollectorInterval, "Collect cold page interval. Non-zero values should contain a corresponding time unit (e.g. 
1s, 2m, 3h).") } diff --git a/pkg/koordlet/metricsadvisor/framework/config_test.go b/pkg/koordlet/metricsadvisor/framework/config_test.go index 971c6a90b..e2b4accb9 100644 --- a/pkg/koordlet/metricsadvisor/framework/config_test.go +++ b/pkg/koordlet/metricsadvisor/framework/config_test.go @@ -33,6 +33,7 @@ func Test_NewDefaultConfig(t *testing.T) { CPICollectorInterval: 60 * time.Second, PSICollectorInterval: 10 * time.Second, CPICollectorTimeWindow: 10 * time.Second, + ColdPageCollectorInterval: 5 * time.Second, } defaultConfig := NewDefaultConfig() assert.Equal(t, expectConfig, defaultConfig) @@ -48,6 +49,7 @@ func Test_InitFlags(t *testing.T) { "--cpi-collector-interval=90s", "--psi-collector-interval=5s", "--collect-cpi-timewindow=15s", + "--coldpage-collector-interval=15s", } fs := flag.NewFlagSet(cmdArgs[0], flag.ExitOnError) @@ -59,6 +61,7 @@ func Test_InitFlags(t *testing.T) { CPICollectorInterval time.Duration PSICollectorInterval time.Duration CPICollectorTimeWindow time.Duration + ColdPageCollectorInterval time.Duration } type args struct { fs *flag.FlagSet @@ -78,6 +81,7 @@ func Test_InitFlags(t *testing.T) { CPICollectorInterval: 90 * time.Second, PSICollectorInterval: 5 * time.Second, CPICollectorTimeWindow: 15 * time.Second, + ColdPageCollectorInterval: 15 * time.Second, }, args: args{fs: fs}, }, @@ -92,6 +96,7 @@ func Test_InitFlags(t *testing.T) { CPICollectorInterval: tt.fields.CPICollectorInterval, PSICollectorInterval: tt.fields.PSICollectorInterval, CPICollectorTimeWindow: tt.fields.CPICollectorTimeWindow, + ColdPageCollectorInterval: tt.fields.ColdPageCollectorInterval, } c := NewDefaultConfig() c.InitFlags(tt.args.fs) diff --git a/pkg/koordlet/metricsadvisor/plugins_profile.go b/pkg/koordlet/metricsadvisor/plugins_profile.go index ba1dd217a..d8d4e26fa 100644 --- a/pkg/koordlet/metricsadvisor/plugins_profile.go +++ b/pkg/koordlet/metricsadvisor/plugins_profile.go @@ -18,6 +18,7 @@ package metricsadvisor import ( 
"github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/collectors/beresource" + "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/collectors/coldmemoryresource" "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/collectors/nodeinfo" "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/collectors/noderesource" "github.com/koordinator-sh/koordinator/pkg/koordlet/metricsadvisor/collectors/nodestorageinfo" @@ -37,14 +38,15 @@ var ( } collectorPlugins = map[string]framework.CollectorFactory{ - noderesource.CollectorName: noderesource.New, - beresource.CollectorName: beresource.New, - nodeinfo.CollectorName: nodeinfo.New, - nodestorageinfo.CollectorName: nodestorageinfo.New, - podresource.CollectorName: podresource.New, - podthrottled.CollectorName: podthrottled.New, - performance.CollectorName: performance.New, - sysresource.CollectorName: sysresource.New, + noderesource.CollectorName: noderesource.New, + beresource.CollectorName: beresource.New, + nodeinfo.CollectorName: nodeinfo.New, + nodestorageinfo.CollectorName: nodestorageinfo.New, + podresource.CollectorName: podresource.New, + podthrottled.CollectorName: podthrottled.New, + performance.CollectorName: performance.New, + sysresource.CollectorName: sysresource.New, + coldmemoryresource.CollectorName: coldmemoryresource.New, } podFilters = map[string]framework.PodFilter{ diff --git a/pkg/koordlet/resourceexecutor/reader.go b/pkg/koordlet/resourceexecutor/reader.go index 3fb6befa7..6cbd2f181 100644 --- a/pkg/koordlet/resourceexecutor/reader.go +++ b/pkg/koordlet/resourceexecutor/reader.go @@ -38,6 +38,7 @@ type CgroupReader interface { ReadMemoryNumaStat(parentDir string) ([]sysutil.NumaMemoryPages, error) ReadCPUTasks(parentDir string) ([]int32, error) ReadPSI(parentDir string) (*PSIByResource, error) + ReadMemoryColdPageUsage(parentDir string) (uint64, error) } var _ CgroupReader = &CgroupV1Reader{} @@ -198,6 +199,22 @@ func (r *CgroupV1Reader) 
ReadCPUTasks(parentDir string) ([]int32, error) { return readCgroupAndParseInt32Slice(parentDir, resource) } +func (r *CgroupV1Reader) ReadMemoryColdPageUsage(parentDir string) (uint64, error) { + resource, ok := sysutil.DefaultRegistry.Get(sysutil.CgroupVersionV1, sysutil.MemoryIdlePageStatsName) + if !ok { + return 0, ErrResourceNotRegistered + } + s, err := cgroupFileRead(parentDir, resource) + if err != nil { + return 0, err + } + v, err := sysutil.ParseMemoryIdlePageStats(s) + if err != nil { + return 0, err + } + return v.GetColdPageTotalBytes(), nil +} + var _ CgroupReader = &CgroupV2Reader{} type CgroupV2Reader struct{} @@ -385,6 +402,11 @@ func (r *CgroupV2Reader) ReadPSI(parentDir string) (*PSIByResource, error) { return psi, nil } +// cgroup v2 has not implemented yet +func (r *CgroupV2Reader) ReadMemoryColdPageUsage(parentDir string) (uint64, error) { + return 0, ErrResourceNotRegistered +} + func NewCgroupReader() CgroupReader { if sysutil.GetCurrentCgroupVersion() == sysutil.CgroupVersionV2 { return &CgroupV2Reader{} diff --git a/pkg/koordlet/resourceexecutor/reader_test.go b/pkg/koordlet/resourceexecutor/reader_test.go index 85b2cd6e5..05cb7d46c 100644 --- a/pkg/koordlet/resourceexecutor/reader_test.go +++ b/pkg/koordlet/resourceexecutor/reader_test.go @@ -1341,3 +1341,108 @@ func TestCgroupReader_ReadPSI(t *testing.T) { }) } } + +func TestCgroupReader_ReadColdPageUsage(t *testing.T) { + type fields struct { + UseCgroupsV2 bool + MemoryIdlePageStatsValue string + } + type args struct { + parentDir string + } + tests := []struct { + name string + fields fields + args args + want uint64 + wantErr bool + }{ + { + name: "parse v1 value successfully", + fields: fields{ + UseCgroupsV2: false, + MemoryIdlePageStatsValue: `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> 
inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0`, + }, + args: args{ + parentDir: "/kubepods.slice", + }, + want: uint64(1363836928), + wantErr: false, + }, + { + name: "parse v1 value failed", + fields: fields{ + UseCgroupsV2: false, + MemoryIdlePageStatsValue: `abc`, + }, + args: args{ + parentDir: "/kubepods.slice", + }, + want: 0, + wantErr: true, + }, + { + name: "v1 path not exist", + fields: fields{}, + args: args{ + parentDir: "/kubepods.slice", + }, + want: 0, + wantErr: true, + }, + { + name: "cgroup v2 not registered", + fields: fields{ + UseCgroupsV2: true, + }, + args: args{ + parentDir: "/kubepods.slice", + }, + want: uint64(0), + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := sysutil.NewFileTestUtil(t) + defer helper.Cleanup() + helper.SetCgroupsV2(tt.fields.UseCgroupsV2) + if tt.fields.MemoryIdlePageStatsValue != "" { + helper.WriteCgroupFileContents(tt.args.parentDir, sysutil.MemoryIdlePageStats, tt.fields.MemoryIdlePageStatsValue) + } + got, gotErr := NewCgroupReader().ReadMemoryColdPageUsage(tt.args.parentDir) + assert.Equal(t, tt.wantErr, gotErr != nil) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/pkg/koordlet/statesinformer/impl/states_nodemetric.go b/pkg/koordlet/statesinformer/impl/states_nodemetric.go index 1843efb38..8e5dfa68e 100644 --- 
a/pkg/koordlet/statesinformer/impl/states_nodemetric.go +++ b/pkg/koordlet/statesinformer/impl/states_nodemetric.go @@ -50,6 +50,7 @@ import ( "github.com/koordinator-sh/koordinator/pkg/koordlet/prediction" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" koordletutil "github.com/koordinator-sh/koordinator/pkg/koordlet/util" + "github.com/koordinator-sh/koordinator/pkg/koordlet/util/system" "github.com/koordinator-sh/koordinator/pkg/util" ) @@ -69,9 +70,9 @@ const ( ) var ( - scheme = runtime.NewScheme() - - defaultNodeMetricSpec = slov1alpha1.NodeMetricSpec{ + scheme = runtime.NewScheme() + defaultMemoryCollectPolicy slov1alpha1.NodeMemoryCollectPolicy = slov1alpha1.UsageWithoutPageCache + defaultNodeMetricSpec = slov1alpha1.NodeMetricSpec{ CollectPolicy: &slov1alpha1.NodeMetricCollectPolicy{ AggregateDurationSeconds: pointer.Int64(defaultAggregateDurationSeconds), ReportIntervalSeconds: pointer.Int64(defaultReportIntervalSeconds), @@ -82,6 +83,7 @@ var ( {Duration: 30 * time.Minute}, }, }, + NodeMemoryCollectPolicy: &defaultMemoryCollectPolicy, }, } ) @@ -444,11 +446,20 @@ func (r *nodeMetricInformer) collectNodeMetric(queryparam metriccache.QueryParam return rl, 0, err } - memAggregateResult, err := doQuery(querier, metriccache.NodeMemoryUsageMetric, nil) - if err != nil { - return rl, 0, err + var memAggregateResult metriccache.AggregateResult + // report usageMemoryWithHotPageCache; a nil CollectPolicy or NodeMemoryCollectPolicy falls through to the default branch instead of panicking + if cp := r.getNodeMetricSpec().CollectPolicy; cp != nil && cp.NodeMemoryCollectPolicy != nil && *cp.NodeMemoryCollectPolicy == slov1alpha1.UsageWithHotPageCache && system.GetIsStartColdMemory() { + memAggregateResult, err = doQuery(querier, metriccache.NodeMemoryWithHotPageUsageMetric, nil) + if err != nil { + return rl, 0, err + } + } else { + // degrade and apply default memory reporting policy: usageWithoutPageCache + memAggregateResult, err = doQuery(querier, metriccache.NodeMemoryUsageMetric, nil) + if err != nil { + return rl, 0, err + } } - memUsed, err := memAggregateResult.Value(queryparam.Aggregate) if err != 
nil { return rl, 0, err @@ -628,12 +639,18 @@ func (r *nodeMetricInformer) collectPodMetric(podMeta *statesinformer.PodMeta, q if err != nil { return nil, err } - - memAggregateResult, err := doQuery(querier, metriccache.PodMemUsageMetric, metriccache.MetricPropertiesFunc.Pod(podUID)) - if err != nil { - return nil, err + var memAggregateResult metriccache.AggregateResult + if cp := r.getNodeMetricSpec().CollectPolicy; cp != nil && cp.NodeMemoryCollectPolicy != nil && *cp.NodeMemoryCollectPolicy == slov1alpha1.UsageWithHotPageCache && system.GetIsStartColdMemory() { // a nil policy degrades to the default pod memory usage metric + memAggregateResult, err = doQuery(querier, metriccache.PodMemoryWithHotPageUsageMetric, metriccache.MetricPropertiesFunc.Pod(podUID)) + if err != nil { + return nil, err + } + } else { + memAggregateResult, err = doQuery(querier, metriccache.PodMemUsageMetric, metriccache.MetricPropertiesFunc.Pod(podUID)) + if err != nil { + return nil, err + } } - memUsed, err := memAggregateResult.Value(queryParam.Aggregate) if err != nil { return nil, err diff --git a/pkg/koordlet/statesinformer/impl/states_nodemetric_test.go b/pkg/koordlet/statesinformer/impl/states_nodemetric_test.go index 154d3bd40..349ec6a5a 100644 --- a/pkg/koordlet/statesinformer/impl/states_nodemetric_test.go +++ b/pkg/koordlet/statesinformer/impl/states_nodemetric_test.go @@ -46,6 +46,7 @@ import ( "github.com/koordinator-sh/koordinator/pkg/koordlet/prediction" "github.com/koordinator-sh/koordinator/pkg/koordlet/statesinformer" "github.com/koordinator-sh/koordinator/pkg/koordlet/util" + "github.com/koordinator-sh/koordinator/pkg/koordlet/util/system" ) var _ listerv1alpha1.NodeMetricLister = &fakeNodeMetricLister{} @@ -216,6 +217,7 @@ func Test_reporter_sync_with_single_node_metric(t *testing.T) { }, }, }, + NodeMemoryCollectPolicy: defaultNodeMetricSpec.CollectPolicy.NodeMemoryCollectPolicy, }, }, }, @@ -552,6 +554,7 @@ func Test_nodeMetricInformer_collectNodeAggregateMetric(t *testing.T) { {Duration: 5 * time.Minute}, }, }, + NodeMemoryCollectPolicy: 
defaultNodeMetricSpec.CollectPolicy.NodeMemoryCollectPolicy, }, }, }, @@ -640,6 +643,7 @@ func Test_nodeMetricInformer_updateMetricSpec(t *testing.T) { AggregateDurationSeconds: defaultNodeMetricSpec.CollectPolicy.AggregateDurationSeconds, ReportIntervalSeconds: pointer.Int64(180), NodeAggregatePolicy: defaultNodeMetricSpec.CollectPolicy.NodeAggregatePolicy, + NodeMemoryCollectPolicy: defaultNodeMetricSpec.CollectPolicy.NodeMemoryCollectPolicy, }, }, }, @@ -825,7 +829,8 @@ func Test_nodeMetricInformer_collectNodeMetric(t *testing.T) { startTime := now.Add(-time.Second * 120) type args struct { - queryparam metriccache.QueryParam + queryparam metriccache.QueryParam + memoryCollectPolicy slov1alpha1.NodeMemoryCollectPolicy } type samples struct { CPUUsed float64 @@ -839,9 +844,26 @@ func Test_nodeMetricInformer_collectNodeMetric(t *testing.T) { want1 time.Duration }{ { - name: "test-1", + name: "test-1 report usageWithoutPageCache", args: args{ - queryparam: metriccache.QueryParam{Start: &startTime, End: &now, Aggregate: metriccache.AggregationTypeAVG}, + queryparam: metriccache.QueryParam{Start: &startTime, End: &now, Aggregate: metriccache.AggregationTypeAVG}, + memoryCollectPolicy: "usageWithoutPageCache", + }, + samples: samples{ + CPUUsed: 2, + MemUsed: 10 * 1024 * 1024 * 1024, + }, + want: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(2000, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(10*1024*1024*1024, resource.BinarySI), + }, + want1: now.Sub(startTime), + }, + { + name: "test-2 report usageWithHotPageCache", + args: args{ + queryparam: metriccache.QueryParam{Start: &startTime, End: &now, Aggregate: metriccache.AggregationTypeAVG}, + memoryCollectPolicy: "usageWithHotPageCache", }, samples: samples{ CPUUsed: 2, @@ -856,6 +878,9 @@ func Test_nodeMetricInformer_collectNodeMetric(t *testing.T) { } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { + if tt.args.memoryCollectPolicy == slov1alpha1.UsageWithHotPageCache 
{ + system.SetIsStartColdMemory(true) + } mockMetricCache := mockmetriccache.NewMockMetricCache(ctrl) mockResultFactory := mockmetriccache.NewMockAggregateResultFactory(ctrl) metriccache.DefaultAggregateResultFactory = mockResultFactory @@ -868,11 +893,15 @@ func Test_nodeMetricInformer_collectNodeMetric(t *testing.T) { buildMockQueryResult(ctrl, mockQuerier, mockResultFactory, cpuQueryMeta, tt.samples.CPUUsed, duration) memQueryMeta, err := metriccache.NodeMemoryUsageMetric.BuildQueryMeta(nil) + if tt.args.memoryCollectPolicy == slov1alpha1.UsageWithHotPageCache { + memQueryMeta, err = metriccache.NodeMemoryWithHotPageUsageMetric.BuildQueryMeta(nil) + } assert.NoError(t, err) buildMockQueryResult(ctrl, mockQuerier, mockResultFactory, memQueryMeta, tt.samples.MemUsed, duration) r := &nodeMetricInformer{ metricCache: mockMetricCache, } + r.getNodeMetricSpec().CollectPolicy.NodeMemoryCollectPolicy = &tt.args.memoryCollectPolicy got, got1, err := r.collectNodeMetric(tt.args.queryparam) assert.NoError(t, err) assert.Equalf(t, tt.want, got, "collectNodeMetric(%v)", tt.args.queryparam) @@ -881,6 +910,99 @@ func Test_nodeMetricInformer_collectNodeMetric(t *testing.T) { } } +func Test_nodeMetricInformer_collectPodMetric(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + now := time.Now() + startTime := now.Add(-time.Second * 120) + + type args struct { + queryparam metriccache.QueryParam + memoryCollectPolicy slov1alpha1.NodeMemoryCollectPolicy + pod *statesinformer.PodMeta + } + type samples struct { + CPUUsed float64 + MemUsed float64 + } + tests := []struct { + name string + args args + samples samples + }{ + { + name: "test-1 report usageWithoutPageCache", + args: args{ + queryparam: metriccache.QueryParam{Start: &startTime, End: &now, Aggregate: metriccache.AggregationTypeAVG}, + memoryCollectPolicy: "usageWithoutPageCache", + pod: &statesinformer.PodMeta{ + Pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: 
"default", + UID: "test-pod", + }, + }, + }, + }, + samples: samples{ + CPUUsed: 2, + MemUsed: 10 * 1024 * 1024 * 1024, + }, + }, + { + name: "test-2 report usageWithHotPageCache", + args: args{ + queryparam: metriccache.QueryParam{Start: &startTime, End: &now, Aggregate: metriccache.AggregationTypeAVG}, + memoryCollectPolicy: "usageWithHotPageCache", + pod: &statesinformer.PodMeta{ + Pod: &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-pod", + Namespace: "default", + UID: "test-pod", + }, + }, + }, + }, + samples: samples{ + CPUUsed: 2, + MemUsed: 10 * 1024 * 1024 * 1024, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if tt.args.memoryCollectPolicy == slov1alpha1.UsageWithHotPageCache { + system.SetIsStartColdMemory(true) + } + mockMetricCache := mockmetriccache.NewMockMetricCache(ctrl) + mockResultFactory := mockmetriccache.NewMockAggregateResultFactory(ctrl) + metriccache.DefaultAggregateResultFactory = mockResultFactory + mockQuerier := mockmetriccache.NewMockQuerier(ctrl) + mockMetricCache.EXPECT().Querier(gomock.Any(), gomock.Any()).Return(mockQuerier, nil).AnyTimes() + + duration := tt.args.queryparam.End.Sub(*tt.args.queryparam.Start) + cpuQueryMeta, err := metriccache.PodCPUUsageMetric.BuildQueryMeta(metriccache.MetricPropertiesFunc.Pod(string(tt.args.pod.Pod.UID))) + assert.NoError(t, err) + buildMockQueryResult(ctrl, mockQuerier, mockResultFactory, cpuQueryMeta, tt.samples.CPUUsed, duration) + + memQueryMeta, err := metriccache.PodMemUsageMetric.BuildQueryMeta(metriccache.MetricPropertiesFunc.Pod(string(tt.args.pod.Pod.UID))) + if tt.args.memoryCollectPolicy == slov1alpha1.UsageWithHotPageCache { + memQueryMeta, err = metriccache.PodMemoryWithHotPageUsageMetric.BuildQueryMeta(metriccache.MetricPropertiesFunc.Pod(string(tt.args.pod.Pod.UID))) + } + assert.NoError(t, err) + buildMockQueryResult(ctrl, mockQuerier, mockResultFactory, memQueryMeta, tt.samples.MemUsed, duration) + r := &nodeMetricInformer{ + 
metricCache: mockMetricCache, + } + r.getNodeMetricSpec().CollectPolicy.NodeMemoryCollectPolicy = &tt.args.memoryCollectPolicy + _, err = r.collectPodMetric(tt.args.pod, tt.args.queryparam) + assert.NoError(t, err) + }) + } +} + func buildMockQueryResult(ctrl *gomock.Controller, querier *mockmetriccache.MockQuerier, factory *mockmetriccache.MockAggregateResultFactory, queryMeta metriccache.MetricMeta, value float64, duration time.Duration) { result := mockmetriccache.NewMockAggregateResult(ctrl) diff --git a/pkg/koordlet/util/cold_page.go b/pkg/koordlet/util/cold_page.go new file mode 100644 index 000000000..6c1854821 --- /dev/null +++ b/pkg/koordlet/util/cold_page.go @@ -0,0 +1,44 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package util + +import ( + "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" +) + +func GetNodeMemUsageWithHotPage(coldPageUsage uint64) (uint64, error) { + memInfo, err := GetMemInfo() + if err != nil { + return 0, err + } + return memInfo.MemTotal*1024 - memInfo.MemFree*1024 - coldPageUsage, nil +} + +func GetPodMemUsageWithHotPage(cgroupReader resourceexecutor.CgroupReader, parentDir string, coldPageUsage uint64) (uint64, error) { + memStat, err := cgroupReader.ReadMemoryStat(parentDir) + if err != nil { + return 0, err + } + return uint64(memStat.Usage()) + uint64(memStat.ActiveFile+memStat.InactiveFile) - coldPageUsage, nil +} + +func GetContainerMemUsageWithHotPage(cgroupReader resourceexecutor.CgroupReader, parentDir string, coldPageUsage uint64) (uint64, error) { + memStat, err := cgroupReader.ReadMemoryStat(parentDir) + if err != nil { + return 0, err + } + return uint64(memStat.Usage()) + uint64(memStat.ActiveFile+memStat.InactiveFile) - coldPageUsage, nil +} diff --git a/pkg/koordlet/util/cold_page_test.go b/pkg/koordlet/util/cold_page_test.go new file mode 100644 index 000000000..fcd07c547 --- /dev/null +++ b/pkg/koordlet/util/cold_page_test.go @@ -0,0 +1,240 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package util + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/koordinator-sh/koordinator/pkg/koordlet/resourceexecutor" + "github.com/koordinator-sh/koordinator/pkg/koordlet/util/system" +) + +func Test_GetNodeMemUsageWithHotPage(t *testing.T) { + testMemInfo := `MemTotal: 263432804 kB +MemFree: 254391744 kB +MemAvailable: 256703236 kB +Buffers: 958096 kB +Cached: 3763224 kB +SwapCached: 0 kB +Active: 2786012 kB +Inactive: 2223752 kB +Active(anon): 289488 kB +Inactive(anon): 1300 kB +Active(file): 2496524 kB +Inactive(file): 2222452 kB +Unevictable: 0 kB +Mlocked: 0 kB +SwapTotal: 0 kB +SwapFree: 0 kB +Dirty: 624 kB +Writeback: 0 kB +AnonPages: 281748 kB +Mapped: 495936 kB +Shmem: 2340 kB +Slab: 1097040 kB +SReclaimable: 445164 kB +SUnreclaim: 651876 kB +KernelStack: 20944 kB +PageTables: 7896 kB +NFS_Unstable: 0 kB +Bounce: 0 kB +WritebackTmp: 0 kB +CommitLimit: 131716400 kB +Committed_AS: 3825364 kB +VmallocTotal: 34359738367 kB +VmallocUsed: 0 kB +VmallocChunk: 0 kB +HardwareCorrupted: 0 kB +AnonHugePages: 38912 kB +ShmemHugePages: 0 kB +ShmemPmdMapped: 0 kB +CmaTotal: 0 kB +CmaFree: 0 kB +HugePages_Total: 0 +HugePages_Free: 0 +HugePages_Rsvd: 0 +HugePages_Surp: 0 +Hugepagesize: 2048 kB +DirectMap4k: 414760 kB +DirectMap2M: 8876032 kB +DirectMap1G: 261095424 kB` + + type fields struct { + SetSysUtil func(helper *system.FileTestUtil) + } + tests := []struct { + name string + fields fields + want uint64 + wantErr bool + }{ + { + name: "read legal nodeMemUsageWithHotPage", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteProcSubFileContents(system.ProcMemInfoName, testMemInfo) + }, + }, + want: uint64((263432804-254391744)<<10) - 100, + wantErr: false, + }, + { + name: "path not exit", + want: uint64(0), + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + 
tt.fields.SetSysUtil(helper) + } + got, err := GetNodeMemUsageWithHotPage(100) + assert.Equal(t, tt.wantErr, err != nil) + assert.Equal(t, tt.want, got) + }) + } +} + +func Test_GetPodMemUsageWithHotPage(t *testing.T) { + testPodParentDir := "/kubepods.slice/kubepods-podxxxxxxxx.slice" + type fields struct { + SetSysUtil func(helper *system.FileTestUtil) + } + tests := []struct { + name string + fields fields + want uint64 + wantErr bool + }{ + { + name: "read legal podMemUsageWithHotPage", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, ` +total_cache 104857600 +total_rss 104857600 +total_inactive_anon 104857600 +total_active_anon 0 +total_inactive_file 104857600 +total_active_file 0 +total_unevictable 0 +`) + }, + }, + want: uint64(209715200) - 100, + wantErr: false, + }, + { + name: "read illegal podMemUsageWithHotPage", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteCgroupFileContents(testPodParentDir, system.MemoryStat, ` +total_cache 104857600 +totalxxx_rss 104857600 +total_inactive_anon 104857600 +total_active_anon 0 +total_inactive_file 104857600 +total_active_file 0 +total_unevictable 0 +`) + }, + }, + want: uint64(0), + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + cgroupReader := resourceexecutor.NewCgroupReader() + got, err := GetPodMemUsageWithHotPage(cgroupReader, testPodParentDir, 100) + assert.Equal(t, tt.wantErr, err != nil) + assert.Equal(t, tt.want, got) + }) + } +} + +func Test_GetContainerMemUsageWithHotPage(t *testing.T) { + testContainerParentDir := "/kubepods.slice/kubepods-podxxxxxxxx.slice/cri-containerd-123abc.scope" + type fields struct { + SetSysUtil func(helper *system.FileTestUtil) + } + tests := []struct { + name string + fields 
fields + want uint64 + wantErr bool + }{ + { + name: "read legal podMemUsageWithHotPage", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, ` +total_cache 104857600 +total_rss 104857600 +total_inactive_anon 104857600 +total_active_anon 0 +total_inactive_file 104857600 +total_active_file 0 +total_unevictable 0 +`) + }, + }, + want: uint64(209715200) - 100, + wantErr: false, + }, + { + name: "read illegal podMemUsageWithHotPage", + fields: fields{ + SetSysUtil: func(helper *system.FileTestUtil) { + helper.WriteCgroupFileContents(testContainerParentDir, system.MemoryStat, ` +total_cache 104857600 +totalxxxx_rss 104857600 +total_inactive_anon 104857600 +total_active_anon 0 +total_inactive_file 104857600 +total_active_file 0 +total_unevictable 0 +`) + }, + }, + want: uint64(0), + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := system.NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + cgroupReader := resourceexecutor.NewCgroupReader() + got, err := GetContainerMemUsageWithHotPage(cgroupReader, testContainerParentDir, 100) + assert.Equal(t, tt.wantErr, err != nil) + assert.Equal(t, tt.want, got) + }) + } +} diff --git a/pkg/koordlet/util/system/cgroup_resource.go b/pkg/koordlet/util/system/cgroup_resource.go index ad1434309..2426284d8 100644 --- a/pkg/koordlet/util/system/cgroup_resource.go +++ b/pkg/koordlet/util/system/cgroup_resource.go @@ -160,6 +160,7 @@ const ( MemoryPriorityName = "memory.priority" MemoryUsePriorityOomName = "memory.use_priority_oom" MemoryOomGroupName = "memory.oom.group" + MemoryIdlePageStatsName = "memory.idle_page_stats" BlkioTRIopsName = "blkio.throttle.read_iops_device" BlkioTRBpsName = "blkio.throttle.read_bps_device" @@ -227,6 +228,7 @@ var ( MemoryPriority = DefaultFactory.New(MemoryPriorityName, 
CgroupMemDir).WithValidator(MemoryPriorityValidator).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) MemoryUsePriorityOom = DefaultFactory.New(MemoryUsePriorityOomName, CgroupMemDir).WithValidator(MemoryUsePriorityOomValidator).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) MemoryOomGroup = DefaultFactory.New(MemoryOomGroupName, CgroupMemDir).WithValidator(MemoryOomGroupValidator).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) + MemoryIdlePageStats = DefaultFactory.New(MemoryIdlePageStatsName, CgroupMemDir).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) BlkioReadIops = DefaultFactory.New(BlkioTRIopsName, CgroupBlkioDir).WithValidator(BlkioTRIopsValidator).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) BlkioReadBps = DefaultFactory.New(BlkioTRBpsName, CgroupBlkioDir).WithValidator(BlkioTRBpsValidator).WithCheckSupported(SupportedIfFileExistsInKubepods).WithCheckOnce(true) @@ -263,6 +265,7 @@ var ( MemoryPriority, MemoryUsePriorityOom, MemoryOomGroup, + MemoryIdlePageStats, BlkioReadIops, BlkioReadBps, BlkioWriteIops, diff --git a/pkg/koordlet/util/system/kidled_util.go b/pkg/koordlet/util/system/kidled_util.go new file mode 100644 index 000000000..c2f2e66d8 --- /dev/null +++ b/pkg/koordlet/util/system/kidled_util.go @@ -0,0 +1,206 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// ColdPageInfoByKidled is the parsed content of a memory.idle_page_stats file.
// The header fields come from the leading "# key: value" lines; every slice
// field holds one table row, with one value per idle-age bucket. The unit of
// Csei, Dsei, Cfei ... is byte.
// The detailed description can be seen in
// https://github.com/alibaba/cloud-kernel/blob/linux-next/Documentation/vm/kidled.rst
type ColdPageInfoByKidled struct {
	Version             string   `json:"version"`
	PageScans           uint64   `json:"page_scans"`
	SlabScans           uint64   `json:"slab_scans"`
	ScanPeriodInSeconds uint64   `json:"scan_period_in_seconds"`
	UseHierarchy        uint64   `json:"use_hierarchy"`
	Buckets             []uint64 `json:"buckets"`
	Csei                []uint64 `json:"csei"`
	Dsei                []uint64 `json:"dsei"`
	Cfei                []uint64 `json:"cfei"`
	Dfei                []uint64 `json:"dfei"`
	Csui                []uint64 `json:"csui"`
	Dsui                []uint64 `json:"dsui"`
	Cfui                []uint64 `json:"cfui"`
	Dfui                []uint64 `json:"dfui"`
	Csea                []uint64 `json:"csea"`
	Dsea                []uint64 `json:"dsea"`
	Cfea                []uint64 `json:"cfea"`
	Dfea                []uint64 `json:"dfea"`
	Csua                []uint64 `json:"csua"`
	Dsua                []uint64 `json:"dsua"`
	Cfua                []uint64 `json:"cfua"`
	Dfua                []uint64 `json:"dfua"`
	Slab                []uint64 `json:"slab"`
}

// KidledConfig holds the two kidled settings handled by this package.
type KidledConfig struct {
	ScanPeriodInseconds uint32
	UseHierarchy        uint8
}

// ParseMemoryIdlePageStats parses the text of a memory.idle_page_stats file.
//
// The content must consist of exactly 31 lines (one trailing newline is
// tolerated): line 1 is "# version: <string>", lines 2-5 are "# <key>: <uint>",
// line 6 is "# buckets: <comma separated uints>", lines 7-14 are ignored and
// lines 15-31 are "<row> <uint> <uint> ...". Header lines with fewer than three
// fields and empty table lines are skipped. Keys are matched against the json
// tags of ColdPageInfoByKidled; unknown keys are ignored.
//
// It returns an error when the line count is wrong, when a number cannot be
// parsed, or when a known key appears on a line of the wrong kind (for example
// a table row named "version").
func ParseMemoryIdlePageStats(content string) (*ColdPageInfoByKidled, error) {
	const (
		expectedLines = 31
		bucketsLine   = 5  // index of the "# buckets:" line
		firstRowLine  = 14 // index of the first table row
	)

	lines := strings.Split(content, "\n")
	// A file read verbatim ends with a newline, which yields one extra empty
	// element; drop it so raw file content is accepted as well.
	if n := len(lines); n == expectedLines+1 && strings.TrimSpace(lines[n-1]) == "" {
		lines = lines[:n-1]
	}
	if len(lines) != expectedLines {
		return nil, fmt.Errorf("format err")
	}

	stats := make(map[string]interface{}, expectedLines)
	for idx, line := range lines {
		fields := strings.Fields(line)
		switch {
		case idx <= bucketsLine:
			if len(fields) < 3 {
				continue
			}
			key := strings.TrimSuffix(fields[1], ":")
			switch {
			case idx == 0:
				stats[key] = fields[2]
			case idx < bucketsLine:
				val, err := strconv.ParseUint(fields[2], 10, 64)
				if err != nil {
					return nil, fmt.Errorf("parse %q at line %d: %w", key, idx+1, err)
				}
				stats[key] = val
			default:
				vals, err := parseKidledUints(strings.Split(fields[2], ","))
				if err != nil {
					return nil, fmt.Errorf("parse %q at line %d: %w", key, idx+1, err)
				}
				stats[key] = vals
			}
		case idx >= firstRowLine:
			if len(fields) < 1 {
				continue
			}
			vals, err := parseKidledUints(fields[1:])
			if err != nil {
				return nil, fmt.Errorf("parse row %q at line %d: %w", fields[0], idx+1, err)
			}
			stats[fields[0]] = vals
		}
	}

	info := &ColdPageInfoByKidled{}
	elem := reflect.ValueOf(info).Elem()
	elemType := elem.Type()
	for f := 0; f < elem.NumField(); f++ {
		tag := elemType.Field(f).Tag.Get("json")
		raw, ok := stats[tag]
		if !ok {
			continue
		}
		val := reflect.ValueOf(raw)
		// Set panics on a type mismatch, so reject misplaced keys explicitly.
		if val.Type() != elemType.Field(f).Type {
			return nil, fmt.Errorf("format err: unexpected value for %q", tag)
		}
		elem.Field(f).Set(val)
	}
	return info, nil
}

// parseKidledUints converts every token to a uint64. The result is non-nil
// even for an empty token list.
func parseKidledUints(tokens []string) ([]uint64, error) {
	vals := make([]uint64, len(tokens))
	for k, tok := range tokens {
		v, err := strconv.ParseUint(tok, 10, 64)
		if err != nil {
			return nil, err
		}
		vals[k] = v
	}
	return vals, nil
}

// GetColdPageTotalBytes returns the sum of every bucket of every table row
// (Csei ... Dfua and Slab). Buckets and the scalar header fields are not part
// of the sum.
func (i *ColdPageInfoByKidled) GetColdPageTotalBytes() uint64 {
	var total uint64
	for _, row := range [][]uint64{
		i.Csei, i.Dsei, i.Cfei, i.Dfei,
		i.Csui, i.Dsui, i.Cfui, i.Dfui,
		i.Csea, i.Dsea, i.Cfea, i.Dfea,
		i.Csua, i.Dsua, i.Cfua, i.Dfua,
		i.Slab,
	} {
		for _, v := range row {
			total += v
		}
	}
	return total
}
GetIsSupportColdMemory() bool { + return isSupportColdMemory.Load() +} + +func SetIsSupportColdMemory(flag bool) { + isSupportColdMemory.Store(flag) +} + +func GetIsStartColdMemory() bool { + return isStartColdMemory.Load() +} + +func SetIsStartColdMemory(flag bool) { + isStartColdMemory.Store(flag) +} + +func SetKidledScanPeriodInSeconds(period uint32) error { + path := KidledScanPeriodInSeconds.Path("") + file, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return err + } + defer file.Close() + write := bufio.NewWriter(file) + write.WriteString(fmt.Sprintf("%d", period)) + write.Flush() + return nil +} +func SetKidledUseHierarchy(useHierarchy uint8) error { + path := KidledUseHierarchy.Path("") + file, err := os.OpenFile(path, os.O_WRONLY|os.O_TRUNC, 0644) + if err != nil { + return err + } + defer file.Close() + write := bufio.NewWriter(file) + write.WriteString(fmt.Sprintf("%d", useHierarchy)) + write.Flush() + return nil +} +func NewDefaultKidledConfig() *KidledConfig { + return &KidledConfig{ + ScanPeriodInseconds: 5, + UseHierarchy: 1, + } +} diff --git a/pkg/koordlet/util/system/kidled_util_test.go b/pkg/koordlet/util/system/kidled_util_test.go new file mode 100644 index 000000000..960b80f10 --- /dev/null +++ b/pkg/koordlet/util/system/kidled_util_test.go @@ -0,0 +1,236 @@ +/* +Copyright 2022 The Koordinator Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package system + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_IsKidledSupport(t *testing.T) { + helper := NewFileTestUtil(t) + defer helper.Cleanup() + type fields struct { + SetSysUtil func(helper *FileTestUtil) + } + tests := []struct { + name string + fields fields + want bool + }{ + { + name: "os doesn't support kidled cold page info", + want: false, + }, + { + name: "os support kidled cold page info", + fields: fields{ + SetSysUtil: func(helper *FileTestUtil) { + helper.CreateFile(filepath.Join(GetSysRootDir(), KidledRelativePath, KidledScanPeriodInSecondsFileName)) + helper.CreateFile(filepath.Join(GetSysRootDir(), KidledRelativePath, KidledUseHierarchyFileFileName)) + }, + }, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + helper := NewFileTestUtil(t) + defer helper.Cleanup() + if tt.fields.SetSysUtil != nil { + tt.fields.SetSysUtil(helper) + } + got := IsKidledSupport() + assert.Equal(t, tt.want, got) + }) + } +} + +func Test_ParseMemoryIdlePageStats(t *testing.T) { + idleInfoContentStr := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 
0 0 + slab 0 0 0 0 0 0 0 0` + invalidIdleInfoContentStr1 := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0` + type args struct { + content string + } + tests := []struct { + name string + args args + want *ColdPageInfoByKidled + wantErr bool + }{ + { + name: "read illegal idle empty stat1", + args: args{content: ""}, + want: nil, + wantErr: true, + }, + { + name: "read illegal idle empty stat2", + args: args{content: invalidIdleInfoContentStr1}, + want: nil, + wantErr: true, + }, + { + name: "read test idle stat path", + args: args{content: idleInfoContentStr}, + want: &ColdPageInfoByKidled{ + Version: "1.0", PageScans: 24, SlabScans: 0, ScanPeriodInSeconds: 120, UseHierarchy: 1, Buckets: []uint64{1, 2, 5, 15, 30, 60, 120, 240}, + Csei: []uint64{2613248, 4657152, 18182144, 293683200, 0, 0, 0, 0}, Dsei: []uint64{2568192, 5140480, 15306752, 48648192, 0, 0, 0, 0}, Cfei: []uint64{2633728, 4640768, 66531328, 340172800, 0, 0, 0, 0}, + Dfei: []uint64{0, 0, 4096, 0, 0, 0, 0, 0}, Csui: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, Dsui: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + Cfui: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, Dfui: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, Csea: []uint64{765952, 1044480, 3784704, 52834304, 0, 0, 0, 0}, + Dsea: []uint64{286720, 270336, 1564672, 5390336, 0, 0, 0, 0}, Cfea: []uint64{9273344, 16609280, 152109056, 315121664, 0, 0, 0, 0}, Dfea: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + Csua: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, Dsua: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, Cfua: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + Dfua: []uint64{0, 0, 
0, 0, 0, 0, 0, 0}, Slab: []uint64{0, 0, 0, 0, 0, 0, 0, 0}, + }, + wantErr: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, gotErr := ParseMemoryIdlePageStats(tt.args.content) + assert.Equal(t, tt.wantErr, gotErr != nil) + assert.Equal(t, tt.want, got) + }) + } + +} + +func Test_GetColdPageTotalBytes(t *testing.T) { + coldPageInfoContentStr := `# version: 1.0 + # page_scans: 24 + # slab_scans: 0 + # scan_period_in_seconds: 120 + # use_hierarchy: 1 + # buckets: 1,2,5,15,30,60,120,240 + # + # _-----=> clean/dirty + # / _----=> swap/file + # | / _---=> evict/unevict + # || / _--=> inactive/active + # ||| / _-=> slab + # |||| / + # ||||| [1,2) [2,5) [5,15) [15,30) [30,60) [60,120) [120,240) [240,+inf) + csei 2613248 4657152 18182144 293683200 0 0 0 0 + dsei 2568192 5140480 15306752 48648192 0 0 0 0 + cfei 2633728 4640768 66531328 340172800 0 0 0 0 + dfei 0 0 4096 0 0 0 0 0 + csui 0 0 0 0 0 0 0 0 + dsui 0 0 0 0 0 0 0 0 + cfui 0 0 0 0 0 0 0 0 + dfui 0 0 0 0 0 0 0 0 + csea 765952 1044480 3784704 52834304 0 0 0 0 + dsea 286720 270336 1564672 5390336 0 0 0 0 + cfea 9273344 16609280 152109056 315121664 0 0 0 0 + dfea 0 0 0 0 0 0 0 0 + csua 0 0 0 0 0 0 0 0 + dsua 0 0 0 0 0 0 0 0 + cfua 0 0 0 0 0 0 0 0 + dfua 0 0 0 0 0 0 0 0 + slab 0 0 0 0 0 0 0 0` + coldPageInfo, err := ParseMemoryIdlePageStats(coldPageInfoContentStr) + assert.NoError(t, err) + assert.NotNil(t, coldPageInfo) + got := coldPageInfo.GetColdPageTotalBytes() + assert.Equal(t, uint64(1363836928), got) +} + +func Test_SetKidledScanPeriodInSeconds(t *testing.T) { + helper := NewFileTestUtil(t) + defer helper.Cleanup() + path := KidledScanPeriodInSeconds.Path("") + helper.CreateFile(path) + SetKidledScanPeriodInSeconds(120) + s := helper.ReadFileContents(path) + assert.Equal(t, "120", s) + +} +func Test_SetKidledUseHierarchy(t *testing.T) { + helper := NewFileTestUtil(t) + defer helper.Cleanup() + path := KidledUseHierarchy.Path("") + helper.CreateFile(path) + 
SetKidledUseHierarchy(1) + s := helper.ReadFileContents(path) + assert.Equal(t, "1", s) +} + +func Test_GetIsSupportColdMemory(t *testing.T) { + SetIsSupportColdMemory(false) + assert.Equal(t, false, GetIsSupportColdMemory()) +} + +func Test_GetIsStartColdMemory(t *testing.T) { + SetIsStartColdMemory(false) + assert.Equal(t, false, GetIsStartColdMemory()) +} + +func Test_NewDefaultKidledConfig(t *testing.T) { + config := NewDefaultKidledConfig() + assert.Equal(t, uint32(5), config.ScanPeriodInseconds) + assert.Equal(t, uint8(1), config.UseHierarchy) +} diff --git a/pkg/koordlet/util/system/system_resource.go b/pkg/koordlet/util/system/system_resource.go index c56315920..59e8972a7 100644 --- a/pkg/koordlet/util/system/system_resource.go +++ b/pkg/koordlet/util/system/system_resource.go @@ -17,6 +17,7 @@ limitations under the License. package system import ( + "math" "path" "k8s.io/utils/pointer" @@ -25,22 +26,29 @@ import ( const ( ProcSysVmRelativePath = "sys/vm/" MemcgReaperRelativePath = "kernel/mm/memcg_reaper/" + KidledRelativePath = "kernel/mm/kidled/" - MinFreeKbytesFileName = "min_free_kbytes" - WatermarkScaleFactorFileName = "watermark_scale_factor" - MemcgReapBackGroundFileName = "reap_background" + MinFreeKbytesFileName = "min_free_kbytes" + WatermarkScaleFactorFileName = "watermark_scale_factor" + MemcgReapBackGroundFileName = "reap_background" + KidledScanPeriodInSecondsFileName = "scan_period_in_seconds" + KidledUseHierarchyFileFileName = "use_hierarchy" ) var ( - MinFreeKbytesValidator = &RangeValidator{min: 10 * 1024, max: 10 * 1024 * 1024} - WatermarkScaleFactorValidator = &RangeValidator{min: 10, max: 400} - MemcgReapBackGroundValidator = &RangeValidator{min: 0, max: 1} + MinFreeKbytesValidator = &RangeValidator{min: 10 * 1024, max: 10 * 1024 * 1024} + WatermarkScaleFactorValidator = &RangeValidator{min: 10, max: 400} + MemcgReapBackGroundValidator = &RangeValidator{min: 0, max: 1} + KidledScanPeriodInSecondsValidator = &RangeValidator{min: 0, max: 
math.MaxInt64} + KidledUseHierarchyValidator = &RangeValidator{min: 0, max: 1} ) var ( - MinFreeKbytes = NewCommonSystemResource(ProcSysVmRelativePath, MinFreeKbytesFileName, GetProcRootDir).WithValidator(MinFreeKbytesValidator) - WatermarkScaleFactor = NewCommonSystemResource(ProcSysVmRelativePath, WatermarkScaleFactorFileName, GetProcRootDir).WithValidator(WatermarkScaleFactorValidator) - MemcgReapBackGround = NewCommonSystemResource(MemcgReaperRelativePath, MemcgReapBackGroundFileName, GetSysRootDir).WithValidator(MemcgReapBackGroundValidator).WithCheckSupported(SupportedIfFileExists) + MinFreeKbytes = NewCommonSystemResource(ProcSysVmRelativePath, MinFreeKbytesFileName, GetProcRootDir).WithValidator(MinFreeKbytesValidator) + WatermarkScaleFactor = NewCommonSystemResource(ProcSysVmRelativePath, WatermarkScaleFactorFileName, GetProcRootDir).WithValidator(WatermarkScaleFactorValidator) + MemcgReapBackGround = NewCommonSystemResource(MemcgReaperRelativePath, MemcgReapBackGroundFileName, GetSysRootDir).WithValidator(MemcgReapBackGroundValidator).WithCheckSupported(SupportedIfFileExists) + KidledScanPeriodInSeconds = NewCommonSystemResource(KidledRelativePath, KidledScanPeriodInSecondsFileName, GetSysRootDir).WithValidator(KidledScanPeriodInSecondsValidator).WithCheckSupported(SupportedIfFileExists) + KidledUseHierarchy = NewCommonSystemResource(KidledRelativePath, KidledUseHierarchyFileFileName, GetSysRootDir).WithValidator(KidledUseHierarchyValidator).WithCheckSupported(SupportedIfFileExists) ) var _ Resource = &SystemResource{} diff --git a/pkg/koordlet/util/system/system_resource_test.go b/pkg/koordlet/util/system/system_resource_test.go index e8582058a..a2127b955 100644 --- a/pkg/koordlet/util/system/system_resource_test.go +++ b/pkg/koordlet/util/system/system_resource_test.go @@ -117,6 +117,58 @@ func TestSystemResource(t *testing.T) { wantValid: true, wantResourceType: MemcgReapBackGroundFileName, }, + { + name: "kidledScanPeriodInSeconds resource valid and 
support", + fields: fields{ + resource: NewCommonSystemResource(KidledRelativePath, KidledScanPeriodInSecondsFileName, GetSysRootDir).WithValidator(KidledScanPeriodInSecondsValidator).WithCheckSupported(SupportedIfFileExists), + createdFile: true, + initValue: 0, + newValue: 120, + }, + wantPath: path.Join(GetSysRootDir(), KidledRelativePath, KidledScanPeriodInSecondsFileName), + wantSupported: true, + wantValid: true, + wantResourceType: KidledScanPeriodInSecondsFileName, + }, + { + name: "kidledScanPeriodInSeconds resource invalid and not support", + fields: fields{ + resource: NewCommonSystemResource(KidledRelativePath, KidledScanPeriodInSecondsFileName, GetSysRootDir).WithValidator(KidledScanPeriodInSecondsValidator).WithCheckSupported(SupportedIfFileExists), + createdFile: false, + initValue: 0, + newValue: -1, + }, + wantPath: path.Join(GetSysRootDir(), KidledRelativePath, KidledScanPeriodInSecondsFileName), + wantSupported: false, + wantValid: false, + wantResourceType: KidledScanPeriodInSecondsFileName, + }, + { + name: "kidledUseHierarchy resource valid and supported", + fields: fields{ + resource: NewCommonSystemResource(KidledRelativePath, KidledUseHierarchyFileFileName, GetSysRootDir).WithValidator(KidledUseHierarchyValidator).WithCheckSupported(SupportedIfFileExists), + createdFile: true, + initValue: 0, + newValue: 1, + }, + wantPath: path.Join(GetSysRootDir(), KidledRelativePath, KidledUseHierarchyFileFileName), + wantSupported: true, + wantValid: true, + wantResourceType: KidledUseHierarchyFileFileName, + }, + { + name: "kidledUseHierarchy resource invalid and not support", + fields: fields{ + resource: NewCommonSystemResource(KidledRelativePath, KidledUseHierarchyFileFileName, GetSysRootDir).WithValidator(KidledUseHierarchyValidator).WithCheckSupported(SupportedIfFileExists), + createdFile: false, + initValue: 0, + newValue: -1, + }, + wantPath: path.Join(GetSysRootDir(), KidledRelativePath, KidledUseHierarchyFileFileName), + wantSupported: false, 
+ wantValid: false, + wantResourceType: KidledUseHierarchyFileFileName, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) {