diff --git a/dlrover/python/master/resource/job.py b/dlrover/python/master/resource/job.py index 890ca7572..67a87e286 100644 --- a/dlrover/python/master/resource/job.py +++ b/dlrover/python/master/resource/job.py @@ -351,11 +351,11 @@ def _adjust_oom_ps_resource(self, node: Node): plan = self._resource_optimizer.generate_oom_recovery_plan( [node], JobOptStage.PS_INITIAL ) - if plan and not plan.empty(): - ps = plan.node_group_resources[NodeType.PS] + if plan and not plan.empty() and node.name in plan.node_resources: + resource = plan.node_resources[node.name] self._ps_resource.node_resource.memory = max( self._ps_resource.node_resource.memory, - ps.node_resource.memory, + resource.memory, ) cur_mem = node.config_resource.memory cur_mem *= NodeResourceLimit.INCREMENTAL_MEMORY_FACTOR diff --git a/dlrover/python/tests/test_resource_optimizer.py b/dlrover/python/tests/test_resource_optimizer.py index 12906c240..9949fe0c3 100644 --- a/dlrover/python/tests/test_resource_optimizer.py +++ b/dlrover/python/tests/test_resource_optimizer.py @@ -43,9 +43,8 @@ class MockStub(object): def optimize(self, request): res = brain_pb2.OptimizeResponse() res.job_optimize_plans.add() - group_resources = res.job_optimize_plans[ - 0 - ].resource.task_group_resources + plan = res.job_optimize_plans[0] + group_resources = plan.resource.task_group_resources group_resources[NodeType.WORKER].count = 5 group_resources[NodeType.WORKER].resource.memory = ( _MEMORY * MemoryUnit.MB @@ -55,6 +54,7 @@ def optimize(self, request): group_resources[NodeType.PS].count = 2 group_resources[NodeType.PS].resource.memory = _MEMORY * MemoryUnit.MB group_resources[NodeType.PS].resource.cpu = 16 + plan.resource.pod_resources["ps-0"].memory = _MEMORY * MemoryUnit.MB return res