From da447cdaba2335b7e04d76f5e0edbee4dba60735 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 11:58:26 +0100 Subject: [PATCH 01/16] only tag node if needed --- .../src/simcore_service_autoscaling/utils/utils_docker.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py index 65caa0f40b1..4c5b5e6f79f 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py +++ b/services/autoscaling/src/simcore_service_autoscaling/utils/utils_docker.py @@ -521,8 +521,14 @@ async def tag_node( tags: dict[DockerLabelKey, str], available: bool, ) -> Node: + assert node.spec # nosec + if (node.spec.labels == tags) and ( + (node.spec.availability is Availability.active) == available + ): + # nothing to do + return node with log_context( - logger, logging.DEBUG, msg=f"tagging {node.id=} with {tags=} and {available=}" + logger, logging.DEBUG, msg=f"tag {node.id=} with {tags=} and {available=}" ): assert node.id # nosec From 190d764a4969a6e8b8a04b45f1627b8cdd832898 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 14:45:37 +0100 Subject: [PATCH 02/16] space --- .../autoscaling/src/simcore_service_autoscaling/modules/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 4c5ee00f86c..d57508babf8 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -273,7 +273,7 @@ def _list_processing_tasks_on_worker( async with _scheduler_client(scheduler_url, authentication) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) - _logger.debug("looking for processing tasksfor %s", f"{worker_url=}") + _logger.debug("looking for processing tasks for %s", f"{worker_url=}") # now get the used resources worker_processing_tasks: list[ From faeb390dab3b18b3b5084f3f7b14d2b8dc7fda7e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:52:33 +0100 Subject: [PATCH 03/16] moved fixture down --- services/autoscaling/tests/unit/conftest.py | 108 +++++++++++++++++- .../unit/test_modules_auto_scaling_dynamic.py | 13 ++- .../unit/test_modules_buffer_machine_core.py | 97 +--------------- 3 files changed, 119 insertions(+), 99 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 4a48f2776b6..d4e58d5bf96 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -28,11 +28,16 @@ EC2InstanceType, Resources, ) +from common_library.json_serialization import json_dumps from deepdiff import DeepDiff from faker import Faker from fakeredis.aioredis import FakeRedis from fastapi import FastAPI -from models_library.docker import DockerLabelKey, StandardSimcoreDockerLabels +from models_library.docker import ( + DockerGenericTag, + DockerLabelKey, + StandardSimcoreDockerLabels, +) from models_library.generated_models.docker_rest_api import Availability from models_library.generated_models.docker_rest_api import Node as DockerNode from models_library.generated_models.docker_rest_api import ( @@ -57,6 +62,7 @@ ) from settings_library.rabbit import RabbitSettings from settings_library.ssm import SSMSettings +from simcore_service_autoscaling.constants import PRE_PULLED_IMAGES_EC2_TAG_KEY from simcore_service_autoscaling.core.application import create_app from simcore_service_autoscaling.core.settings import ( AUTOSCALING_ENV_PREFIX, @@ -71,8 +77,14 @@ DaskTaskResources, ) from simcore_service_autoscaling.modules import auto_scaling_core +from simcore_service_autoscaling.modules.auto_scaling_mode_dynamic import ( + DynamicAutoscaling, +) from simcore_service_autoscaling.modules.docker import AutoscalingDocker from simcore_service_autoscaling.modules.ec2 import SimcoreEC2API +from simcore_service_autoscaling.utils.buffer_machines_pool_core import ( + get_deactivated_buffer_ec2_tags, +) from simcore_service_autoscaling.utils.utils_docker import ( _OSPARC_SERVICE_READY_LABEL_KEY, _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY, @@ -81,7 +93,9 @@ from tenacity.retry import retry_if_exception_type from tenacity.stop import stop_after_delay from tenacity.wait import wait_fixed -from types_aiobotocore_ec2.literals import InstanceTypeType +from types_aiobotocore_ec2 import EC2Client +from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType +from types_aiobotocore_ec2.type_defs import TagTypeDef pytest_plugins = [ "pytest_simcore.aws_server", @@ -1042,3 +1056,93 @@ async def _( autospec=True, side_effect=_, ) + + +@pytest.fixture +async def create_buffer_machines( + ec2_client: EC2Client, + aws_ami_id: str, + app_settings: ApplicationSettings, + initialized_app: FastAPI, +) -> Callable[ + [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag]], + Awaitable[list[str]], +]: + async def _do( + num: int, + instance_type: InstanceTypeType, + instance_state_name: InstanceStateNameType, + pre_pull_images: list[DockerGenericTag], + ) -> list[str]: + assert app_settings.AUTOSCALING_EC2_INSTANCES + + assert instance_state_name in [ + "running", + "stopped", + ], "only 'running' and 'stopped' are supported for testing" + + resource_tags: list[TagTypeDef] = [ + {"Key": tag_key, "Value": tag_value} + for tag_key, tag_value in get_deactivated_buffer_ec2_tags( + initialized_app, DynamicAutoscaling() + ).items() + ] + if pre_pull_images is not None and instance_state_name == "stopped": + resource_tags.append( + { + "Key": PRE_PULLED_IMAGES_EC2_TAG_KEY, + "Value": f"{json_dumps(pre_pull_images)}", + } + ) + with log_context( + logging.INFO, f"creating {num} buffer machines of {instance_type}" + ): + instances = await ec2_client.run_instances( + ImageId=aws_ami_id, + MaxCount=num, + MinCount=num, + InstanceType=instance_type, + KeyName=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME, + SecurityGroupIds=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SECURITY_GROUP_IDS, + SubnetId=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SUBNET_ID, + IamInstanceProfile={ + "Arn": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ATTACHED_IAM_PROFILE + }, + TagSpecifications=[ + {"ResourceType": "instance", "Tags": resource_tags}, + {"ResourceType": "volume", "Tags": resource_tags}, + {"ResourceType": "network-interface", "Tags": resource_tags}, + ], + UserData="echo 'I am pytest'", + ) + instance_ids = [ + i["InstanceId"] for i in instances["Instances"] if "InstanceId" in i + ] + + waiter = ec2_client.get_waiter("instance_exists") + await waiter.wait(InstanceIds=instance_ids) + instances = await ec2_client.describe_instances(InstanceIds=instance_ids) + assert "Reservations" in instances + assert instances["Reservations"] + assert "Instances" in instances["Reservations"][0] + assert len(instances["Reservations"][0]["Instances"]) == num + for instance in instances["Reservations"][0]["Instances"]: + assert "State" in instance + assert "Name" in instance["State"] + assert instance["State"]["Name"] == "running" + + if instance_state_name == "stopped": + await ec2_client.stop_instances(InstanceIds=instance_ids) + instances = await ec2_client.describe_instances(InstanceIds=instance_ids) + assert "Reservations" in instances + assert instances["Reservations"] + assert "Instances" in instances["Reservations"][0] + assert len(instances["Reservations"][0]["Instances"]) == num + for instance in instances["Reservations"][0]["Instances"]: + assert "State" in instance + assert "Name" in instance["State"] + assert instance["State"]["Name"] == "stopped" + + return instance_ids + + return _do diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index ccdb2461c04..51be00eeea5 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -24,6 +24,7 @@ from fastapi import FastAPI from models_library.docker import ( DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY, + DockerGenericTag, DockerLabelKey, StandardSimcoreDockerLabels, ) @@ -68,7 +69,7 @@ _OSPARC_SERVICES_READY_DATETIME_LABEL_KEY, ) from types_aiobotocore_ec2.client import EC2Client -from types_aiobotocore_ec2.literals import InstanceTypeType +from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType from types_aiobotocore_ec2.type_defs import FilterTypeDef, InstanceTypeDef @@ -1790,3 +1791,13 @@ async def test__activate_drained_nodes_with_drained_node( }, available=True, ) + + +async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( + minimal_configuration: None, + create_buffer_machines: Callable[ + [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag]], + Awaitable[list[str]], + ], +): + ... diff --git a/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py b/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py index 24a552f342b..6c03e44b275 100644 --- a/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py +++ b/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py @@ -17,7 +17,6 @@ import pytest import tenacity from aws_library.ec2 import AWSTagKey, EC2InstanceBootSpecific -from common_library.json_serialization import json_dumps from faker import Faker from fastapi import FastAPI from fastapi.encoders import jsonable_encoder @@ -30,19 +29,15 @@ from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict from simcore_service_autoscaling.constants import PRE_PULLED_IMAGES_EC2_TAG_KEY -from simcore_service_autoscaling.core.settings import ApplicationSettings from simcore_service_autoscaling.modules.auto_scaling_mode_dynamic import ( DynamicAutoscaling, ) from simcore_service_autoscaling.modules.buffer_machines_pool_core import ( monitor_buffer_machines, ) -from simcore_service_autoscaling.utils.buffer_machines_pool_core import ( - get_deactivated_buffer_ec2_tags, -) from types_aiobotocore_ec2 import EC2Client from types_aiobotocore_ec2.literals import InstanceStateNameType, InstanceTypeType -from types_aiobotocore_ec2.type_defs import FilterTypeDef, TagTypeDef +from types_aiobotocore_ec2.type_defs import FilterTypeDef @pytest.fixture @@ -345,96 +340,6 @@ async def test_monitor_buffer_machines( ) -@pytest.fixture -async def create_buffer_machines( - ec2_client: EC2Client, - aws_ami_id: str, - app_settings: ApplicationSettings, - initialized_app: FastAPI, -) -> Callable[ - [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag]], - Awaitable[list[str]], -]: - async def _do( - num: int, - instance_type: InstanceTypeType, - instance_state_name: InstanceStateNameType, - pre_pull_images: list[DockerGenericTag], - ) -> list[str]: - assert app_settings.AUTOSCALING_EC2_INSTANCES - - assert instance_state_name in [ - "running", - "stopped", - ], "only 'running' and 'stopped' are supported for testing" - - resource_tags: list[TagTypeDef] = [ - {"Key": tag_key, "Value": tag_value} - for tag_key, tag_value in get_deactivated_buffer_ec2_tags( - initialized_app, DynamicAutoscaling() - ).items() - ] - if pre_pull_images is not None and instance_state_name == "stopped": - resource_tags.append( - { - "Key": PRE_PULLED_IMAGES_EC2_TAG_KEY, - "Value": f"{json_dumps(pre_pull_images)}", - } - ) - with log_context( - logging.INFO, f"creating {num} buffer machines of {instance_type}" - ): - instances = await ec2_client.run_instances( - ImageId=aws_ami_id, - MaxCount=num, - MinCount=num, - InstanceType=instance_type, - KeyName=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME, - SecurityGroupIds=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SECURITY_GROUP_IDS, - SubnetId=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SUBNET_ID, - IamInstanceProfile={ - "Arn": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ATTACHED_IAM_PROFILE - }, - TagSpecifications=[ - {"ResourceType": "instance", "Tags": resource_tags}, - {"ResourceType": "volume", "Tags": resource_tags}, - {"ResourceType": "network-interface", "Tags": resource_tags}, - ], - UserData="echo 'I am pytest'", - ) - instance_ids = [ - i["InstanceId"] for i in instances["Instances"] if "InstanceId" in i - ] - - waiter = ec2_client.get_waiter("instance_exists") - await waiter.wait(InstanceIds=instance_ids) - instances = await ec2_client.describe_instances(InstanceIds=instance_ids) - assert "Reservations" in instances - assert instances["Reservations"] - assert "Instances" in instances["Reservations"][0] - assert len(instances["Reservations"][0]["Instances"]) == num - for instance in instances["Reservations"][0]["Instances"]: - assert "State" in instance - assert "Name" in instance["State"] - assert instance["State"]["Name"] == "running" - - if instance_state_name == "stopped": - await ec2_client.stop_instances(InstanceIds=instance_ids) - instances = await ec2_client.describe_instances(InstanceIds=instance_ids) - assert "Reservations" in instances - assert instances["Reservations"] - assert "Instances" in instances["Reservations"][0] - assert len(instances["Reservations"][0]["Instances"]) == num - for instance in instances["Reservations"][0]["Instances"]: - assert "State" in instance - assert "Name" in instance["State"] - assert instance["State"]["Name"] == "stopped" - - return instance_ids - - return _do - - @dataclass class _BufferMachineParams: instance_state_name: InstanceStateNameType From 78db78f8fee6581af872383e21a244d7268f7b6e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 16:20:32 +0100 Subject: [PATCH 04/16] renaming fixture --- services/autoscaling/tests/unit/conftest.py | 22 ++++++++++++---- .../unit/test_modules_auto_scaling_dynamic.py | 25 ++++++++----------- .../unit/test_utils_auto_scaling_core.py | 10 +++----- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index d4e58d5bf96..3239761efd2 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -50,7 +50,7 @@ Service, TaskSpec, ) -from pydantic import ByteSize, PositiveInt, TypeAdapter +from pydantic import ByteSize, NonNegativeInt, PositiveInt, TypeAdapter from pytest_mock import MockType from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.host import get_localhost_ip @@ -1005,10 +1005,22 @@ def _creator( @pytest.fixture -def mock_machines_buffer(monkeypatch: pytest.MonkeyPatch) -> int: - num_machines_in_buffer = 5 - monkeypatch.setenv("EC2_INSTANCES_MACHINES_BUFFER", f"{num_machines_in_buffer}") - return num_machines_in_buffer +def num_hot_buffer() -> NonNegativeInt: + return 5 + + +@pytest.fixture +def with_instances_machines_hot_buffer( + num_hot_buffer: int, + app_environment: EnvVarsDict, + monkeypatch: pytest.MonkeyPatch, +) -> EnvVarsDict: + return app_environment | setenvs_from_dict( + monkeypatch, + { + "EC2_INSTANCES_MACHINES_BUFFER": f"{num_hot_buffer}", + }, + ) @pytest.fixture diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 51be00eeea5..e2ddc87a53c 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -308,7 +308,7 @@ async def test_cluster_scaling_with_no_services_does_nothing( async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expected_machines( patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, - mock_machines_buffer: int, + with_instances_machines_hot_buffer: EnvVarsDict, app_settings: ApplicationSettings, initialized_app: FastAPI, aws_allowed_ec2_instance_type_names_env: list[str], @@ -322,17 +322,13 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect instance_type_filters: Sequence[FilterTypeDef], ): assert app_settings.AUTOSCALING_EC2_INSTANCES - assert ( - mock_machines_buffer - == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ) await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) await assert_autoscaled_dynamic_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=mock_machines_buffer, + expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, expected_instance_type=cast( InstanceTypeType, next( @@ -347,7 +343,7 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect mock_rabbitmq_post_message, app_settings, initialized_app, - instances_pending=mock_machines_buffer, + instances_pending=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, ) mock_rabbitmq_post_message.reset_mock() # calling again should attach the new nodes to the reserve, but nothing should start @@ -357,7 +353,7 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect await assert_autoscaled_dynamic_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=mock_machines_buffer, + expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, expected_instance_type=cast( InstanceTypeType, next( @@ -376,14 +372,15 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect mock_rabbitmq_post_message, app_settings, initialized_app, - nodes_total=mock_machines_buffer, - nodes_drained=mock_machines_buffer, - instances_running=mock_machines_buffer, + nodes_total=with_instances_machines_hot_buffer, + nodes_drained=with_instances_machines_hot_buffer, + instances_running=with_instances_machines_hot_buffer, cluster_total_resources={ - "cpus": mock_machines_buffer + "cpus": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER * fake_node.description.resources.nano_cp_us / 1e9, - "ram": mock_machines_buffer * fake_node.description.resources.memory_bytes, + "ram": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + * fake_node.description.resources.memory_bytes, }, ) @@ -395,7 +392,7 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect await assert_autoscaled_dynamic_ec2_instances( ec2_client, expected_num_reservations=1, - expected_num_instances=mock_machines_buffer, + expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, expected_instance_type=cast( InstanceTypeType, next( diff --git a/services/autoscaling/tests/unit/test_utils_auto_scaling_core.py b/services/autoscaling/tests/unit/test_utils_auto_scaling_core.py index f576292ec6b..5a5a3240057 100644 --- a/services/autoscaling/tests/unit/test_utils_auto_scaling_core.py +++ b/services/autoscaling/tests/unit/test_utils_auto_scaling_core.py @@ -323,7 +323,7 @@ def test_sort_empty_drained_nodes( def test_sort_drained_nodes( - mock_machines_buffer: int, + with_instances_machines_hot_buffer: EnvVarsDict, minimal_configuration: None, app_settings: ApplicationSettings, random_fake_available_instances: list[EC2InstanceType], @@ -332,7 +332,9 @@ def test_sort_drained_nodes( ): machine_buffer_type = get_machine_buffer_type(random_fake_available_instances) _NUM_DRAINED_NODES = 20 - _NUM_NODE_WITH_TYPE_BUFFER = 3 * mock_machines_buffer + _NUM_NODE_WITH_TYPE_BUFFER = ( + 3 * app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) _NUM_NODES_TERMINATING = 13 fake_drained_nodes = [] for _ in range(_NUM_DRAINED_NODES): @@ -388,10 +390,6 @@ def test_sort_drained_nodes( app_settings, fake_drained_nodes, random_fake_available_instances ) assert app_settings.AUTOSCALING_EC2_INSTANCES - assert ( - mock_machines_buffer - == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ) assert len(sorted_drained_nodes) == ( _NUM_DRAINED_NODES + _NUM_NODE_WITH_TYPE_BUFFER From c9b5771e1f090b5a2b320485caefc85a45bb2034 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:30:55 +0100 Subject: [PATCH 05/16] moving fixtures and preparing test --- services/autoscaling/tests/unit/conftest.py | 76 ++++++++++++++++++- .../unit/test_modules_auto_scaling_dynamic.py | 42 +++++++++- .../unit/test_modules_buffer_machine_core.py | 75 +----------------- 3 files changed, 114 insertions(+), 79 deletions(-) diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index 3239761efd2..9b7489268e6 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -1070,6 +1070,78 @@ async def _( ) +@pytest.fixture +def fake_pre_pull_images() -> list[DockerGenericTag]: + return TypeAdapter(list[DockerGenericTag]).validate_python( + [ + "nginx:latest", + "itisfoundation/my-very-nice-service:latest", + "simcore/services/dynamic/another-nice-one:2.4.5", + "asd", + ] + ) + + +@pytest.fixture +def ec2_instances_allowed_types_with_only_1_buffered( + faker: Faker, + fake_pre_pull_images: list[DockerGenericTag], + external_ec2_instances_allowed_types: None | dict[str, EC2InstanceBootSpecific], +) -> dict[InstanceTypeType, EC2InstanceBootSpecific]: + if not external_ec2_instances_allowed_types: + return { + "t2.micro": EC2InstanceBootSpecific( + ami_id=faker.pystr(), + pre_pull_images=fake_pre_pull_images, + buffer_count=faker.pyint(min_value=1, max_value=10), + ) + } + + allowed_ec2_types = external_ec2_instances_allowed_types + allowed_ec2_types_with_buffer_defined = dict( + filter( + lambda instance_type_and_settings: instance_type_and_settings[ + 1 + ].buffer_count + > 0, + allowed_ec2_types.items(), + ) + ) + assert ( + allowed_ec2_types_with_buffer_defined + ), "one type with buffer is needed for the tests!" + assert ( + len(allowed_ec2_types_with_buffer_defined) == 1 + ), "more than one type with buffer is disallowed in this test!" + return { + TypeAdapter(InstanceTypeType).validate_python(k): v + for k, v in allowed_ec2_types_with_buffer_defined.items() + } + + +@pytest.fixture +def buffer_count( + ec2_instances_allowed_types_with_only_1_buffered: dict[ + InstanceTypeType, EC2InstanceBootSpecific + ], +) -> int: + def _by_buffer_count( + instance_type_and_settings: tuple[InstanceTypeType, EC2InstanceBootSpecific] + ) -> bool: + _, boot_specific = instance_type_and_settings + return boot_specific.buffer_count > 0 + + allowed_ec2_types = ec2_instances_allowed_types_with_only_1_buffered + allowed_ec2_types_with_buffer_defined = dict( + filter(_by_buffer_count, allowed_ec2_types.items()) + ) + assert allowed_ec2_types_with_buffer_defined, "you need one type with buffer" + assert ( + len(allowed_ec2_types_with_buffer_defined) == 1 + ), "more than one type with buffer is disallowed in this test!" + return next(iter(allowed_ec2_types_with_buffer_defined.values())).buffer_count + + @pytest.fixture async def create_buffer_machines( ec2_client: EC2Client, @@ -1077,14 +1149,14 @@ async def create_buffer_machines( app_settings: ApplicationSettings, initialized_app: FastAPI, ) -> Callable[ - [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag]], + [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None], Awaitable[list[str]], ]: async def _do( num: int, instance_type: InstanceTypeType, instance_state_name: InstanceStateNameType, - pre_pull_images: list[DockerGenericTag], + pre_pull_images: list[DockerGenericTag] | None, ) -> list[str]: assert app_settings.AUTOSCALING_EC2_INSTANCES diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index e2ddc87a53c..521ede32332 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -44,7 +44,10 @@ assert_cluster_state, create_fake_association, ) -from pytest_simcore.helpers.aws_ec2 import assert_autoscaled_dynamic_ec2_instances +from pytest_simcore.helpers.aws_ec2 import ( + assert_autoscaled_dynamic_ec2_instances, + assert_autoscaled_dynamic_warm_pools_ec2_instances, +) from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict from simcore_service_autoscaling.core.settings import ApplicationSettings @@ -1792,9 +1795,42 @@ async def test__activate_drained_nodes_with_drained_node( async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( minimal_configuration: None, + with_instances_machines_hot_buffer: EnvVarsDict, + ec2_client: EC2Client, + app_settings: ApplicationSettings, + ec2_instances_allowed_types_with_only_1_buffered: dict[ + InstanceTypeType, EC2InstanceBootSpecific + ], + ec2_instance_custom_tags: dict[str, str], + buffer_count: int, create_buffer_machines: Callable[ - [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag]], + [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None], Awaitable[list[str]], ], ): - ... + # pre-requisites + assert app_settings.AUTOSCALING_EC2_INSTANCES + assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER > 0 + # we have nothing running now + all_instances = await ec2_client.describe_instances() + assert not all_instances["Reservations"] + + # have a few warm buffers ready + buffer_machines = await create_buffer_machines( + buffer_count, + next(iter(list(ec2_instances_allowed_types_with_only_1_buffered))), + "stopped", + None, + ) + await assert_autoscaled_dynamic_warm_pools_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=buffer_count, + expected_instance_type=next( + iter(ec2_instances_allowed_types_with_only_1_buffered) + ), + expected_instance_state="stopped", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + expected_pre_pulled_images=None, + instance_filters=None, + ) diff --git a/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py b/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py index 6c03e44b275..26375418417 100644 --- a/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py +++ b/services/autoscaling/tests/unit/test_modules_buffer_machine_core.py @@ -16,8 +16,7 @@ import pytest import tenacity -from aws_library.ec2 import AWSTagKey, EC2InstanceBootSpecific -from faker import Faker +from aws_library.ec2 import AWSTagKey from fastapi import FastAPI from fastapi.encoders import jsonable_encoder from models_library.docker import DockerGenericTag @@ -40,55 +39,6 @@ from types_aiobotocore_ec2.type_defs import FilterTypeDef -@pytest.fixture -def fake_pre_pull_images() -> list[DockerGenericTag]: - return TypeAdapter(list[DockerGenericTag]).validate_python( - [ - "nginx:latest", - "itisfoundation/my-very-nice-service:latest", - "simcore/services/dynamic/another-nice-one:2.4.5", - "asd", - ] - ) - - -@pytest.fixture -def ec2_instances_allowed_types_with_only_1_buffered( - faker: Faker, - fake_pre_pull_images: list[DockerGenericTag], - external_ec2_instances_allowed_types: None | dict[str, EC2InstanceBootSpecific], -) -> dict[InstanceTypeType, EC2InstanceBootSpecific]: - if not external_ec2_instances_allowed_types: - return { - "t2.micro": EC2InstanceBootSpecific( - ami_id=faker.pystr(), - pre_pull_images=fake_pre_pull_images, - buffer_count=faker.pyint(min_value=1, max_value=10), - ) - } - - allowed_ec2_types = external_ec2_instances_allowed_types - allowed_ec2_types_with_buffer_defined = dict( - filter( - lambda instance_type_and_settings: instance_type_and_settings[ - 1 - ].buffer_count - > 0, - allowed_ec2_types.items(), - ) - ) - assert ( - allowed_ec2_types_with_buffer_defined - ), "one type with buffer is needed for the tests!" - assert ( - len(allowed_ec2_types_with_buffer_defined) == 1 - ), "more than one type with buffer is disallowed in this test!" - return { - TypeAdapter(InstanceTypeType).validate_python(k): v - for k, v in allowed_ec2_types_with_buffer_defined.items() - } - - @pytest.fixture def with_ec2_instance_allowed_types_env( app_environment: EnvVarsDict, @@ -557,29 +507,6 @@ async def test_monitor_buffer_machines_terminates_unneeded_pool( ) -@pytest.fixture -def buffer_count( - ec2_instances_allowed_types_with_only_1_buffered: dict[ - InstanceTypeType, EC2InstanceBootSpecific - ], -) -> int: - def _by_buffer_count( - instance_type_and_settings: tuple[InstanceTypeType, EC2InstanceBootSpecific] - ) -> bool: - _, boot_specific = instance_type_and_settings - return boot_specific.buffer_count > 0 - - allowed_ec2_types = ec2_instances_allowed_types_with_only_1_buffered - allowed_ec2_types_with_buffer_defined = dict( - filter(_by_buffer_count, allowed_ec2_types.items()) - ) - assert allowed_ec2_types_with_buffer_defined, "you need one type with buffer" - assert ( - len(allowed_ec2_types_with_buffer_defined) == 1 - ), "more than one type with buffer is disallowed in this test!" - return next(iter(allowed_ec2_types_with_buffer_defined.values())).buffer_count - - @pytest.fixture def pre_pull_images( ec2_instances_allowed_types_with_only_1_buffered: dict[InstanceTypeType, Any] From 851d7a7f391bf274f98bc5c9bb2f0148efd10216 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:35:49 +0100 Subject: [PATCH 06/16] ongoing --- .../unit/test_modules_auto_scaling_dynamic.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 521ede32332..0669389db6a 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1797,6 +1797,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( minimal_configuration: None, with_instances_machines_hot_buffer: EnvVarsDict, ec2_client: EC2Client, + initialized_app: FastAPI, app_settings: ApplicationSettings, ec2_instances_allowed_types_with_only_1_buffered: dict[ InstanceTypeType, EC2InstanceBootSpecific @@ -1807,6 +1808,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( [int, InstanceTypeType, InstanceStateNameType, list[DockerGenericTag] | None], Awaitable[list[str]], ], + spied_cluster_analysis: MockType, ): # pre-requisites assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -1834,3 +1836,16 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( expected_pre_pulled_images=None, instance_filters=None, ) + + # let's autoscale, this should move the warm buffers to hot buffers + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=0, + ) + assert not analyzed_cluster.active_nodes + assert analyzed_cluster.buffer_ec2s + assert len(analyzed_cluster.buffer_ec2s) == len(buffer_machines) From b7026fdc05ee832238888b166d1eb452aa9341cc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 18:51:15 +0100 Subject: [PATCH 07/16] test ready --- .../unit/test_modules_auto_scaling_dynamic.py | 60 ++++++++++++++++--- 1 file changed, 53 insertions(+), 7 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 0669389db6a..2656c3a20b4 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1794,14 +1794,12 @@ async def test__activate_drained_nodes_with_drained_node( async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( + patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, with_instances_machines_hot_buffer: EnvVarsDict, ec2_client: EC2Client, initialized_app: FastAPI, app_settings: ApplicationSettings, - ec2_instances_allowed_types_with_only_1_buffered: dict[ - InstanceTypeType, EC2InstanceBootSpecific - ], ec2_instance_custom_tags: dict[str, str], buffer_count: int, create_buffer_machines: Callable[ @@ -1809,18 +1807,25 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( Awaitable[list[str]], ], spied_cluster_analysis: MockType, + instance_type_filters: Sequence[FilterTypeDef], ): # pre-requisites assert app_settings.AUTOSCALING_EC2_INSTANCES assert app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER > 0 + # we have nothing running now all_instances = await ec2_client.describe_instances() assert not all_instances["Reservations"] - # have a few warm buffers ready + # have a few warm buffers ready with the same type as the hot buffer machines buffer_machines = await create_buffer_machines( buffer_count, - next(iter(list(ec2_instances_allowed_types_with_only_1_buffered))), + cast( + InstanceTypeType, + next( + iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) + ), + ), "stopped", None, ) @@ -1828,8 +1833,11 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( ec2_client, expected_num_reservations=1, expected_num_instances=buffer_count, - expected_instance_type=next( - iter(ec2_instances_allowed_types_with_only_1_buffered) + expected_instance_type=cast( + InstanceTypeType, + next( + iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) + ), ), expected_instance_state="stopped", expected_additional_tag_keys=list(ec2_instance_custom_tags), @@ -1841,6 +1849,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + # at analysis time, we had no machines running analyzed_cluster = assert_cluster_state( spied_cluster_analysis, expected_calls=1, @@ -1849,3 +1858,40 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( assert not analyzed_cluster.active_nodes assert analyzed_cluster.buffer_ec2s assert len(analyzed_cluster.buffer_ec2s) == len(buffer_machines) + + # now we should have a warm buffer moved to the hot buffer + await assert_autoscaled_dynamic_ec2_instances( + ec2_client, + expected_num_reservations=1, + expected_num_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + expected_instance_type=cast( + InstanceTypeType, + next( + iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) + ), + ), + expected_instance_state="running", + expected_additional_tag_keys=list(ec2_instance_custom_tags), + instance_filters=instance_type_filters, + ) + + # let's autoscale again, to check the cluster analysis + await auto_scale_cluster( + app=initialized_app, auto_scaling_mode=DynamicAutoscaling() + ) + # at analysis time, we had no machines running + analyzed_cluster = assert_cluster_state( + spied_cluster_analysis, + expected_calls=1, + expected_num_machines=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + ) + assert not analyzed_cluster.active_nodes + assert len(analyzed_cluster.buffer_ec2s) == max( + 0, + buffer_count + - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + ) + assert ( + len(analyzed_cluster.buffer_drained_nodes) + == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) From e09f5050a47a8ea598410af54287e25f47fc5f95 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 12 Dec 2024 22:44:16 +0100 Subject: [PATCH 08/16] ongoing --- .../modules/auto_scaling_core.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index e2212195aed..69f9b73ce03 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -418,15 +418,26 @@ async def _activate_drained_nodes( ) -async def _start_buffer_instances( +async def _start_warm_buffer_instances( app: FastAPI, cluster: Cluster, auto_scaling_mode: BaseAutoscaling ) -> Cluster: + """starts warm buffer if there are assigned tasks, or if a hot buffer of the same type is needed""" + + app_settings = get_application_settings(app) + needed_hot_buffers = ( + app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) + hot_buffer_instance_type = next( + iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) + ) + current_hot_buffers = cluster.buffer_drained_nodes + instances_to_start = [ i.ec2_instance for i in cluster.buffer_ec2s if i.assigned_tasks ] if not instances_to_start: return cluster - # change the buffer machine to an active one + with log_context( _logger, logging.INFO, f"start {len(instances_to_start)} buffer machines" ): @@ -1187,8 +1198,8 @@ async def _autoscale_cluster( # 2. activate available drained nodes to cover some of the tasks cluster = await _activate_drained_nodes(app, cluster, auto_scaling_mode) - # 3. start buffer instances to cover the remaining tasks - cluster = await _start_buffer_instances(app, cluster, auto_scaling_mode) + # 3. start warm buffer instances to cover the remaining tasks + cluster = await _start_warm_buffer_instances(app, cluster, auto_scaling_mode) # 4. scale down unused instances cluster = await _scale_down_unused_cluster_instances( From 9677c6e8a857d3b2ef87c940e630431bf7ee7637 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:36:14 +0100 Subject: [PATCH 09/16] better assert --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 2656c3a20b4..6ba134907be 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1890,6 +1890,10 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( 0, buffer_count - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + ), ( + "the warm buffers were not used as expected there should be" + f" {buffer_count - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER} remaining, " + f"found {len(analyzed_cluster.buffer_ec2s)}" ) assert ( len(analyzed_cluster.buffer_drained_nodes) From 252e6741e0dcee34ce7b7367aac0e121d2f1fa72 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 09:44:20 +0100 Subject: [PATCH 10/16] check tagging --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 6ba134907be..a369296e07a 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -1808,6 +1808,8 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( ], spied_cluster_analysis: MockType, instance_type_filters: Sequence[FilterTypeDef], + mock_find_node_with_name_returns_fake_node: mock.Mock, + mock_docker_tag_node: mock.Mock, ): # pre-requisites assert app_settings.AUTOSCALING_EC2_INSTANCES @@ -1849,6 +1851,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + mock_docker_tag_node.assert_not_called() # at analysis time, we had no machines running analyzed_cluster = assert_cluster_state( spied_cluster_analysis, @@ -1875,10 +1878,15 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( instance_filters=instance_type_filters, ) - # let's autoscale again, to check the cluster analysis + # let's autoscale again, to check the cluster analysis and tag the nodes await auto_scale_cluster( app=initialized_app, auto_scaling_mode=DynamicAutoscaling() ) + mock_docker_tag_node.assert_called() + assert ( + mock_docker_tag_node.call_count + == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ) # at analysis time, we had no machines running analyzed_cluster = assert_cluster_state( spied_cluster_analysis, From b173f1b0b9508f714bd7a88f07be213341838500 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 12:18:50 +0100 Subject: [PATCH 11/16] test passes --- .../src/pytest_simcore/helpers/aws_ec2.py | 5 ++- .../modules/auto_scaling_core.py | 31 ++++++++++++++----- .../unit/test_modules_auto_scaling_dynamic.py | 10 ++++-- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/helpers/aws_ec2.py b/packages/pytest-simcore/src/pytest_simcore/helpers/aws_ec2.py index 1e992f4ee45..7bb826149fe 100644 --- a/packages/pytest-simcore/src/pytest_simcore/helpers/aws_ec2.py +++ b/packages/pytest-simcore/src/pytest_simcore/helpers/aws_ec2.py @@ -42,7 +42,10 @@ async def assert_autoscaled_dynamic_ec2_instances( expected_instance_state: InstanceStateNameType, expected_additional_tag_keys: list[str], instance_filters: Sequence[FilterTypeDef] | None, + expected_user_data: list[str] | None = None, ) -> list[InstanceTypeDef]: + if expected_user_data is None: + expected_user_data = ["docker swarm join"] return await assert_ec2_instances( ec2_client, expected_num_reservations=expected_num_reservations, @@ -54,7 +57,7 @@ async def assert_autoscaled_dynamic_ec2_instances( "io.simcore.autoscaling.monitored_services_labels", *expected_additional_tag_keys, ], - expected_user_data=["docker swarm join"], + expected_user_data=expected_user_data, instance_filters=instance_filters, ) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 69f9b73ce03..2100b9b67d2 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -424,17 +424,34 @@ async def _start_warm_buffer_instances( """starts warm buffer if there are assigned tasks, or if a hot buffer of the same type is needed""" app_settings = get_application_settings(app) - needed_hot_buffers = ( - app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - ) - hot_buffer_instance_type = next( - iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) - ) - current_hot_buffers = cluster.buffer_drained_nodes + assert app_settings.AUTOSCALING_EC2_INSTANCES # nosec instances_to_start = [ i.ec2_instance for i in cluster.buffer_ec2s if i.assigned_tasks ] + + if ( + len(cluster.buffer_drained_nodes) + < app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + ): + # check if we can migrate warm buffers to hot buffers + hot_buffer_instance_type = cast( + InstanceTypeType, + next( + iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) + ), + ) + free_starteable_warm_buffers_to_replace_hot_buffers = [ + warm_buffer.ec2_instance + for warm_buffer in cluster.buffer_ec2s + if (warm_buffer.ec2_instance.type == hot_buffer_instance_type) + and not warm_buffer.assigned_tasks + ] + instances_to_start += free_starteable_warm_buffers_to_replace_hot_buffers[ + : app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER + - len(cluster.buffer_drained_nodes) + ] + if not instances_to_start: return cluster diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index a369296e07a..7e58a00e321 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -50,6 +50,7 @@ ) from pytest_simcore.helpers.logging_tools import log_context from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict +from simcore_service_autoscaling.constants import BUFFER_MACHINE_TAG_KEY from simcore_service_autoscaling.core.settings import ApplicationSettings from simcore_service_autoscaling.models import AssociatedInstance, Cluster from simcore_service_autoscaling.modules.auto_scaling_core import ( @@ -1809,6 +1810,7 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( spied_cluster_analysis: MockType, instance_type_filters: Sequence[FilterTypeDef], mock_find_node_with_name_returns_fake_node: mock.Mock, + mock_compute_node_used_resources: mock.Mock, mock_docker_tag_node: mock.Mock, ): # pre-requisites @@ -1874,8 +1876,12 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( ), ), expected_instance_state="running", - expected_additional_tag_keys=list(ec2_instance_custom_tags), + expected_additional_tag_keys=[ + *list(ec2_instance_custom_tags), + BUFFER_MACHINE_TAG_KEY, + ], instance_filters=instance_type_filters, + expected_user_data=[], ) # let's autoscale again, to check the cluster analysis and tag the nodes @@ -1904,6 +1910,6 @@ async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( f"found {len(analyzed_cluster.buffer_ec2s)}" ) assert ( - len(analyzed_cluster.buffer_drained_nodes) + len(analyzed_cluster.pending_ec2s) == app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER ) From 6737c4c6f873f9cf730ca7e01ec4d42e74f15025 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:04:05 +0100 Subject: [PATCH 12/16] wrong check --- .../tests/unit/test_modules_auto_scaling_dynamic.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 7e58a00e321..315b03820d4 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -376,9 +376,9 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect mock_rabbitmq_post_message, app_settings, initialized_app, - nodes_total=with_instances_machines_hot_buffer, - nodes_drained=with_instances_machines_hot_buffer, - instances_running=with_instances_machines_hot_buffer, + nodes_total=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + nodes_drained=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, + instances_running=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER, cluster_total_resources={ "cpus": app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER * fake_node.description.resources.nano_cp_us From 60ff0a71a934edf89750348edaaedb5dfd752eed Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:04:22 +0100 Subject: [PATCH 13/16] only run upload docker logs if there is a failure --- .github/workflows/ci-testing-deploy.yml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-testing-deploy.yml b/.github/workflows/ci-testing-deploy.yml index aa1efbee7a9..789c552cc81 100644 --- a/.github/workflows/ci-testing-deploy.yml +++ b/.github/workflows/ci-testing-deploy.yml @@ -772,7 +772,7 @@ jobs: if: ${{ !cancelled() }} run: ./ci/github/unit-testing/catalog.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -879,7 +879,7 @@ jobs: if: ${{ !cancelled() }} run: ./ci/github/unit-testing/datcore-adapter.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -930,7 +930,7 @@ jobs: if: ${{ !cancelled() }} run: ./ci/github/unit-testing/director.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -981,7 +981,7 @@ jobs: if: ${{ !cancelled() }} run: ./ci/github/unit-testing/director-v2.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -1910,7 +1910,7 @@ jobs: - name: test run: ./ci/github/integration-testing/webserver.bash test 01 - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -1974,7 +1974,7 @@ jobs: - name: test run: ./ci/github/integration-testing/webserver.bash test 02 - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2038,7 +2038,7 @@ jobs: - name: test run: ./ci/github/integration-testing/director-v2.bash test 01 - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2111,7 +2111,7 @@ jobs: - name: test run: ./ci/github/integration-testing/director-v2.bash test 02 - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2177,7 +2177,7 @@ jobs: - name: test run: ./ci/github/integration-testing/dynamic-sidecar.bash test 01 - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2241,7 +2241,7 @@ jobs: - name: test run: ./ci/github/integration-testing/simcore-sdk.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2330,7 +2330,7 @@ jobs: - name: test run: ./ci/github/system-testing/public-api.bash test - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs @@ -2395,7 +2395,7 @@ jobs: name: ${{ github.job }}_services_settings_schemas path: ./services/**/settings-schema.json - name: upload failed tests logs - if: ${{ !cancelled() }} + if: ${{ failure() }} uses: actions/upload-artifact@v4 with: name: ${{ github.job }}_docker_logs From 66286319ec4201bc1db88b60fd108ff688ed11c9 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 14:36:17 +0100 Subject: [PATCH 14/16] reduce number of test --- ...test_modules_auto_scaling_computational.py | 116 +++++++++++++- .../unit/test_modules_auto_scaling_dynamic.py | 150 +++++++++++++++++- 2 files changed, 263 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py index 6e7a0d7c828..bad4215a65e 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_computational.py @@ -305,6 +305,18 @@ async def _(scale_up_params: _ScaleUpParams) -> list[distributed.Future]: return _ +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_with_no_tasks_does_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -330,6 +342,18 @@ async def test_cluster_scaling_with_no_tasks_does_nothing( @pytest.mark.acceptance_test( "Ensure this does not happen https://github.com/ITISFoundation/osparc-simcore/issues/6227" ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_with_disabled_ssm_does_not_block_autoscaling( minimal_configuration: None, disabled_ssm: None, @@ -353,6 +377,18 @@ async def test_cluster_scaling_with_disabled_ssm_does_not_block_autoscaling( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_with_task_with_too_much_resources_starts_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -800,6 +836,18 @@ async def test_cluster_scaling_up_and_down( # noqa: PLR0915 mock_docker_compute_node_used_resources.assert_not_called() +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_does_not_scale_up_if_defined_instance_is_not_allowed( minimal_configuration: None, app_settings: ApplicationSettings, @@ -839,6 +887,18 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_allowed( assert "Unexpected error:" in error_messages[0] +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_resources( minimal_configuration: None, app_settings: ApplicationSettings, @@ -878,6 +938,18 @@ async def test_cluster_does_not_scale_up_if_defined_instance_is_not_fitting_reso assert "Unexpected error:" in error_messages[0] +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -948,6 +1020,18 @@ async def test_cluster_scaling_up_starts_multiple_instances( mock_rabbitmq_post_message.reset_mock() +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -1044,6 +1128,18 @@ async def test_cluster_scaling_up_more_than_allowed_max_starts_max_instances_and ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_starts_max_instances_and_not_more( patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, @@ -1141,6 +1237,18 @@ async def test_cluster_scaling_up_more_than_allowed_with_multiple_types_max_star ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -1305,11 +1413,15 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( @pytest.mark.parametrize( - "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, ) @pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options "with_drain_nodes_labelled", - ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + ["without_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], indirect=True, ) @pytest.mark.parametrize( diff --git a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py index 315b03820d4..afd3c01e4a3 100644 --- a/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py +++ b/services/autoscaling/tests/unit/test_modules_auto_scaling_dynamic.py @@ -291,6 +291,18 @@ async def _(scale_up_params: _ScaleUpParams) -> list[Service]: return _ +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_with_no_services_does_nothing( minimal_configuration: None, app_settings: ApplicationSettings, @@ -309,6 +321,18 @@ async def test_cluster_scaling_with_no_services_does_nothing( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expected_machines( patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, @@ -409,6 +433,18 @@ async def test_cluster_scaling_with_no_services_and_machine_buffer_starts_expect ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -992,6 +1028,18 @@ async def test_cluster_scaling_up_and_down( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -1068,6 +1116,18 @@ async def test_cluster_scaling_up_and_down_against_aws( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -1150,9 +1210,13 @@ async def test_cluster_scaling_up_starts_multiple_instances( @pytest.mark.parametrize( - "with_docker_join_drained", ["with_AUTOSCALING_DOCKER_JOIN_DRAINED"], indirect=True + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, ) @pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options "with_drain_nodes_labelled", ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], indirect=True, @@ -1447,6 +1511,18 @@ async def test_cluster_adapts_machines_on_the_fly( # noqa: PLR0915 assert instance["InstanceType"] == scale_up_params2.expected_instance_type +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) @pytest.mark.parametrize( "scale_up_params", [ @@ -1608,6 +1684,18 @@ async def test_long_pending_ec2_is_detected_as_broken_terminated_and_restarted( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test__find_terminateable_nodes_with_no_hosts( minimal_configuration: None, initialized_app: FastAPI, @@ -1628,6 +1716,18 @@ async def test__find_terminateable_nodes_with_no_hosts( assert await _find_terminateable_instances(initialized_app, active_cluster) == [] +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test__try_scale_down_cluster_with_no_nodes( minimal_configuration: None, with_valid_time_before_termination: datetime.timedelta, @@ -1652,6 +1752,18 @@ async def test__try_scale_down_cluster_with_no_nodes( mock_remove_nodes.assert_not_called() +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test__activate_drained_nodes_with_no_tasks( minimal_configuration: None, with_valid_time_before_termination: datetime.timedelta, @@ -1685,6 +1797,18 @@ async def test__activate_drained_nodes_with_no_tasks( mock_docker_tag_node.assert_not_called() +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test__activate_drained_nodes_with_no_drained_nodes( minimal_configuration: None, with_valid_time_before_termination: datetime.timedelta, @@ -1726,6 +1850,18 @@ async def test__activate_drained_nodes_with_no_drained_nodes( mock_docker_tag_node.assert_not_called() +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test__activate_drained_nodes_with_drained_node( minimal_configuration: None, with_valid_time_before_termination: datetime.timedelta, @@ -1794,6 +1930,18 @@ async def test__activate_drained_nodes_with_drained_node( ) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_docker_join_drained", + ["without_AUTOSCALING_DOCKER_JOIN_DRAINED"], + indirect=True, +) +@pytest.mark.parametrize( + # NOTE: only the main test test_cluster_scaling_up_and_down is run with all options + "with_drain_nodes_labelled", + ["with_AUTOSCALING_DRAIN_NODES_WITH_LABELS"], + indirect=True, +) async def test_warm_buffers_are_started_to_replace_missing_hot_buffers( patch_ec2_client_launch_instances_min_number_of_instances: mock.Mock, minimal_configuration: None, From c168917fd9bd75d8a7150ec4928df1fa8bddefdf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:34:39 +0100 Subject: [PATCH 15/16] @pcrespov review: typo --- .../simcore_service_autoscaling/modules/auto_scaling_core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 2100b9b67d2..9c45de0524b 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -441,13 +441,13 @@ async def _start_warm_buffer_instances( iter(app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_ALLOWED_TYPES) ), ) - free_starteable_warm_buffers_to_replace_hot_buffers = [ + free_startable_warm_buffers_to_replace_hot_buffers = [ warm_buffer.ec2_instance for warm_buffer in cluster.buffer_ec2s if (warm_buffer.ec2_instance.type == hot_buffer_instance_type) and not warm_buffer.assigned_tasks ] - instances_to_start += free_starteable_warm_buffers_to_replace_hot_buffers[ + instances_to_start += free_startable_warm_buffers_to_replace_hot_buffers[ : app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MACHINES_BUFFER - len(cluster.buffer_drained_nodes) ] From 4d0234165d54ee16571d47f1622a2065ddc13211 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 13 Dec 2024 16:42:46 +0100 Subject: [PATCH 16/16] codecov --- .codecov.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.codecov.yml b/.codecov.yml index 02666df0a13..eb2e6697348 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -10,10 +10,10 @@ flag_management: statuses: - type: project target: auto - threshold: 1% + threshold: 2% - type: patch target: auto - threshold: 1% + threshold: 2% component_management: @@ -22,7 +22,7 @@ component_management: statuses: - type: project target: auto - threshold: 1% + threshold: 2% branches: - "!master" individual_components: @@ -116,12 +116,12 @@ coverage: project: default: informational: true - threshold: 1% + threshold: 2% patch: default: informational: true - threshold: 1% + threshold: 2% comment: layout: "header,diff,flags,components,footer"