diff --git a/.env-devel b/.env-devel index 8f979751926..52fb6e84bfd 100644 --- a/.env-devel +++ b/.env-devel @@ -17,6 +17,7 @@ AGENT_VOLUMES_CLEANUP_S3_ENDPOINT=http://172.17.0.1:9001 AGENT_VOLUMES_CLEANUP_S3_PROVIDER=MINIO AGENT_VOLUMES_CLEANUP_S3_REGION=us-east-1 AGENT_VOLUMES_CLEANUP_S3_SECRET_KEY=12345678 +AGENT_TRACING={} API_SERVER_DEV_FEATURES_ENABLED=0 API_SERVER_LOGLEVEL=INFO diff --git a/packages/aws-library/requirements/_base.txt b/packages/aws-library/requirements/_base.txt index 63c88ba0037..6caf09a9844 100644 --- a/packages/aws-library/requirements/_base.txt +++ b/packages/aws-library/requirements/_base.txt @@ -44,8 +44,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/_base.in # -r requirements/_base.in -async-timeout==4.0.3 - # via redis attrs==24.2.0 # via # aiohttp diff --git a/packages/notifications-library/requirements/_base.txt b/packages/notifications-library/requirements/_base.txt index abc242615c5..634746a1298 100644 --- a/packages/notifications-library/requirements/_base.txt +++ b/packages/notifications-library/requirements/_base.txt @@ -16,6 +16,10 @@ attrs==24.2.0 # referencing click==8.1.7 # via typer +deprecated==1.2.14 + # via + # opentelemetry-api + # opentelemetry-semantic-conventions dnspython==2.6.1 # via email-validator email-validator==2.2.0 @@ -26,6 +30,8 @@ idna==3.10 # via # email-validator # yarl +importlib-metadata==8.5.0 + # via opentelemetry-api jinja2==3.1.4 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -54,6 +60,19 @@ mdurl==0.1.2 # via markdown-it-py multidict==6.1.0 # via yarl +opentelemetry-api==1.28.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-asyncpg + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.49b1 + # via opentelemetry-instrumentation-asyncpg +opentelemetry-instrumentation-asyncpg==0.49b1 + # via -r requirements/../../../packages/postgres-database/requirements/_base.in +opentelemetry-semantic-conventions==0.49b1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-asyncpg orjson==3.10.7 # via # -c requirements/../../../packages/models-library/requirements/../../../requirements/constraints.txt @@ -61,6 +80,8 @@ orjson==3.10.7 # -c requirements/../../../packages/settings-library/requirements/../../../requirements/constraints.txt # -c requirements/../../../requirements/constraints.txt # -r requirements/../../../packages/models-library/requirements/_base.in +packaging==24.2 + # via opentelemetry-instrumentation psycopg2-binary==2.9.9 # via sqlalchemy pydantic==1.10.18 @@ -109,5 +130,11 @@ typing-extensions==4.12.2 # alembic # pydantic # typer +wrapt==1.16.0 + # via + # deprecated + # opentelemetry-instrumentation yarl==1.12.1 # via -r requirements/../../../packages/postgres-database/requirements/_base.in +zipp==3.21.0 + # via importlib-metadata diff --git a/packages/notifications-library/requirements/_test.txt b/packages/notifications-library/requirements/_test.txt index 55a7d9b8ee8..e802554a901 100644 --- a/packages/notifications-library/requirements/_test.txt +++ b/packages/notifications-library/requirements/_test.txt @@ -28,8 +28,9 @@ mypy==1.12.0 # via sqlalchemy mypy-extensions==1.0.0 # via mypy -packaging==24.1 +packaging==24.2 # via + # -c requirements/_base.txt # pytest # pytest-sugar pluggy==1.5.0 diff --git a/packages/notifications-library/requirements/_tools.txt b/packages/notifications-library/requirements/_tools.txt index 217752d687f..4a902da9cb2 100644 --- a/packages/notifications-library/requirements/_tools.txt +++ b/packages/notifications-library/requirements/_tools.txt @@ -38,8 +38,9 @@ mypy-extensions==1.0.0 # mypy nodeenv==1.9.1 # via pre-commit -packaging==24.1 +packaging==24.2 # via + # -c requirements/_base.txt # -c requirements/_test.txt # black # build diff --git a/packages/service-library/requirements/_base.txt b/packages/service-library/requirements/_base.txt index d53ce73a8c4..696dc496fcf 100644 --- a/packages/service-library/requirements/_base.txt +++ b/packages/service-library/requirements/_base.txt @@ -28,8 +28,6 @@ arrow==1.3.0 # via # -r requirements/../../../packages/models-library/requirements/_base.in # -r requirements/_base.in -async-timeout==4.0.3 - # via redis attrs==24.2.0 # via # aiohttp diff --git a/packages/service-library/requirements/_fastapi.in b/packages/service-library/requirements/_fastapi.in index 7b6a6bb2cf2..e11871af331 100644 --- a/packages/service-library/requirements/_fastapi.in +++ b/packages/service-library/requirements/_fastapi.in @@ -9,6 +9,7 @@ fastapi httpx opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-httpx prometheus-client prometheus-fastapi-instrumentator uvicorn diff --git a/packages/service-library/requirements/_fastapi.txt b/packages/service-library/requirements/_fastapi.txt index 8a3aed37600..71c9d7cabce 100644 --- a/packages/service-library/requirements/_fastapi.txt +++ b/packages/service-library/requirements/_fastapi.txt @@ -47,23 +47,29 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-semantic-conventions opentelemetry-instrumentation==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx opentelemetry-instrumentation-asgi==0.48b0 # via opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/_fastapi.in opentelemetry-semantic-conventions==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx prometheus-client==0.21.0 # via # -r requirements/_fastapi.in diff --git a/packages/service-library/src/servicelib/fastapi/http_client_thin.py b/packages/service-library/src/servicelib/fastapi/http_client_thin.py index e00e0d636a2..554ccb450ad 100644 --- a/packages/service-library/src/servicelib/fastapi/http_client_thin.py +++ b/packages/service-library/src/servicelib/fastapi/http_client_thin.py @@ -8,6 +8,8 @@ from httpx import AsyncClient, ConnectError, HTTPError, PoolTimeout, Response from httpx._types import TimeoutTypes, URLTypes from pydantic.errors import PydanticErrorMixin +from servicelib.fastapi.tracing import setup_httpx_client_tracing +from settings_library.tracing import TracingSettings from tenacity import RetryCallState from tenacity.asyncio import AsyncRetrying from tenacity.before_sleep import before_sleep_log @@ -201,6 +203,7 @@ def __init__( base_url: URLTypes | None = None, default_http_client_timeout: TimeoutTypes | None = None, extra_allowed_method_names: set[str] | None = None, + tracing_settings: TracingSettings | None, ) -> None: _assert_public_interface(self, extra_allowed_method_names) @@ -220,7 +223,10 @@ def __init__( if default_http_client_timeout: client_args["timeout"] = default_http_client_timeout - super().__init__(client=AsyncClient(**client_args)) + client = AsyncClient(**client_args) + if tracing_settings: + setup_httpx_client_tracing(client) + super().__init__(client=client) async def __aenter__(self): await self.setup_client() diff --git a/packages/service-library/src/servicelib/fastapi/tracing.py b/packages/service-library/src/servicelib/fastapi/tracing.py index b5179a8a5f6..36e9b06fa12 100644 --- a/packages/service-library/src/servicelib/fastapi/tracing.py +++ b/packages/service-library/src/servicelib/fastapi/tracing.py @@ -5,11 +5,13 @@ import logging from fastapi import FastAPI +from httpx import AsyncClient, Client from opentelemetry import trace from opentelemetry.exporter.otlp.proto.http.trace_exporter import ( OTLPSpanExporter as OTLPSpanExporterHTTP, ) from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor @@ -121,3 +123,7 @@ def setup_tracing( msg="Attempting to add requests opentelemetry autoinstrumentation...", ): RequestsInstrumentor().instrument() + + +def setup_httpx_client_tracing(client: AsyncClient | Client): + HTTPXClientInstrumentor.instrument_client(client) diff --git a/packages/service-library/src/servicelib/redis_utils.py b/packages/service-library/src/servicelib/redis_utils.py index 10f32ae5944..559349cbb0d 100644 --- a/packages/service-library/src/servicelib/redis_utils.py +++ b/packages/service-library/src/servicelib/redis_utils.py @@ -3,7 +3,7 @@ import logging from collections.abc import Awaitable, Callable from datetime import timedelta -from typing import Any +from typing import Any, ParamSpec, TypeVar import arrow @@ -12,10 +12,16 @@ _logger = logging.getLogger(__file__) +P = ParamSpec("P") +R = TypeVar("R") + def exclusive( - redis: RedisClientSDK, *, lock_key: str, lock_value: bytes | str | None = None -): + redis: RedisClientSDK | Callable[..., RedisClientSDK], + *, + lock_key: str | Callable[..., str], + lock_value: bytes | str | None = None, +) -> Callable[[Callable[P, Awaitable[R]]], Callable[P, Awaitable[R]]]: """ Define a method to run exclusively across processes by leveraging a Redis Lock. @@ -24,12 +30,30 @@ def exclusive( redis: the redis client SDK lock_key: a string as the name of the lock (good practice: app_name:lock_name) lock_value: some additional data that can be retrieved by another client + + Raises: + - ValueError if used incorrectly + - CouldNotAcquireLockError if the lock could not be acquired """ - def decorator(func): + if not lock_key: + msg = "lock_key cannot be empty string!" + raise ValueError(msg) + + def decorator(func: Callable[P, Awaitable[R]]) -> Callable[P, Awaitable[R]]: @functools.wraps(func) - async def wrapper(*args, **kwargs): - async with redis.lock_context(lock_key=lock_key, lock_value=lock_value): + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + redis_lock_key = ( + lock_key(*args, **kwargs) if callable(lock_key) else lock_key + ) + assert isinstance(redis_lock_key, str) # nosec + + redis_client = redis(*args, **kwargs) if callable(redis) else redis + assert isinstance(redis_client, RedisClientSDK) # nosec + + async with redis_client.lock_context( + lock_key=redis_lock_key, lock_value=lock_value + ): return await func(*args, **kwargs) return wrapper diff --git a/packages/service-library/tests/fastapi/test_http_client_thin.py b/packages/service-library/tests/fastapi/test_http_client_thin.py index f98de720c33..8c052948f6d 100644 --- a/packages/service-library/tests/fastapi/test_http_client_thin.py +++ b/packages/service-library/tests/fastapi/test_http_client_thin.py @@ -71,7 +71,9 @@ def request_timeout() -> int: @pytest.fixture async def thick_client(request_timeout: int) -> AsyncIterable[FakeThickClient]: - async with FakeThickClient(total_retry_interval=request_timeout) as client: + async with FakeThickClient( + total_retry_interval=request_timeout, tracing_settings=None + ) as client: yield client @@ -95,7 +97,9 @@ async def test_retry_on_errors( test_url: AnyHttpUrl, caplog_info_level: pytest.LogCaptureFixture, ) -> None: - client = FakeThickClient(total_retry_interval=request_timeout) + client = FakeThickClient( + total_retry_interval=request_timeout, tracing_settings=None + ) with pytest.raises(ClientHttpError): await client.get_provided_url(test_url) @@ -119,7 +123,7 @@ async def raises_request_error(self) -> Response: request=Request(method="GET", url=test_url), ) - client = ATestClient(total_retry_interval=request_timeout) + client = ATestClient(total_retry_interval=request_timeout, tracing_settings=None) with pytest.raises(ClientHttpError): await client.raises_request_error() @@ -145,7 +149,7 @@ async def raises_http_error(self) -> Response: msg = "mock_http_error" raise HTTPError(msg) - client = ATestClient(total_retry_interval=request_timeout) + client = ATestClient(total_retry_interval=request_timeout, tracing_settings=None) with pytest.raises(ClientHttpError): await client.raises_http_error() @@ -159,21 +163,25 @@ async def public_method_ok(self) -> Response: # type: ignore """this method will be ok even if no code is used""" # OK - OKTestClient(total_retry_interval=request_timeout) + OKTestClient(total_retry_interval=request_timeout, tracing_settings=None) class FailWrongAnnotationTestClient(BaseThinClient): async def public_method_wrong_annotation(self) -> None: """this method will raise an error""" with pytest.raises(AssertionError, match="should return an instance"): - FailWrongAnnotationTestClient(total_retry_interval=request_timeout) + FailWrongAnnotationTestClient( + total_retry_interval=request_timeout, tracing_settings=None + ) class FailNoAnnotationTestClient(BaseThinClient): async def public_method_no_annotation(self): """this method will raise an error""" with pytest.raises(AssertionError, match="should return an instance"): - FailNoAnnotationTestClient(total_retry_interval=request_timeout) + FailNoAnnotationTestClient( + total_retry_interval=request_timeout, tracing_settings=None + ) async def test_expect_state_decorator( @@ -197,7 +205,9 @@ async def get_wrong_state(self) -> Response: respx_mock.get(url_get_200_ok).mock(return_value=Response(codes.OK)) respx_mock.get(get_wrong_state).mock(return_value=Response(codes.OK)) - test_client = ATestClient(total_retry_interval=request_timeout) + test_client = ATestClient( + total_retry_interval=request_timeout, tracing_settings=None + ) # OK response = await test_client.get_200_ok() @@ -218,7 +228,9 @@ async def test_retry_timeout_overwrite( request_timeout: int, caplog_info_level: pytest.LogCaptureFixture, ) -> None: - client = FakeThickClient(total_retry_interval=request_timeout) + client = FakeThickClient( + total_retry_interval=request_timeout, tracing_settings=None + ) caplog_info_level.clear() start = arrow.utcnow() diff --git a/packages/service-library/tests/test_redis_utils.py b/packages/service-library/tests/test_redis_utils.py index f897fc7c399..26f749cd894 100644 --- a/packages/service-library/tests/test_redis_utils.py +++ b/packages/service-library/tests/test_redis_utils.py @@ -5,6 +5,7 @@ from contextlib import AbstractAsyncContextManager from datetime import timedelta from itertools import chain +from typing import Awaitable from unittest.mock import Mock import arrow @@ -32,39 +33,117 @@ async def _is_locked(redis_client_sdk: RedisClientSDK, lock_name: str) -> bool: @pytest.fixture def lock_name(faker: Faker) -> str: - return faker.uuid4() # type: ignore + return faker.pystr() + + +def _exclusive_sleeping_task( + redis_client_sdk: RedisClientSDK | Callable[..., RedisClientSDK], + lock_name: str | Callable[..., str], + sleep_duration: float, +) -> Callable[..., Awaitable[float]]: + @exclusive(redis_client_sdk, lock_key=lock_name) + async def _() -> float: + resolved_client = ( + redis_client_sdk() if callable(redis_client_sdk) else redis_client_sdk + ) + resolved_lock_name = lock_name() if callable(lock_name) else lock_name + assert await _is_locked(resolved_client, resolved_lock_name) + await asyncio.sleep(sleep_duration) + assert await _is_locked(resolved_client, resolved_lock_name) + return sleep_duration + + return _ + + +@pytest.fixture +def sleep_duration(faker: Faker) -> float: + return faker.pyfloat(positive=True, min_value=0.2, max_value=0.8) -async def _contained_client( +async def test_exclusive_decorator( get_redis_client_sdk: Callable[ [RedisDatabase], AbstractAsyncContextManager[RedisClientSDK] ], lock_name: str, - task_duration: float, -) -> None: - async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client_sdk: - assert not await _is_locked(redis_client_sdk, lock_name) - - @exclusive(redis_client_sdk, lock_key=lock_name) - async def _some_task() -> None: - assert await _is_locked(redis_client_sdk, lock_name) - await asyncio.sleep(task_duration) - assert await _is_locked(redis_client_sdk, lock_name) - - await _some_task() + sleep_duration: float, +): - assert not await _is_locked(redis_client_sdk, lock_name) + async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client: + for _ in range(3): + assert ( + await _exclusive_sleeping_task( + redis_client, lock_name, sleep_duration + )() + == sleep_duration + ) -@pytest.mark.parametrize("task_duration", [0.1, 1, 2]) -async def test_exclusive_sequentially( +async def test_exclusive_decorator_with_key_builder( get_redis_client_sdk: Callable[ [RedisDatabase], AbstractAsyncContextManager[RedisClientSDK] ], lock_name: str, - task_duration: float, + sleep_duration: float, ): - await _contained_client(get_redis_client_sdk, lock_name, task_duration) + def _get_lock_name(*args, **kwargs) -> str: + assert args is not None + assert kwargs is not None + return lock_name + + async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client: + for _ in range(3): + assert ( + await _exclusive_sleeping_task( + redis_client, _get_lock_name, sleep_duration + )() + == sleep_duration + ) + + +async def test_exclusive_decorator_with_client_builder( + get_redis_client_sdk: Callable[ + [RedisDatabase], AbstractAsyncContextManager[RedisClientSDK] + ], + lock_name: str, + sleep_duration: float, +): + async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client: + + def _get_redis_client_builder(*args, **kwargs) -> RedisClientSDK: + assert args is not None + assert kwargs is not None + return redis_client + + for _ in range(3): + assert ( + await _exclusive_sleeping_task( + _get_redis_client_builder, lock_name, sleep_duration + )() + == sleep_duration + ) + + +async def _acquire_lock_and_exclusively_sleep( + get_redis_client_sdk: Callable[ + [RedisDatabase], AbstractAsyncContextManager[RedisClientSDK] + ], + lock_name: str | Callable[..., str], + sleep_duration: float, +) -> None: + async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client_sdk: + redis_lock_name = lock_name() if callable(lock_name) else lock_name + assert not await _is_locked(redis_client_sdk, redis_lock_name) + + @exclusive(redis_client_sdk, lock_key=lock_name) + async def _() -> float: + assert await _is_locked(redis_client_sdk, redis_lock_name) + await asyncio.sleep(sleep_duration) + assert await _is_locked(redis_client_sdk, redis_lock_name) + return sleep_duration + + assert await _() == sleep_duration + + assert not await _is_locked(redis_client_sdk, redis_lock_name) async def test_exclusive_parallel_lock_is_released_and_reacquired( @@ -76,17 +155,19 @@ async def test_exclusive_parallel_lock_is_released_and_reacquired( parallel_tasks = 10 results = await logged_gather( *[ - _contained_client(get_redis_client_sdk, lock_name, task_duration=0.1) + _acquire_lock_and_exclusively_sleep( + get_redis_client_sdk, lock_name, sleep_duration=0.1 + ) for _ in range(parallel_tasks) ], - reraise=False + reraise=False, ) assert results.count(None) == 1 assert [isinstance(x, CouldNotAcquireLockError) for x in results].count( True ) == parallel_tasks - 1 - # check lock is being released + # check lock is released async with get_redis_client_sdk(RedisDatabase.RESOURCES) as redis_client_sdk: assert not await _is_locked(redis_client_sdk, lock_name) @@ -168,7 +249,7 @@ async def test_start_exclusive_periodic_task_parallel_all_finish( _assert_task_completes_once(get_redis_client_sdk, stop_after=60) for _ in range(parallel_tasks) ], - reraise=False + reraise=False, ) # check no error occurred diff --git a/packages/simcore-sdk/requirements/_base.txt b/packages/simcore-sdk/requirements/_base.txt index 5eac02fa1ec..11be2af08e1 100644 --- a/packages/simcore-sdk/requirements/_base.txt +++ b/packages/simcore-sdk/requirements/_base.txt @@ -48,7 +48,6 @@ async-timeout==4.0.3 # via # aiopg # asyncpg - # redis asyncpg==0.29.0 # via sqlalchemy attrs==24.2.0 diff --git a/services/agent/requirements/_base.txt b/services/agent/requirements/_base.txt index 59f29515fe5..a42027b8a00 100644 --- a/services/agent/requirements/_base.txt +++ b/services/agent/requirements/_base.txt @@ -38,8 +38,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.3 - # via redis attrs==24.2.0 # via # aiohttp @@ -143,6 +141,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -161,12 +160,15 @@ opentelemetry-instrumentation==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.48b0 # via opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.48b0 @@ -185,6 +187,7 @@ opentelemetry-semantic-conventions==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -192,6 +195,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.7 # via diff --git a/services/agent/src/simcore_service_agent/core/application.py b/services/agent/src/simcore_service_agent/core/application.py index 41c80b07d61..84bc71e24c5 100644 --- a/services/agent/src/simcore_service_agent/core/application.py +++ b/services/agent/src/simcore_service_agent/core/application.py @@ -5,6 +5,7 @@ get_common_oas_options, override_fastapi_openapi_method, ) +from servicelib.fastapi.tracing import setup_tracing from servicelib.logging_utils import config_all_loggers from .._meta import ( @@ -59,6 +60,9 @@ def create_app() -> FastAPI: setup_rest_api(app) setup_rpc_api_routes(app) + if settings.AGENT_TRACING: + setup_tracing(app, settings.AGENT_TRACING, APP_NAME) + async def _on_startup() -> None: print(APP_STARTED_BANNER_MSG, flush=True) # noqa: T201 diff --git a/services/agent/src/simcore_service_agent/core/settings.py b/services/agent/src/simcore_service_agent/core/settings.py index 756bf2cac28..f11350968f4 100644 --- a/services/agent/src/simcore_service_agent/core/settings.py +++ b/services/agent/src/simcore_service_agent/core/settings.py @@ -6,6 +6,7 @@ from settings_library.base import BaseCustomSettings from settings_library.r_clone import S3Provider from settings_library.rabbit import RabbitSettings +from settings_library.tracing import TracingSettings from settings_library.utils_logging import MixinLoggingSettings @@ -77,6 +78,10 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): auto_default_from_env=True, description="settings for service/rabbitmq" ) + AGENT_TRACING: TracingSettings | None = Field( + auto_default_from_env=True, description="settings for opentelemetry tracing" + ) + @validator("LOGLEVEL") @classmethod def valid_log_level(cls, value) -> LogLevel: diff --git a/services/agent/tests/conftest.py b/services/agent/tests/conftest.py index 4632ca84102..5fe2cad817e 100644 --- a/services/agent/tests/conftest.py +++ b/services/agent/tests/conftest.py @@ -58,6 +58,7 @@ def mock_environment( "RABBIT_SECURE": "false", "RABBIT_USER": "test", "AGENT_DOCKER_NODE_ID": docker_node_id, + "AGENT_TRACING": "null", }, ) diff --git a/services/api-server/requirements/_base.txt b/services/api-server/requirements/_base.txt index 92a441a0e25..02a3778eab2 100644 --- a/services/api-server/requirements/_base.txt +++ b/services/api-server/requirements/_base.txt @@ -74,7 +74,6 @@ async-timeout==4.0.3 # via # aiopg # asyncpg - # redis asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -283,6 +282,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -306,6 +306,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-aiopg==0.48b0 @@ -320,6 +321,8 @@ opentelemetry-instrumentation-dbapi==0.48b0 # via opentelemetry-instrumentation-aiopg opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via # -r requirements/../../../packages/service-library/requirements/_base.in @@ -345,6 +348,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -352,6 +356,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/api-server/src/simcore_service_api_server/core/application.py b/services/api-server/src/simcore_service_api_server/core/application.py index 04dcd397c28..3d67746deb7 100644 --- a/services/api-server/src/simcore_service_api_server/core/application.py +++ b/services/api-server/src/simcore_service_api_server/core/application.py @@ -82,19 +82,36 @@ def init_app(settings: ApplicationSettings | None = None) -> FastAPI: setup_rabbitmq(app) + if settings.API_SERVER_TRACING: + setup_tracing(app, settings.API_SERVER_TRACING, APP_NAME) + if settings.API_SERVER_WEBSERVER: - webserver.setup(app, settings.API_SERVER_WEBSERVER) - if app.state.settings.API_SERVER_TRACING: - setup_tracing(app, app.state.settings.API_SERVER_TRACING, APP_NAME) + webserver.setup( + app, + settings.API_SERVER_WEBSERVER, + tracing_settings=settings.API_SERVER_TRACING, + ) if settings.API_SERVER_CATALOG: - catalog.setup(app, settings.API_SERVER_CATALOG) + catalog.setup( + app, + settings.API_SERVER_CATALOG, + tracing_settings=settings.API_SERVER_TRACING, + ) if settings.API_SERVER_STORAGE: - storage.setup(app, settings.API_SERVER_STORAGE) + storage.setup( + app, + settings.API_SERVER_STORAGE, + tracing_settings=settings.API_SERVER_TRACING, + ) if settings.API_SERVER_DIRECTOR_V2: - director_v2.setup(app, settings.API_SERVER_DIRECTOR_V2) + director_v2.setup( + app, + settings.API_SERVER_DIRECTOR_V2, + tracing_settings=settings.API_SERVER_TRACING, + ) # setup app app.add_event_handler("startup", create_start_app_handler(app)) diff --git a/services/api-server/src/simcore_service_api_server/services/catalog.py b/services/api-server/src/simcore_service_api_server/services/catalog.py index 56a7d648790..461237ce998 100644 --- a/services/api-server/src/simcore_service_api_server/services/catalog.py +++ b/services/api-server/src/simcore_service_api_server/services/catalog.py @@ -11,6 +11,7 @@ from models_library.services import ServiceMetaDataPublished, ServiceType from pydantic import Extra, ValidationError, parse_obj_as, parse_raw_as from settings_library.catalog import CatalogSettings +from settings_library.tracing import TracingSettings from simcore_service_api_server.exceptions.backend_errors import ( ListSolversOrStudiesError, SolverOrStudyNotFoundError, @@ -209,10 +210,16 @@ async def get_latest_release( # MODULES APP SETUP ------------------------------------------------------------- -def setup(app: FastAPI, settings: CatalogSettings) -> None: +def setup( + app: FastAPI, settings: CatalogSettings, tracing_settings: TracingSettings | None +) -> None: if not settings: settings = CatalogSettings() setup_client_instance( - app, CatalogApi, api_baseurl=settings.api_base_url, service_name="catalog" + app, + CatalogApi, + api_baseurl=settings.api_base_url, + service_name="catalog", + tracing_settings=tracing_settings, ) diff --git a/services/api-server/src/simcore_service_api_server/services/director_v2.py b/services/api-server/src/simcore_service_api_server/services/director_v2.py index ff31490b072..938e36c5242 100644 --- a/services/api-server/src/simcore_service_api_server/services/director_v2.py +++ b/services/api-server/src/simcore_service_api_server/services/director_v2.py @@ -9,6 +9,7 @@ from models_library.projects_pipeline import ComputationTask from models_library.projects_state import RunningState from pydantic import AnyHttpUrl, AnyUrl, BaseModel, Field, PositiveInt, parse_raw_as +from settings_library.tracing import TracingSettings from simcore_service_api_server.exceptions.backend_errors import ( JobNotFoundError, LogFileNotFoundError, @@ -191,11 +192,14 @@ async def get_computation_logs( # MODULES APP SETUP ------------------------------------------------------------- -def setup(app: FastAPI, settings: DirectorV2Settings) -> None: +def setup( + app: FastAPI, settings: DirectorV2Settings, tracing_settings: TracingSettings | None +) -> None: setup_client_instance( app, DirectorV2Api, # WARNING: it has /v0 and /v2 prefixes api_baseurl=settings.base_url, service_name="director_v2", + tracing_settings=tracing_settings, ) diff --git a/services/api-server/src/simcore_service_api_server/services/storage.py b/services/api-server/src/simcore_service_api_server/services/storage.py index 13920d8a931..4e6d8be54ca 100644 --- a/services/api-server/src/simcore_service_api_server/services/storage.py +++ b/services/api-server/src/simcore_service_api_server/services/storage.py @@ -14,6 +14,7 @@ from models_library.basic_types import SHA256Str from models_library.generics import Envelope from pydantic import AnyUrl, PositiveInt +from settings_library.tracing import TracingSettings from starlette.datastructures import URL from ..core.settings import StorageSettings @@ -209,12 +210,18 @@ async def create_soft_link( # MODULES APP SETUP ------------------------------------------------------------- -def setup(app: FastAPI, settings: StorageSettings) -> None: +def setup( + app: FastAPI, settings: StorageSettings, tracing_settings: TracingSettings | None +) -> None: if not settings: settings = StorageSettings() setup_client_instance( - app, StorageApi, api_baseurl=settings.api_base_url, service_name="storage" + app, + StorageApi, + api_baseurl=settings.api_base_url, + service_name="storage", + tracing_settings=tracing_settings, ) diff --git a/services/api-server/src/simcore_service_api_server/services/webserver.py b/services/api-server/src/simcore_service_api_server/services/webserver.py index 0d265248dc2..19688728cb5 100644 --- a/services/api-server/src/simcore_service_api_server/services/webserver.py +++ b/services/api-server/src/simcore_service_api_server/services/webserver.py @@ -48,6 +48,7 @@ X_SIMCORE_PARENT_NODE_ID, X_SIMCORE_PARENT_PROJECT_UUID, ) +from settings_library.tracing import TracingSettings from simcore_service_api_server.exceptions.backend_errors import ( ConfigurationError, ForbiddenWalletError, @@ -588,24 +589,30 @@ async def get_service_pricing_plan( # MODULES APP SETUP ------------------------------------------------------------- -def setup(app: FastAPI, settings: WebServerSettings) -> None: +def setup( + app: FastAPI, + webserver_settings: WebServerSettings, + tracing_settings: TracingSettings | None, +) -> None: setup_client_instance( app, WebserverApi, - api_baseurl=settings.api_base_url, + api_baseurl=webserver_settings.api_base_url, service_name="webserver", + tracing_settings=tracing_settings, ) setup_client_instance( app, LongRunningTasksClient, api_baseurl="", service_name="long_running_tasks_client", + tracing_settings=tracing_settings, ) def _on_startup() -> None: # normalize & encrypt - secret_key = settings.WEBSERVER_SESSION_SECRET_KEY.get_secret_value() + secret_key = webserver_settings.WEBSERVER_SESSION_SECRET_KEY.get_secret_value() app.state.webserver_fernet = fernet.Fernet(secret_key) async def _on_shutdown() -> None: diff --git a/services/api-server/src/simcore_service_api_server/utils/client_base.py b/services/api-server/src/simcore_service_api_server/utils/client_base.py index ed58f7429e3..3cc35a74bb6 100644 --- a/services/api-server/src/simcore_service_api_server/utils/client_base.py +++ b/services/api-server/src/simcore_service_api_server/utils/client_base.py @@ -4,6 +4,8 @@ import httpx from fastapi import FastAPI from httpx import AsyncClient +from servicelib.fastapi.tracing import setup_httpx_client_tracing +from settings_library.tracing import TracingSettings from .app_data import AppDataMixin @@ -43,14 +45,16 @@ def setup_client_instance( api_cls: type[BaseServiceClientApi], api_baseurl, service_name: str, + tracing_settings: TracingSettings | None, **extra_fields, ) -> None: """Helper to add init/cleanup of ServiceClientApi instances in the app lifespam""" assert issubclass(api_cls, BaseServiceClientApi) # nosec - # NOTE: this term is mocked in tests. If you need to modify pay attention to the mock client = AsyncClient(base_url=api_baseurl) + if tracing_settings: + setup_httpx_client_tracing(client) # events def _create_instance() -> None: diff --git a/services/api-server/tests/unit/test_utils_client_base.py b/services/api-server/tests/unit/test_utils_client_base.py index 61370a8ea52..9fe2da1a28c 100644 --- a/services/api-server/tests/unit/test_utils_client_base.py +++ b/services/api-server/tests/unit/test_utils_client_base.py @@ -43,6 +43,7 @@ class TheClientApi(BaseServiceClientApi): service_name="the_service", health_check_path="/health", x=42, + tracing_settings=None, ) assert not TheClientApi.get_instance(app) diff --git a/services/autoscaling/requirements/_base.txt b/services/autoscaling/requirements/_base.txt index 0c7ff77b07f..995fb44e3f4 100644 --- a/services/autoscaling/requirements/_base.txt +++ b/services/autoscaling/requirements/_base.txt @@ -65,8 +65,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.3 - # via redis attrs==23.2.0 # via # aiohttp @@ -260,6 +258,7 @@ opentelemetry-api==1.26.0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-propagator-aws-xray @@ -282,6 +281,7 @@ opentelemetry-instrumentation==0.47b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.47b0 @@ -290,6 +290,8 @@ opentelemetry-instrumentation-botocore==0.47b0 # via -r requirements/../../../packages/aws-library/requirements/_base.in opentelemetry-instrumentation-fastapi==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.47b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.47b0 # via # -r requirements/../../../packages/aws-library/requirements/../../../packages/service-library/requirements/_base.in @@ -316,6 +318,7 @@ opentelemetry-semantic-conventions==0.47b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -323,6 +326,7 @@ opentelemetry-util-http==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.3 # via diff --git a/services/autoscaling/requirements/_test.txt b/services/autoscaling/requirements/_test.txt index 8abc686eb76..47379c4d69f 100644 --- a/services/autoscaling/requirements/_test.txt +++ b/services/autoscaling/requirements/_test.txt @@ -6,10 +6,6 @@ anyio==4.3.0 # httpx asgi-lifespan==2.1.0 # via -r requirements/_test.in -async-timeout==4.0.3 - # via - # -c requirements/_base.txt - # redis attrs==23.2.0 # via # -c requirements/_base.txt diff --git a/services/catalog/requirements/_base.txt b/services/catalog/requirements/_base.txt index 890adbe5508..e650830f05d 100644 --- a/services/catalog/requirements/_base.txt +++ b/services/catalog/requirements/_base.txt @@ -41,9 +41,7 @@ arrow==1.3.0 asgiref==3.8.1 # via opentelemetry-instrumentation-asgi async-timeout==4.0.3 - # via - # asyncpg - # redis + # via asyncpg asyncpg==0.29.0 # via # -r requirements/_base.in @@ -191,6 +189,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -210,6 +209,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.48b0 @@ -218,6 +218,8 @@ opentelemetry-instrumentation-asyncpg==0.48b0 # via -r requirements/../../../packages/postgres-database/requirements/_base.in opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.48b0 @@ -237,6 +239,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -244,6 +247,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/catalog/src/simcore_service_catalog/core/application.py b/services/catalog/src/simcore_service_catalog/core/application.py index a28dc8c5a32..94f35b3d1ea 100644 --- a/services/catalog/src/simcore_service_catalog/core/application.py +++ b/services/catalog/src/simcore_service_catalog/core/application.py @@ -46,8 +46,13 @@ def create_app(settings: ApplicationSettings | None = None) -> FastAPI: # STATE app.state.settings = settings + if settings.CATALOG_TRACING: + setup_tracing(app, settings.CATALOG_TRACING, APP_NAME) + # STARTUP-EVENT - app.add_event_handler("startup", create_on_startup(app)) + app.add_event_handler( + "startup", create_on_startup(app, tracing_settings=settings.CATALOG_TRACING) + ) # PLUGIN SETUP setup_function_services(app) @@ -65,8 +70,6 @@ def create_app(settings: ApplicationSettings | None = None) -> FastAPI: app.add_middleware( BaseHTTPMiddleware, dispatch=timing_middleware.add_process_time_header ) - if app.state.settings.CATALOG_TRACING: - setup_tracing(app, app.state.settings.CATALOG_TRACING, APP_NAME) app.add_middleware(GZipMiddleware) diff --git a/services/catalog/src/simcore_service_catalog/core/events.py b/services/catalog/src/simcore_service_catalog/core/events.py index f22adbba4ec..dde295a2e56 100644 --- a/services/catalog/src/simcore_service_catalog/core/events.py +++ b/services/catalog/src/simcore_service_catalog/core/events.py @@ -5,6 +5,7 @@ from fastapi import FastAPI from servicelib.fastapi.db_asyncpg_engine import close_db_connection, connect_to_db from servicelib.logging_utils import log_context +from settings_library.tracing import TracingSettings from .._meta import APP_FINISHED_BANNER_MSG, APP_STARTED_BANNER_MSG from ..db.events import setup_default_product @@ -26,7 +27,9 @@ def _flush_finished_banner() -> None: print(APP_FINISHED_BANNER_MSG, flush=True) # noqa: T201 -def create_on_startup(app: FastAPI) -> EventCallable: +def create_on_startup( + app: FastAPI, tracing_settings: TracingSettings | None +) -> EventCallable: async def _() -> None: _flush_started_banner() @@ -37,7 +40,7 @@ async def _() -> None: if app.state.settings.CATALOG_DIRECTOR: # setup connection to director - await setup_director(app) + await setup_director(app, tracing_settings=tracing_settings) # FIXME: check director service is in place and ready. Hand-shake?? # SEE https://github.com/ITISFoundation/osparc-simcore/issues/1728 diff --git a/services/catalog/src/simcore_service_catalog/services/director.py b/services/catalog/src/simcore_service_catalog/services/director.py index 7c6925902f4..e97b72bb3f2 100644 --- a/services/catalog/src/simcore_service_catalog/services/director.py +++ b/services/catalog/src/simcore_service_catalog/services/director.py @@ -11,7 +11,9 @@ from models_library.services_metadata_published import ServiceMetaDataPublished from models_library.services_types import ServiceKey, ServiceVersion from models_library.utils.json_serialization import json_dumps +from servicelib.fastapi.tracing import setup_httpx_client_tracing from servicelib.logging_utils import log_context +from settings_library.tracing import TracingSettings from starlette import status from tenacity.asyncio import AsyncRetrying from tenacity.before_sleep import before_sleep_log @@ -106,11 +108,15 @@ class DirectorApi: SEE services/catalog/src/simcore_service_catalog/api/dependencies/director.py """ - def __init__(self, base_url: str, app: FastAPI): + def __init__( + self, base_url: str, app: FastAPI, tracing_settings: TracingSettings | None + ): self.client = httpx.AsyncClient( base_url=base_url, timeout=app.state.settings.CATALOG_CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, ) + if tracing_settings: + setup_httpx_client_tracing(self.client) self.vtag = app.state.settings.CATALOG_DIRECTOR.DIRECTOR_VTAG async def close(self): @@ -151,15 +157,25 @@ async def get_service( return ServiceMetaDataPublished.parse_obj(data[0]) -async def setup_director(app: FastAPI) -> None: +async def setup_director( + app: FastAPI, tracing_settings: TracingSettings | None +) -> None: if settings := app.state.settings.CATALOG_DIRECTOR: with log_context( _logger, logging.DEBUG, "Setup director at %s", f"{settings.base_url=}" ): async for attempt in AsyncRetrying(**_director_startup_retry_policy): - client = DirectorApi(base_url=settings.base_url, app=app) + client = DirectorApi( + base_url=settings.base_url, + app=app, + tracing_settings=tracing_settings, + ) with attempt: - client = DirectorApi(base_url=settings.base_url, app=app) + client = DirectorApi( + base_url=settings.base_url, + app=app, + tracing_settings=tracing_settings, + ) if not await client.is_responsive(): with suppress(Exception): await client.close() diff --git a/services/clusters-keeper/requirements/_base.txt b/services/clusters-keeper/requirements/_base.txt index 9443ee269ef..344d07b5339 100644 --- a/services/clusters-keeper/requirements/_base.txt +++ b/services/clusters-keeper/requirements/_base.txt @@ -63,8 +63,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.3 - # via redis attrs==23.2.0 # via # aiohttp @@ -258,6 +256,7 @@ opentelemetry-api==1.26.0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-propagator-aws-xray @@ -280,6 +279,7 @@ opentelemetry-instrumentation==0.47b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.47b0 @@ -288,6 +288,8 @@ opentelemetry-instrumentation-botocore==0.47b0 # via -r requirements/../../../packages/aws-library/requirements/_base.in opentelemetry-instrumentation-fastapi==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.47b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.47b0 # via # -r requirements/../../../packages/aws-library/requirements/../../../packages/service-library/requirements/_base.in @@ -314,6 +316,7 @@ opentelemetry-semantic-conventions==0.47b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -321,6 +324,7 @@ opentelemetry-util-http==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.3 # via diff --git a/services/clusters-keeper/requirements/_test.txt b/services/clusters-keeper/requirements/_test.txt index e2832a14944..00a7437644c 100644 --- a/services/clusters-keeper/requirements/_test.txt +++ b/services/clusters-keeper/requirements/_test.txt @@ -19,10 +19,6 @@ anyio==4.3.0 # httpx asgi-lifespan==2.1.0 # via -r requirements/_test.in -async-timeout==4.0.3 - # via - # -c requirements/_base.txt - # redis attrs==23.2.0 # via # -c requirements/_base.txt diff --git a/services/dask-sidecar/requirements/_base.txt b/services/dask-sidecar/requirements/_base.txt index 6cdd686b12f..dc0ea01d6f9 100644 --- a/services/dask-sidecar/requirements/_base.txt +++ b/services/dask-sidecar/requirements/_base.txt @@ -46,8 +46,6 @@ arrow==1.3.0 # -r requirements/../../../packages/models-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/../../../packages/models-library/requirements/_base.in # -r requirements/../../../packages/service-library/requirements/_base.in -async-timeout==4.0.3 - # via redis attrs==23.2.0 # via # aiohttp diff --git a/services/datcore-adapter/requirements/_base.txt b/services/datcore-adapter/requirements/_base.txt index f8fe44d6058..5a9116dfe47 100644 --- a/services/datcore-adapter/requirements/_base.txt +++ b/services/datcore-adapter/requirements/_base.txt @@ -39,8 +39,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.3 - # via redis attrs==23.2.0 # via # aiohttp @@ -166,6 +164,7 @@ opentelemetry-api==1.26.0 # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -184,12 +183,15 @@ opentelemetry-instrumentation==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.47b0 # via opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-fastapi==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.47b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.47b0 @@ -208,6 +210,7 @@ opentelemetry-semantic-conventions==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -215,6 +218,7 @@ opentelemetry-util-http==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/director-v2/requirements/_base.txt b/services/director-v2/requirements/_base.txt index 02162fe9a64..dfcfa5ab028 100644 --- a/services/director-v2/requirements/_base.txt +++ b/services/director-v2/requirements/_base.txt @@ -81,7 +81,6 @@ async-timeout==4.0.3 # via # aiopg # asyncpg - # redis asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -340,6 +339,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -363,6 +363,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-aiopg==0.48b0 @@ -377,6 +378,8 @@ opentelemetry-instrumentation-dbapi==0.48b0 # via opentelemetry-instrumentation-aiopg opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via # -r requirements/../../../packages/service-library/requirements/_base.in @@ -402,6 +405,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -409,6 +413,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests ordered-set==4.1.0 # via -r requirements/_base.in diff --git a/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py b/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py index a0903608789..aa01af1f34b 100644 --- a/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/api/dependencies/scheduler.py @@ -1,3 +1,5 @@ +from typing import Annotated + from fastapi import Depends, FastAPI, Request from ...core.settings import ComputationalBackendSettings @@ -11,7 +13,7 @@ def get_scheduler(request: Request) -> BaseCompScheduler: def get_scheduler_settings( - app: FastAPI = Depends(get_app), + app: Annotated[FastAPI, Depends(get_app)] ) -> ComputationalBackendSettings: settings: ComputationalBackendSettings = ( app.state.settings.DIRECTOR_V2_COMPUTATIONAL_BACKEND diff --git a/services/director-v2/src/simcore_service_director_v2/cli/_client.py b/services/director-v2/src/simcore_service_director_v2/cli/_client.py index 541d90688dc..872c08f3b5f 100644 --- a/services/director-v2/src/simcore_service_director_v2/cli/_client.py +++ b/services/director-v2/src/simcore_service_director_v2/cli/_client.py @@ -12,7 +12,9 @@ class ThinDV2LocalhostClient(BaseThinClient): def __init__(self): super().__init__( - total_retry_interval=10, default_http_client_timeout=Timeout(5) + total_retry_interval=10, + default_http_client_timeout=Timeout(5), + tracing_settings=None, ) def _get_url(self, postfix: str) -> str: diff --git a/services/director-v2/src/simcore_service_director_v2/cli/_core.py b/services/director-v2/src/simcore_service_director_v2/cli/_core.py index 893aed2504e..70ee252aa20 100644 --- a/services/director-v2/src/simcore_service_director_v2/cli/_core.py +++ b/services/director-v2/src/simcore_service_director_v2/cli/_core.py @@ -36,13 +36,16 @@ async def _initialized_app(only_db: bool = False) -> AsyncIterator[FastAPI]: app = create_base_app() settings: AppSettings = app.state.settings - # Initialize minimal required components for the application db.setup(app, settings.POSTGRES) if not only_db: dynamic_sidecar.setup(app) - director_v0.setup(app, settings.DIRECTOR_V0) + director_v0.setup( + app, + director_v0_settings=settings.DIRECTOR_V0, + tracing_settings=settings.DIRECTOR_V2_TRACING, + ) await app.router.startup() yield app diff --git a/services/director-v2/src/simcore_service_director_v2/core/application.py b/services/director-v2/src/simcore_service_director_v2/core/application.py index f1c81f18f98..6487d725143 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/application.py +++ b/services/director-v2/src/simcore_service_director_v2/core/application.py @@ -149,19 +149,34 @@ def init_app(settings: AppSettings | None = None) -> FastAPI: substitutions.setup(app) + if settings.DIRECTOR_V2_TRACING: + setup_tracing(app, settings.DIRECTOR_V2_TRACING, APP_NAME) + if settings.DIRECTOR_V0.DIRECTOR_V0_ENABLED: - director_v0.setup(app, settings.DIRECTOR_V0) + director_v0.setup( + app, + director_v0_settings=settings.DIRECTOR_V0, + tracing_settings=settings.DIRECTOR_V2_TRACING, + ) if settings.DIRECTOR_V2_STORAGE: - storage.setup(app, settings.DIRECTOR_V2_STORAGE) + storage.setup( + app, + storage_settings=settings.DIRECTOR_V2_STORAGE, + tracing_settings=settings.DIRECTOR_V2_TRACING, + ) if settings.DIRECTOR_V2_CATALOG: - catalog.setup(app, settings.DIRECTOR_V2_CATALOG) + catalog.setup( + app, + catalog_settings=settings.DIRECTOR_V2_CATALOG, + tracing_settings=settings.DIRECTOR_V2_TRACING, + ) db.setup(app, settings.POSTGRES) if settings.DYNAMIC_SERVICES.DIRECTOR_V2_DYNAMIC_SERVICES_ENABLED: - dynamic_services.setup(app) + dynamic_services.setup(app, tracing_settings=settings.DIRECTOR_V2_TRACING) dynamic_scheduler_enabled = settings.DYNAMIC_SERVICES.DYNAMIC_SIDECAR and ( settings.DYNAMIC_SERVICES.DYNAMIC_SCHEDULER @@ -192,8 +207,6 @@ def init_app(settings: AppSettings | None = None) -> FastAPI: if settings.DIRECTOR_V2_PROMETHEUS_INSTRUMENTATION_ENABLED: instrumentation.setup(app) - if settings.DIRECTOR_V2_TRACING: - setup_tracing(app, app.state.settings.DIRECTOR_V2_TRACING, APP_NAME) if settings.DIRECTOR_V2_PROFILING: app.add_middleware(ProfilerMiddleware) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/catalog.py b/services/director-v2/src/simcore_service_director_v2/modules/catalog.py index f5e378afa43..22b4eb89bd3 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/catalog.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/catalog.py @@ -9,26 +9,37 @@ from models_library.services_resources import ServiceResourcesDict from models_library.users import UserID from pydantic import parse_obj_as +from servicelib.fastapi.tracing import setup_httpx_client_tracing from settings_library.catalog import CatalogSettings +from settings_library.tracing import TracingSettings from ..utils.client_decorators import handle_errors, handle_retry logger = logging.getLogger(__name__) -def setup(app: FastAPI, settings: CatalogSettings) -> None: - if not settings: - settings = CatalogSettings() +def setup( + app: FastAPI, + catalog_settings: CatalogSettings | None, + tracing_settings: TracingSettings | None, +) -> None: + + if not catalog_settings: + catalog_settings = CatalogSettings() async def on_startup() -> None: + client = httpx.AsyncClient( + base_url=f"{catalog_settings.api_base_url}", + timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, + ) + if tracing_settings: + setup_httpx_client_tracing(client=client) + CatalogClient.create( app, - client=httpx.AsyncClient( - base_url=f"{settings.api_base_url}", - timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, - ), + client=client, ) - logger.debug("created client for catalog: %s", settings.api_base_url) + logger.debug("created client for catalog: %s", catalog_settings.api_base_url) # Here we currently do not ensure the catalog is up on start # This will need to be assessed. diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py index 1eb6c3dab10..d06c37457b7 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/__init__.py @@ -1,7 +1,38 @@ +import logging +from collections.abc import Callable, Coroutine +from typing import Any, cast + from fastapi import FastAPI +from servicelib.logging_utils import log_context +from . import _scheduler_factory from ._base_scheduler import BaseCompScheduler -from ._task import on_app_shutdown, on_app_startup + +_logger = logging.getLogger(__name__) + + +def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: + async def start_scheduler() -> None: + with log_context( + _logger, level=logging.INFO, msg="starting computational scheduler" + ): + app.state.scheduler = scheduler = await _scheduler_factory.create_from_db( + app + ) + scheduler.recover_scheduling() + + return start_scheduler + + +def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: + async def stop_scheduler() -> None: + await get_scheduler(app).shutdown() + + return stop_scheduler + + +def get_scheduler(app: FastAPI) -> BaseCompScheduler: + return cast(BaseCompScheduler, app.state.scheduler) def setup(app: FastAPI): @@ -12,4 +43,5 @@ def setup(app: FastAPI): __all__: tuple[str, ...] = ( "setup", "BaseCompScheduler", + "get_scheduler", ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py index cae539596d4..097afd95288 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_base_scheduler.py @@ -12,7 +12,9 @@ """ import asyncio +import contextlib import datetime +import functools import logging from abc import ABC, abstractmethod from dataclasses import dataclass, field @@ -29,9 +31,12 @@ from models_library.users import UserID from networkx.classes.reportviews import InDegreeView from pydantic import PositiveInt +from servicelib.background_task import start_periodic_task, stop_periodic_task from servicelib.common_headers import UNDEFINED_DEFAULT_SIMCORE_USER_AGENT_VALUE +from servicelib.logging_utils import log_context from servicelib.rabbitmq import RabbitMQClient, RabbitMQRPCClient -from servicelib.utils import limited_gather +from servicelib.redis import CouldNotAcquireLockError, RedisClientSDK +from servicelib.redis_utils import exclusive from ...constants import UNDEFINED_STR_METADATA from ...core.errors import ( @@ -76,6 +81,10 @@ _Previous = CompTaskAtDB _Current = CompTaskAtDB _MAX_WAITING_FOR_CLUSTER_TIMEOUT_IN_MIN: Final[int] = 10 +_SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta(seconds=5) +_TASK_NAME_TEMPLATE: Final[ + str +] = "computational-scheduler-{user_id}:{project_id}:{iteration}" @dataclass(frozen=True, slots=True) @@ -134,6 +143,12 @@ class ScheduledPipelineParams: mark_for_cancellation: datetime.datetime | None use_on_demand_clusters: bool + scheduler_task: asyncio.Task | None = None + scheduler_waker: asyncio.Event = field(default_factory=asyncio.Event) + + def wake_up(self) -> None: + self.scheduler_waker.set() + @dataclass class BaseCompScheduler(ABC): @@ -146,6 +161,7 @@ class BaseCompScheduler(ABC): rabbitmq_rpc_client: RabbitMQRPCClient settings: ComputationalBackendSettings service_runtime_heartbeat_interval: datetime.timedelta + redis_client: RedisClientSDK async def run_new_pipeline( self, @@ -178,7 +194,7 @@ async def run_new_pipeline( ) self.scheduled_pipelines[ (user_id, project_id, new_run.iteration) - ] = ScheduledPipelineParams( + ] = pipeline_params = ScheduledPipelineParams( cluster_id=cluster_id, run_metadata=new_run.metadata, use_on_demand_clusters=use_on_demand_clusters, @@ -191,8 +207,8 @@ async def run_new_pipeline( log=f"Project pipeline scheduled using {'on-demand clusters' if use_on_demand_clusters else 'pre-defined clusters'}, starting soon...", log_level=logging.INFO, ) - # ensure the scheduler starts right away - self._wake_up_scheduler_now() + + self._start_scheduling(pipeline_params, user_id, project_id, new_run.iteration) async def stop_pipeline( self, user_id: UserID, project_id: ProjectID, iteration: int | None = None @@ -224,29 +240,76 @@ async def stop_pipeline( (user_id, project_id, selected_iteration) ].mark_for_cancellation = updated_comp_run.cancelled # ensure the scheduler starts right away - self._wake_up_scheduler_now() + self.scheduled_pipelines[ + (user_id, project_id, selected_iteration) + ].wake_up() - async def schedule_all_pipelines(self) -> None: - self.wake_up_event.clear() - # if one of the task throws, the other are NOT cancelled which is what we want - await limited_gather( + def recover_scheduling(self) -> None: + for ( + user_id, + project_id, + iteration, + ), params in self.scheduled_pipelines.items(): + self._start_scheduling(params, user_id, project_id, iteration) + + async def shutdown(self) -> None: + # cancel all current scheduling processes + await asyncio.gather( *( - self._schedule_pipeline( + stop_periodic_task(p.scheduler_task, timeout=3) + for p in self.scheduled_pipelines.values() + if p.scheduler_task + ), + return_exceptions=True, + ) + + def _get_last_iteration(self, user_id: UserID, project_id: ProjectID) -> Iteration: + # if no iteration given find the latest one in the list + possible_iterations = { + it + for u_id, p_id, it in self.scheduled_pipelines + if u_id == user_id and p_id == project_id + } + if not possible_iterations: + msg = f"There are no pipeline scheduled for {user_id}:{project_id}" + raise SchedulerError(msg) + return max(possible_iterations) + + def _start_scheduling( + self, + pipeline_params: ScheduledPipelineParams, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + ) -> None: + async def _exclusive_safe_schedule_pipeline( + *, + user_id: UserID, + project_id: ProjectID, + iteration: Iteration, + pipeline_params: ScheduledPipelineParams, + ) -> None: + with contextlib.suppress(CouldNotAcquireLockError): + await self._schedule_pipeline( user_id=user_id, project_id=project_id, iteration=iteration, pipeline_params=pipeline_params, ) - for ( - user_id, - project_id, - iteration, - ), pipeline_params in self.scheduled_pipelines.items() + + pipeline_params.scheduler_task = start_periodic_task( + functools.partial( + _exclusive_safe_schedule_pipeline, + user_id=user_id, + project_id=project_id, + iteration=iteration, + pipeline_params=pipeline_params, + ), + interval=_SCHEDULER_INTERVAL, + task_name=_TASK_NAME_TEMPLATE.format( + user_id=user_id, project_id=project_id, iteration=iteration ), - reraise=False, - log=_logger, - limit=40, - tasks_group_prefix="computational-scheduled-pipeline", + early_wake_up_event=pipeline_params.scheduler_waker, ) async def _get_pipeline_dag(self, project_id: ProjectID) -> nx.DiGraph: @@ -610,6 +673,22 @@ async def _process_completed_tasks( ) -> None: ... + @staticmethod + def _build_exclusive_lock_key(*args, **kwargs) -> str: + assert args # nosec + return f"{kwargs['user_id']}:{kwargs['project_id']}:{kwargs['iteration']}" + + @staticmethod + def _redis_client_getter(*args, **kwargs) -> RedisClientSDK: + assert kwargs # nosec + zelf = args[0] + assert isinstance(zelf, BaseCompScheduler) # nosec + return zelf.redis_client + + @exclusive( + redis=_redis_client_getter, + lock_key=_build_exclusive_lock_key, + ) async def _schedule_pipeline( self, *, @@ -618,98 +697,99 @@ async def _schedule_pipeline( iteration: PositiveInt, pipeline_params: ScheduledPipelineParams, ) -> None: - _logger.debug( - "checking run of project [%s:%s] for user [%s]", - f"{project_id=}", - f"{iteration=}", - f"{user_id=}", - ) - dag: nx.DiGraph = nx.DiGraph() - try: - dag = await self._get_pipeline_dag(project_id) - # 1. Update our list of tasks with data from backend (state, results) - await self._update_states_from_comp_backend( - user_id, project_id, iteration, dag, pipeline_params=pipeline_params - ) - # 2. Any task following a FAILED task shall be ABORTED - comp_tasks = await self._set_states_following_failed_to_aborted( - project_id, dag - ) - # 3. do we want to stop the pipeline now? - if pipeline_params.mark_for_cancellation: - await self._schedule_tasks_to_stop( - user_id, project_id, comp_tasks, pipeline_params + with log_context( + _logger, + level=logging.INFO, + msg=f"scheduling pipeline {user_id=}:{project_id=}:{iteration=}", + ): + dag: nx.DiGraph = nx.DiGraph() + try: + dag = await self._get_pipeline_dag(project_id) + # 1. Update our list of tasks with data from backend (state, results) + await self._update_states_from_comp_backend( + user_id, project_id, iteration, dag, pipeline_params=pipeline_params ) - else: - # let's get the tasks to schedule then - comp_tasks = await self._schedule_tasks_to_start( - user_id=user_id, - project_id=project_id, - comp_tasks=comp_tasks, - dag=dag, - pipeline_params=pipeline_params, + # 2. Any task following a FAILED task shall be ABORTED + comp_tasks = await self._set_states_following_failed_to_aborted( + project_id, dag + ) + # 3. do we want to stop the pipeline now? + if pipeline_params.mark_for_cancellation: + await self._schedule_tasks_to_stop( + user_id, project_id, comp_tasks, pipeline_params + ) + else: + # let's get the tasks to schedule then + comp_tasks = await self._schedule_tasks_to_start( + user_id=user_id, + project_id=project_id, + comp_tasks=comp_tasks, + dag=dag, + pipeline_params=pipeline_params, + ) + # 4. timeout if waiting for cluster has been there for more than X minutes + comp_tasks = await self._timeout_if_waiting_for_cluster_too_long( + user_id, project_id, comp_tasks + ) + # 5. send a heartbeat + await self._send_running_tasks_heartbeat( + user_id, project_id, iteration, dag ) - # 4. timeout if waiting for cluster has been there for more than X minutes - comp_tasks = await self._timeout_if_waiting_for_cluster_too_long( - user_id, project_id, comp_tasks - ) - # 5. send a heartbeat - await self._send_running_tasks_heartbeat( - user_id, project_id, iteration, dag - ) - # 6. Update the run result - pipeline_result = await self._update_run_result_from_tasks( - user_id, project_id, iteration, comp_tasks - ) + # 6. Update the run result + pipeline_result = await self._update_run_result_from_tasks( + user_id, project_id, iteration, comp_tasks + ) - # 7. Are we done scheduling that pipeline? - if not dag.nodes() or pipeline_result in COMPLETED_STATES: - # there is nothing left, the run is completed, we're done here + # 7. Are we done scheduling that pipeline? + if not dag.nodes() or pipeline_result in COMPLETED_STATES: + # there is nothing left, the run is completed, we're done here + self.scheduled_pipelines.pop((user_id, project_id, iteration), None) + _logger.info( + "pipeline %s scheduling completed with result %s", + f"{project_id=}", + f"{pipeline_result=}", + ) + assert pipeline_params.scheduler_task is not None # nosec + pipeline_params.scheduler_task.cancel() + except PipelineNotFoundError: + _logger.warning( + "pipeline %s does not exist in comp_pipeline table, it will be removed from scheduler", + f"{project_id=}", + ) + await self._set_run_result( + user_id, project_id, iteration, RunningState.ABORTED + ) self.scheduled_pipelines.pop((user_id, project_id, iteration), None) - _logger.info( - "pipeline %s scheduling completed with result %s", + except InvalidPipelineError as exc: + _logger.warning( + "pipeline %s appears to be misconfigured, it will be removed from scheduler. Please check pipeline:\n%s", f"{project_id=}", - f"{pipeline_result=}", + exc, ) - except PipelineNotFoundError: - _logger.warning( - "pipeline %s does not exist in comp_pipeline table, it will be removed from scheduler", - f"{project_id=}", - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.ABORTED - ) - self.scheduled_pipelines.pop((user_id, project_id, iteration), None) - except InvalidPipelineError as exc: - _logger.warning( - "pipeline %s appears to be misconfigured, it will be removed from scheduler. Please check pipeline:\n%s", - f"{project_id=}", - exc, - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.ABORTED - ) - self.scheduled_pipelines.pop((user_id, project_id, iteration), None) - except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError): - _logger.exception( - "Unexpected error while connecting with computational backend, aborting pipeline" - ) - tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( - project_id, dag - ) - comp_tasks_repo = CompTasksRepository(self.db_engine) - await comp_tasks_repo.update_project_tasks_state( - project_id, - [t.node_id for t in tasks.values()], - RunningState.FAILED, - ) - await self._set_run_result( - user_id, project_id, iteration, RunningState.FAILED - ) - self.scheduled_pipelines.pop((user_id, project_id, iteration), None) - except ComputationalBackendNotConnectedError: - _logger.exception("Computational backend is not connected!") + await self._set_run_result( + user_id, project_id, iteration, RunningState.ABORTED + ) + self.scheduled_pipelines.pop((user_id, project_id, iteration), None) + except (DaskClientAcquisisitonError, ClustersKeeperNotAvailableError): + _logger.exception( + "Unexpected error while connecting with computational backend, aborting pipeline" + ) + tasks: dict[NodeIDStr, CompTaskAtDB] = await self._get_pipeline_tasks( + project_id, dag + ) + comp_tasks_repo = CompTasksRepository(self.db_engine) + await comp_tasks_repo.update_project_tasks_state( + project_id, + [t.node_id for t in tasks.values()], + RunningState.FAILED, + ) + await self._set_run_result( + user_id, project_id, iteration, RunningState.FAILED + ) + self.scheduled_pipelines.pop((user_id, project_id, iteration), None) + except ComputationalBackendNotConnectedError: + _logger.exception("Computational backend is not connected!") async def _schedule_tasks_to_stop( self, @@ -910,6 +990,3 @@ async def _timeout_if_waiting_for_cluster_too_long( log_level=logging.ERROR, ) return comp_tasks - - def _wake_up_scheduler_now(self) -> None: - self.wake_up_event.set() diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py index 51fb3b1a3fb..512df1b1712 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_dask_scheduler.py @@ -124,7 +124,7 @@ async def _start_tasks( cluster_id=pipeline_params.cluster_id, tasks={node_id: task.image}, hardware_info=task.hardware_info, - callback=self._wake_up_scheduler_now, + callback=pipeline_params.wake_up, metadata=pipeline_params.run_metadata, ) for node_id, task in scheduled_tasks.items() diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py index f8b648eaf48..4f7812816cc 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_scheduler_factory.py @@ -2,6 +2,8 @@ from fastapi import FastAPI from models_library.clusters import DEFAULT_CLUSTER_ID +from servicelib.logging_utils import log_context +from settings_library.redis import RedisDatabase from ...core.errors import ConfigurationError from ...core.settings import AppSettings @@ -10,10 +12,11 @@ from ..dask_clients_pool import DaskClientsPool from ..db.repositories.comp_runs import CompRunsRepository from ..rabbitmq import get_rabbitmq_client, get_rabbitmq_rpc_client +from ..redis import get_redis_client_manager from ._base_scheduler import BaseCompScheduler, ScheduledPipelineParams from ._dask_scheduler import DaskScheduler -logger = logging.getLogger(__name__) +_logger = logging.getLogger(__name__) async def create_from_db(app: FastAPI) -> BaseCompScheduler: @@ -28,29 +31,32 @@ async def create_from_db(app: FastAPI) -> BaseCompScheduler: filter_by_state=SCHEDULED_STATES ) - logger.debug( + _logger.debug( "Following scheduled comp_runs found still to be scheduled: %s", runs if runs else "NONE", ) - logger.info("Creating Dask-based scheduler...") - app_settings: AppSettings = app.state.settings - return DaskScheduler( - settings=app_settings.DIRECTOR_V2_COMPUTATIONAL_BACKEND, - dask_clients_pool=DaskClientsPool.instance(app), - rabbitmq_client=get_rabbitmq_client(app), - rabbitmq_rpc_client=get_rabbitmq_rpc_client(app), - db_engine=db_engine, - scheduled_pipelines={ - (r.user_id, r.project_uuid, r.iteration): ScheduledPipelineParams( - cluster_id=( - r.cluster_id if r.cluster_id is not None else DEFAULT_CLUSTER_ID - ), - run_metadata=r.metadata, - mark_for_cancellation=r.cancelled, - use_on_demand_clusters=r.use_on_demand_clusters, - ) - for r in runs - }, - service_runtime_heartbeat_interval=app_settings.SERVICE_TRACKING_HEARTBEAT, - ) + with log_context( + _logger, logging.INFO, msg="Creating Dask-based computational scheduler" + ): + app_settings: AppSettings = app.state.settings + return DaskScheduler( + settings=app_settings.DIRECTOR_V2_COMPUTATIONAL_BACKEND, + dask_clients_pool=DaskClientsPool.instance(app), + rabbitmq_client=get_rabbitmq_client(app), + rabbitmq_rpc_client=get_rabbitmq_rpc_client(app), + redis_client=get_redis_client_manager(app).client(RedisDatabase.LOCKS), + db_engine=db_engine, + scheduled_pipelines={ + (r.user_id, r.project_uuid, r.iteration): ScheduledPipelineParams( + cluster_id=( + r.cluster_id if r.cluster_id is not None else DEFAULT_CLUSTER_ID + ), + run_metadata=r.metadata, + mark_for_cancellation=r.cancelled, + use_on_demand_clusters=r.use_on_demand_clusters, + ) + for r in runs + }, + service_runtime_heartbeat_interval=app_settings.SERVICE_TRACKING_HEARTBEAT, + ) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_task.py b/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_task.py deleted file mode 100644 index 989b310687c..00000000000 --- a/services/director-v2/src/simcore_service_director_v2/modules/comp_scheduler/_task.py +++ /dev/null @@ -1,51 +0,0 @@ -import datetime -import logging -from collections.abc import Callable, Coroutine -from typing import Any, Final - -from fastapi import FastAPI -from servicelib.background_task import start_periodic_task, stop_periodic_task -from servicelib.logging_utils import log_context -from servicelib.redis import RedisClientsManager -from servicelib.redis_utils import exclusive -from settings_library.redis import RedisDatabase - -from ..._meta import APP_NAME -from . import _scheduler_factory - -_logger = logging.getLogger(__name__) - -_COMPUTATIONAL_SCHEDULER_INTERVAL: Final[datetime.timedelta] = datetime.timedelta( - seconds=5 -) -_TASK_NAME: Final[str] = "computational services scheduler" - - -def on_app_startup(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: - async def start_scheduler() -> None: - with log_context( - _logger, level=logging.INFO, msg="starting computational scheduler" - ): - redis_clients_manager: RedisClientsManager = app.state.redis_clients_manager - lock_key = f"{APP_NAME}:computational_scheduler" - app.state.scheduler = scheduler = await _scheduler_factory.create_from_db( - app - ) - app.state.computational_scheduler_task = start_periodic_task( - exclusive( - redis_clients_manager.client(RedisDatabase.LOCKS), - lock_key=lock_key, - )(scheduler.schedule_all_pipelines), - interval=_COMPUTATIONAL_SCHEDULER_INTERVAL, - task_name=_TASK_NAME, - early_wake_up_event=scheduler.wake_up_event, - ) - - return start_scheduler - - -def on_app_shutdown(app: FastAPI) -> Callable[[], Coroutine[Any, Any, None]]: - async def stop_scheduler() -> None: - await stop_periodic_task(app.state.computational_scheduler_task) - - return stop_scheduler diff --git a/services/director-v2/src/simcore_service_director_v2/modules/director_v0.py b/services/director-v2/src/simcore_service_director_v2/modules/director_v0.py index 0bc8c799dcb..3229ddc642a 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/director_v0.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/director_v0.py @@ -1,7 +1,4 @@ -""" Module that takes care of communications with director v0 service - - -""" +"""Module that takes care of communications with director v0 service""" import logging import urllib.parse @@ -20,7 +17,9 @@ from models_library.service_settings_labels import SimcoreServiceLabels from models_library.services import ServiceKey, ServiceKeyVersion, ServiceVersion from models_library.users import UserID +from servicelib.fastapi.tracing import setup_httpx_client_tracing from servicelib.logging_utils import log_decorator +from settings_library.tracing import TracingSettings from ..core.settings import DirectorV0Settings from ..utils.client_decorators import handle_errors, handle_retry @@ -31,25 +30,34 @@ # Module's setup logic --------------------------------------------- -def setup(app: FastAPI, settings: DirectorV0Settings | None): - if not settings: - settings = DirectorV0Settings() +def setup( + app: FastAPI, + director_v0_settings: DirectorV0Settings | None, + tracing_settings: TracingSettings | None, +): + if not director_v0_settings: + director_v0_settings = DirectorV0Settings() def on_startup() -> None: + client = httpx.AsyncClient( + base_url=f"{director_v0_settings.endpoint}", + timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, + ) + if tracing_settings: + setup_httpx_client_tracing(client=client) DirectorV0Client.create( app, - client=httpx.AsyncClient( - base_url=f"{settings.endpoint}", - timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, - ), + client=client, + ) + logger.debug( + "created client for director-v0: %s", director_v0_settings.endpoint ) - logger.debug("created client for director-v0: %s", settings.endpoint) async def on_shutdown() -> None: client = DirectorV0Client.instance(app).client await client.aclose() del client - logger.debug("delete client for director-v0: %s", settings.endpoint) + logger.debug("delete client for director-v0: %s", director_v0_settings.endpoint) app.add_event_handler("startup", on_startup) app.add_event_handler("shutdown", on_shutdown) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_services.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_services.py index d572a9f23fb..acbc08849a6 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_services.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_services.py @@ -8,19 +8,24 @@ import httpx from fastapi import FastAPI +from servicelib.fastapi.tracing import setup_httpx_client_tracing +from settings_library.tracing import TracingSettings from ..utils.client_decorators import handle_errors, handle_retry logger = logging.getLogger(__name__) -def setup(app: FastAPI) -> None: +def setup(app: FastAPI, tracing_settings: TracingSettings | None) -> None: def on_startup() -> None: + client = httpx.AsyncClient( + timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT + ) + if tracing_settings: + setup_httpx_client_tracing(client=client) ServicesClient.create( app, - client=httpx.AsyncClient( - timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT - ), + client=client, ) async def on_shutdown() -> None: diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/api_client/_thin.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/api_client/_thin.py index 241f32fe70e..feba415ecd0 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/api_client/_thin.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/api_client/_thin.py @@ -12,6 +12,7 @@ expect_status, retry_on_errors, ) +from settings_library.tracing import TracingSettings from ....core.dynamic_services_settings.scheduler import ( DynamicServicesSchedulerSettings, @@ -31,6 +32,9 @@ def __init__(self, app: FastAPI): scheduler_settings: DynamicServicesSchedulerSettings = ( app.state.settings.DYNAMIC_SERVICES.DYNAMIC_SCHEDULER ) + tracing_settings: TracingSettings | None = ( + app.state.settings.DIRECTOR_V2_TRACING + ) # timeouts self._health_request_timeout = Timeout(1.0, connect=1.0) @@ -53,6 +57,7 @@ def __init__(self, app: FastAPI): scheduler_settings.DYNAMIC_SIDECAR_API_REQUEST_TIMEOUT, connect=scheduler_settings.DYNAMIC_SIDECAR_API_CONNECT_TIMEOUT, ), + tracing_settings=tracing_settings, ) def _get_url( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index b788e455cf3..44e2ff575e7 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -175,6 +175,11 @@ def _get_environment_variables( "S3_SECRET_KEY": r_clone_settings.R_CLONE_S3.S3_SECRET_KEY, "SC_BOOT_MODE": f"{app_settings.DYNAMIC_SERVICES.DYNAMIC_SIDECAR.DYNAMIC_SIDECAR_SC_BOOT_MODE}", "SSL_CERT_FILE": app_settings.DIRECTOR_V2_SELF_SIGNED_SSL_FILENAME, + "DYNAMIC_SIDECAR_TRACING": ( + app_settings.DIRECTOR_V2_TRACING.json() + if app_settings.DIRECTOR_V2_TRACING + else "null" + ), # For background info on this special env-var above, see # - https://stackoverflow.com/questions/31448854/how-to-force-requests-use-the-certificates-on-my-ubuntu-system#comment78596389_37447847 "SIMCORE_HOST_NAME": scheduler_data.service_name, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/redis.py b/services/director-v2/src/simcore_service_director_v2/modules/redis.py index e7da01afef7..273061cb188 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/redis.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/redis.py @@ -1,3 +1,5 @@ +from typing import cast + from fastapi import FastAPI from servicelib.redis import RedisClientsManager, RedisManagerDBConfig from settings_library.redis import RedisDatabase @@ -29,3 +31,7 @@ async def on_shutdown() -> None: app.add_event_handler("startup", on_startup) app.add_event_handler("shutdown", on_shutdown) + + +def get_redis_client_manager(app: FastAPI) -> RedisClientsManager: + return cast(RedisClientsManager, app.state.redis_clients_manager) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py b/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py index 2c546ea3d84..4eaf3ba2016 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/resource_usage_tracker_client.py @@ -24,6 +24,7 @@ from models_library.services import ServiceKey, ServiceVersion from models_library.wallets import WalletID from pydantic import parse_obj_as +from servicelib.fastapi.tracing import setup_httpx_client_tracing from ..core.errors import PricingPlanUnitNotFoundError from ..core.settings import AppSettings @@ -41,6 +42,8 @@ def create(cls, settings: AppSettings) -> "ResourceUsageTrackerClient": client = httpx.AsyncClient( base_url=settings.DIRECTOR_V2_RESOURCE_USAGE_TRACKER.api_base_url, ) + if settings.DIRECTOR_V2_TRACING: + setup_httpx_client_tracing(client=client) exit_stack = contextlib.AsyncExitStack() return cls(client=client, exit_stack=exit_stack) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/storage.py b/services/director-v2/src/simcore_service_director_v2/modules/storage.py index 98e18845333..c3e9cd21576 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/storage.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/storage.py @@ -8,9 +8,11 @@ import httpx from fastapi import FastAPI, HTTPException from models_library.users import UserID +from servicelib.fastapi.tracing import setup_httpx_client_tracing from servicelib.logging_utils import log_decorator from settings_library.s3 import S3Settings from settings_library.storage import StorageSettings +from settings_library.tracing import TracingSettings # Module's business logic --------------------------------------------- from starlette import status @@ -23,19 +25,27 @@ # Module's setup logic --------------------------------------------- -def setup(app: FastAPI, settings: StorageSettings): - if not settings: - settings = StorageSettings() +def setup( + app: FastAPI, + storage_settings: StorageSettings | None, + tracing_settings: TracingSettings | None, +): + + if not storage_settings: + storage_settings = StorageSettings() def on_startup() -> None: + client = httpx.AsyncClient( + base_url=f"{storage_settings.api_base_url}", + timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, + ) + if tracing_settings: + setup_httpx_client_tracing(client=client) StorageClient.create( app, - client=httpx.AsyncClient( - base_url=f"{settings.api_base_url}", - timeout=app.state.settings.CLIENT_REQUEST.HTTP_CLIENT_REQUEST_TOTAL_TIMEOUT, - ), + client=client, ) - logger.debug("created client for storage: %s", settings.api_base_url) + logger.debug("created client for storage: %s", storage_settings.api_base_url) async def on_shutdown() -> None: client = StorageClient.instance(app).client diff --git a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py index 720e7d0c3e1..ec955f1e167 100644 --- a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py +++ b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py @@ -64,6 +64,7 @@ from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from settings_library.storage import StorageSettings +from settings_library.tracing import TracingSettings from simcore_postgres_database.models.comp_pipeline import comp_pipeline from simcore_postgres_database.models.comp_tasks import comp_tasks from simcore_postgres_database.models.projects_networks import projects_networks @@ -340,8 +341,14 @@ async def patch_storage_setup( original_setup = dv2_modules_storage.setup - def setup(app: FastAPI, settings: StorageSettings) -> None: - original_setup(app, local_settings) + def setup( + app: FastAPI, + storage_settings: StorageSettings, + tracing_settings: TracingSettings | None, + ) -> None: + original_setup( + app, storage_settings=local_settings, tracing_settings=tracing_settings + ) mocker.patch("simcore_service_director_v2.modules.storage.setup", side_effect=setup) diff --git a/services/director-v2/tests/unit/_helpers.py b/services/director-v2/tests/unit/_helpers.py index 2654c63a3e1..779d6cdd117 100644 --- a/services/director-v2/tests/unit/_helpers.py +++ b/services/director-v2/tests/unit/_helpers.py @@ -1,4 +1,3 @@ -import asyncio from dataclasses import dataclass from typing import Any @@ -11,9 +10,6 @@ from simcore_service_director_v2.models.comp_pipelines import CompPipelineAtDB from simcore_service_director_v2.models.comp_runs import CompRunsAtDB from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB -from simcore_service_director_v2.modules.comp_scheduler._base_scheduler import ( - BaseCompScheduler, -) @dataclass @@ -28,13 +24,6 @@ class RunningProject(PublishedProject): runs: CompRunsAtDB -async def trigger_comp_scheduler(scheduler: BaseCompScheduler) -> None: - # trigger the scheduler - scheduler._wake_up_scheduler_now() # pylint: disable=protected-access # noqa: SLF001 - # let the scheduler be actually triggered - await asyncio.sleep(1) - - async def set_comp_task_state( aiopg_engine: aiopg.sa.engine.Engine, node_id: str, state: StateType ) -> None: diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py index 4a73b3e7210..f4870a140c4 100644 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py +++ b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py @@ -37,6 +37,7 @@ "DY_SIDECAR_USER_SERVICES_HAVE_INTERNET_ACCESS", "DYNAMIC_SIDECAR_COMPOSE_NAMESPACE", "DYNAMIC_SIDECAR_LOG_LEVEL", + "DYNAMIC_SIDECAR_TRACING", "NODE_PORTS_400_REQUEST_TIMEOUT_ATTEMPTS", "POSTGRES_DB", "POSTGRES_ENDPOINT", diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index f9e5ff33c4b..1df1ae09d39 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -45,6 +45,7 @@ from pytest_mock.plugin import MockerFixture from pytest_simcore.helpers.typing_env import EnvVarsDict from servicelib.rabbitmq import RabbitMQClient +from servicelib.redis import CouldNotAcquireLockError from settings_library.rabbit import RabbitSettings from settings_library.redis import RedisSettings from simcore_postgres_database.models.comp_runs import comp_runs @@ -66,8 +67,12 @@ from simcore_service_director_v2.models.comp_runs import CompRunsAtDB, RunMetadataDict from simcore_service_director_v2.models.comp_tasks import CompTaskAtDB, Image from simcore_service_director_v2.models.dask_subsystem import DaskClientTaskState -from simcore_service_director_v2.modules.comp_scheduler._base_scheduler import ( +from simcore_service_director_v2.modules.comp_scheduler import ( BaseCompScheduler, + get_scheduler, +) +from simcore_service_director_v2.modules.comp_scheduler._base_scheduler import ( + ScheduledPipelineParams, ) from simcore_service_director_v2.modules.comp_scheduler._dask_scheduler import ( DaskScheduler, @@ -155,8 +160,38 @@ async def _assert_comp_tasks_db( ), f"{expected_progress=}, found: {[t.progress for t in tasks]}" -async def run_comp_scheduler(scheduler: BaseCompScheduler) -> None: - await scheduler.schedule_all_pipelines() +async def schedule_all_pipelines(scheduler: BaseCompScheduler) -> None: + # NOTE: we take a copy of the pipelines, as this could change quickly if there are + # misconfigured pipelines that would be removed from the scheduler + # NOTE: we simulate multiple dv-2 replicas by running several times + # the same pipeline scheduling + local_pipelines = deepcopy(scheduler.scheduled_pipelines) + results = await asyncio.gather( + *( + scheduler._schedule_pipeline( # noqa: SLF001 + user_id=user_id, + project_id=project_id, + iteration=iteration, + pipeline_params=params, + ) + for _ in range(3) + for ( + user_id, + project_id, + iteration, + ), params in local_pipelines.items() + ), + return_exceptions=True, + ) + # we should have exceptions 2/3 of the time + could_not_acquire_lock_count = sum( + isinstance(r, CouldNotAcquireLockError) for r in results + ) + total_results_count = len(results) + + # Check if 2/3 of the results are CouldNotAcquireLockError + # checks that scheduling is done exclusively + assert could_not_acquire_lock_count == (2 / 3) * total_results_count @pytest.fixture @@ -185,11 +220,11 @@ def minimal_dask_scheduler_config( def scheduler( minimal_dask_scheduler_config: None, aiopg_engine: aiopg.sa.engine.Engine, - # dask_spec_local_cluster: SpecCluster, minimal_app: FastAPI, ) -> BaseCompScheduler: - assert minimal_app.state.scheduler is not None - return minimal_app.state.scheduler + scheduler = get_scheduler(minimal_app) + assert scheduler is not None + return scheduler @pytest.fixture @@ -220,16 +255,21 @@ def mocked_clean_task_output_fct(mocker: MockerFixture) -> mock.MagicMock: @pytest.fixture -def with_disabled_scheduler_task(mocker: MockerFixture) -> None: +def with_disabled_auto_scheduling(mocker: MockerFixture) -> mock.MagicMock: """disables the scheduler task, note that it needs to be triggered manually then""" - mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._task.start_periodic_task", - autospec=True, - ) - mocker.patch( - "simcore_service_director_v2.modules.comp_scheduler._task.stop_periodic_task", + def _fake_starter( + self: BaseCompScheduler, + pipeline_params: ScheduledPipelineParams, + *args, + **kwargs, + ) -> None: + pipeline_params.scheduler_task = mocker.MagicMock() + + return mocker.patch( + "simcore_service_director_v2.modules.comp_scheduler._base_scheduler.BaseCompScheduler._start_scheduling", autospec=True, + side_effect=_fake_starter, ) @@ -258,7 +298,7 @@ async def test_scheduler_gracefully_starts_and_stops( minimal_app: FastAPI, ): # check it started correctly - assert minimal_app.state.computational_scheduler_task is not None + assert get_scheduler(minimal_app) is not None @pytest.mark.parametrize( @@ -287,7 +327,7 @@ def test_scheduler_raises_exception_for_missing_dependencies( async def test_empty_pipeline_is_not_scheduled( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, scheduler: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], @@ -319,9 +359,6 @@ async def test_empty_pipeline_is_not_scheduled( use_on_demand_clusters=False, ) assert len(scheduler.scheduled_pipelines) == 0 - assert ( - scheduler.wake_up_event.is_set() is False - ), "the scheduler was woken up on an empty pipeline!" # check the database is empty async with aiopg_engine.acquire() as conn: result = await conn.scalar( @@ -334,7 +371,7 @@ async def test_empty_pipeline_is_not_scheduled( async def test_misconfigured_pipeline_is_not_scheduled( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, scheduler: BaseCompScheduler, registered_user: Callable[..., dict[str, Any]], project: Callable[..., Awaitable[ProjectAtDB]], @@ -361,9 +398,6 @@ async def test_misconfigured_pipeline_is_not_scheduled( use_on_demand_clusters=False, ) assert len(scheduler.scheduled_pipelines) == 1 - assert ( - scheduler.wake_up_event.is_set() is True - ), "the scheduler was NOT woken up on the scheduled pipeline!" for (u_id, p_id, it), params in scheduler.scheduled_pipelines.items(): assert u_id == user["id"] assert p_id == sleepers_project.uuid @@ -380,7 +414,7 @@ async def test_misconfigured_pipeline_is_not_scheduled( run_entry = CompRunsAtDB.parse_obj(await result.first()) assert run_entry.result == RunningState.PUBLISHED # let the scheduler kick in - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # check the scheduled pipelines is again empty since it's misconfigured assert len(scheduler.scheduled_pipelines) == 0 # check the database entry is correctly updated @@ -412,9 +446,6 @@ async def _assert_start_pipeline( use_on_demand_clusters=False, ) assert len(scheduler.scheduled_pipelines) == 1, "the pipeline is not scheduled!" - assert ( - scheduler.wake_up_event.is_set() is True - ), "the scheduler was NOT woken up on the scheduled pipeline!" for (u_id, p_id, it), params in scheduler.scheduled_pipelines.items(): assert u_id == published_project.project.prj_owner assert p_id == published_project.project.uuid @@ -434,7 +465,7 @@ async def _assert_start_pipeline( return exp_published_tasks -async def _assert_schedule_pipeline_PENDING( +async def _assert_schedule_pipeline_PENDING( # noqa: N802 aiopg_engine, published_project: PublishedProject, published_tasks: list[CompTaskAtDB], @@ -452,7 +483,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] return [DaskClientTaskState.PENDING for job_id in job_ids] mocked_dask_client.get_tasks_status.side_effect = _return_tasks_pending - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) _assert_dask_client_correctly_initialized(mocked_dask_client, scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PUBLISHED) await _assert_comp_tasks_db( @@ -471,6 +502,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] expected_progress=None, # since we bypass the API entrypoint this is correct ) # tasks were send to the backend + assert published_project.project.prj_owner is not None mocked_dask_client.send_computation_tasks.assert_has_calls( calls=[ mock.call( @@ -478,7 +510,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] project_id=published_project.project.uuid, cluster_id=DEFAULT_CLUSTER_ID, tasks={f"{p.node_id}": p.image}, - callback=scheduler._wake_up_scheduler_now, # noqa: SLF001 + callback=mock.ANY, metadata=mock.ANY, hardware_info=mock.ANY, ) @@ -490,7 +522,7 @@ async def _return_tasks_pending(job_ids: list[str]) -> list[DaskClientTaskState] mocked_dask_client.get_tasks_status.assert_not_called() mocked_dask_client.get_task_result.assert_not_called() # there is a second run of the scheduler to move comp_runs to pending, the rest does not change - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) await _assert_comp_tasks_db( aiopg_engine, @@ -616,7 +648,7 @@ async def _trigger_progress_event( @pytest.mark.acceptance_test() async def test_proper_pipeline_is_scheduled( # noqa: PLR0915 - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -661,7 +693,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_tasks_status.side_effect = _return_1st_task_running - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.PENDING) await _assert_comp_tasks_db( @@ -707,7 +739,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta node_id=exp_started_task.node_id, ) - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # comp_run, the comp_task switch to STARTED await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) await _assert_comp_tasks_db( @@ -771,7 +803,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: return TaskOutputData.parse_obj({"out_1": None, "out_2": 45}) mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) await _assert_comp_tasks_db( aiopg_engine, @@ -819,7 +851,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: tasks={ f"{next_pending_task.node_id}": next_pending_task.image, }, - callback=scheduler._wake_up_scheduler_now, # noqa: SLF001 + callback=mock.ANY, metadata=mock.ANY, hardware_info=mock.ANY, ) @@ -866,7 +898,7 @@ async def _return_2nd_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) await _assert_comp_tasks_db( aiopg_engine, @@ -908,7 +940,7 @@ async def _return_2nd_task_failed(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = _return_2nd_task_failed mocked_dask_client.get_task_result.side_effect = None - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.STARTED) await _assert_comp_tasks_db( aiopg_engine, @@ -955,7 +987,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta mocked_dask_client.get_task_result.side_effect = _return_random_task_result # trigger the scheduler, it should switch to FAILED, as we are done - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) await _assert_comp_tasks_db( @@ -991,7 +1023,7 @@ async def _return_3rd_task_success(job_ids: list[str]) -> list[DaskClientTaskSta async def test_task_progress_triggers( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -1054,7 +1086,7 @@ async def test_task_progress_triggers( ], ) async def test_handling_of_disconnected_dask_scheduler( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, @@ -1098,7 +1130,7 @@ async def test_handling_of_disconnected_dask_scheduler( project_id=published_project.project.uuid, ) # we ensure the scheduler was run - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # after this step the tasks are marked as ABORTED await _assert_comp_tasks_db( aiopg_engine, @@ -1112,7 +1144,7 @@ async def test_handling_of_disconnected_dask_scheduler( expected_progress=1, ) # then we have another scheduler run - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # now the run should be ABORTED await _assert_comp_run_db(aiopg_engine, published_project, RunningState.ABORTED) @@ -1197,7 +1229,7 @@ class RebootState: ], ) async def test_handling_scheduling_after_reboot( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project: RunningProject, @@ -1222,7 +1254,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: mocked_dask_client.get_task_result.side_effect = mocked_get_task_result - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # the status will be called once for all RUNNING tasks mocked_dask_client.get_tasks_status.assert_called_once() if reboot_state.expected_run_state in COMPLETED_STATES: @@ -1279,7 +1311,7 @@ async def mocked_get_task_result(_job_id: str) -> TaskOutputData: async def test_handling_cancellation_of_jobs_after_reboot( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, mocked_dask_client: mock.MagicMock, aiopg_engine: aiopg.sa.engine.Engine, running_project_mark_for_cancellation: RunningProject, @@ -1309,7 +1341,7 @@ async def mocked_get_tasks_status(job_ids: list[str]) -> list[DaskClientTaskStat mocked_dask_client.get_tasks_status.side_effect = mocked_get_tasks_status # Running the scheduler, should actually cancel the run now - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) mocked_dask_client.abort_computation_task.assert_called() assert mocked_dask_client.abort_computation_task.call_count == len( [ @@ -1346,7 +1378,7 @@ async def _return_random_task_result(job_id) -> TaskOutputData: raise TaskCancelledError mocked_dask_client.get_task_result.side_effect = _return_random_task_result - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) # now should be stopped await _assert_comp_tasks_db( aiopg_engine, @@ -1373,7 +1405,7 @@ def with_fast_service_heartbeat_s(monkeypatch: pytest.MonkeyPatch) -> int: async def test_running_pipeline_triggers_heartbeat( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, with_fast_service_heartbeat_s: int, mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, @@ -1420,7 +1452,7 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta project_id=exp_started_task.project_id, node_id=exp_started_task.node_id, ) - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, @@ -1432,8 +1464,8 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 3. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await run_comp_scheduler(scheduler) - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) + await schedule_all_pipelines(scheduler) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -1444,8 +1476,8 @@ async def _return_1st_task_running(job_ids: list[str]) -> list[DaskClientTaskSta # ------------------------------------------------------------------------------- # 4. wait a bit and run again we should get another heartbeat, but only one! await asyncio.sleep(with_fast_service_heartbeat_s + 1) - await run_comp_scheduler(scheduler) - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) + await schedule_all_pipelines(scheduler) messages = await _assert_message_received( resource_tracking_rabbit_client_parser, 1, @@ -1463,7 +1495,7 @@ async def mocked_get_or_create_cluster(mocker: MockerFixture) -> mock.Mock: async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, @@ -1501,7 +1533,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( published_project.tasks[1], published_project.tasks[3], ] - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() @@ -1516,7 +1548,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( expected_progress=None, ) # again will trigger the same response - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() @@ -1537,7 +1569,7 @@ async def test_pipeline_with_on_demand_cluster_with_not_ready_backend_waits( [ClustersKeeperNotAvailableError], ) async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( - with_disabled_scheduler_task: None, + with_disabled_auto_scheduling: None, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, published_project: PublishedProject, @@ -1570,7 +1602,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( published_project.tasks[1], published_project.tasks[3], ] - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) mocked_get_or_create_cluster.assert_called() assert mocked_get_or_create_cluster.call_count == 1 mocked_get_or_create_cluster.reset_mock() @@ -1583,7 +1615,7 @@ async def test_pipeline_with_on_demand_cluster_with_no_clusters_keeper_fails( expected_progress=1.0, ) # again will not re-trigger the call to clusters-keeper - await run_comp_scheduler(scheduler) + await schedule_all_pipelines(scheduler) mocked_get_or_create_cluster.assert_not_called() await _assert_comp_run_db(aiopg_engine, published_project, RunningState.FAILED) await _assert_comp_tasks_db( diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_service_specs.py b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_service_specs.py index a05e4cd84da..ab835039262 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_service_specs.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_dynamic_sidecar_docker_service_specs.py @@ -243,6 +243,7 @@ def expected_dynamic_sidecar_spec( "FORWARD_ENV_DISPLAY": ":0", "NODE_PORTS_400_REQUEST_TIMEOUT_ATTEMPTS": "3", "DYNAMIC_SIDECAR_LOG_LEVEL": "DEBUG", + "DYNAMIC_SIDECAR_TRACING": "null", "DY_DEPLOYMENT_REGISTRY_SETTINGS": ( '{"REGISTRY_AUTH": false, "REGISTRY_PATH": null, ' '"REGISTRY_URL": "foo.bar.com", "REGISTRY_USER": ' diff --git a/services/docker-compose-ops.yml b/services/docker-compose-ops.yml index 9beacf76c34..c80befe2316 100644 --- a/services/docker-compose-ops.yml +++ b/services/docker-compose-ops.yml @@ -111,6 +111,7 @@ services: - "4318:4318" # OTLP HTTP receiver networks: - simcore_default + - interactive_services_subnet environment: TRACING_OPENTELEMETRY_COLLECTOR_BATCH_SIZE: ${TRACING_OPENTELEMETRY_COLLECTOR_BATCH_SIZE} TRACING_OPENTELEMETRY_COLLECTOR_SAMPLING_PERCENTAGE: ${TRACING_OPENTELEMETRY_COLLECTOR_SAMPLING_PERCENTAGE} diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 35dd3782609..2f039977889 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1053,6 +1053,10 @@ services: RABBIT_USER: ${RABBIT_USER} RABBIT_SECURE: ${RABBIT_SECURE} + AGENT_TRACING: ${AGENT_TRACING} + TRACING_OPENTELEMETRY_COLLECTOR_ENDPOINT: ${TRACING_OPENTELEMETRY_COLLECTOR_ENDPOINT} + TRACING_OPENTELEMETRY_COLLECTOR_PORT: ${TRACING_OPENTELEMETRY_COLLECTOR_PORT} + dask-sidecar: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest} init: true diff --git a/services/dynamic-scheduler/requirements/_base.txt b/services/dynamic-scheduler/requirements/_base.txt index cb2cc603fb0..3462f0ba65b 100644 --- a/services/dynamic-scheduler/requirements/_base.txt +++ b/services/dynamic-scheduler/requirements/_base.txt @@ -40,9 +40,7 @@ arrow==1.3.0 asgiref==3.8.1 # via opentelemetry-instrumentation-asgi async-timeout==4.0.3 - # via - # asyncpg - # redis + # via asyncpg asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -172,6 +170,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -191,6 +190,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.48b0 @@ -199,6 +199,8 @@ opentelemetry-instrumentation-asyncpg==0.48b0 # via -r requirements/../../../packages/postgres-database/requirements/_base.in opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.48b0 @@ -218,6 +220,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -225,6 +228,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/director_v2/_thin_client.py b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/director_v2/_thin_client.py index e823216576b..68aae3b97f3 100644 --- a/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/director_v2/_thin_client.py +++ b/services/dynamic-scheduler/src/simcore_service_dynamic_scheduler/services/director_v2/_thin_client.py @@ -37,6 +37,7 @@ def __init__(self, app: FastAPI) -> None: DEFAULT_LEGACY_WB_TO_DV2_HTTP_REQUESTS_TIMEOUT_S ), extra_allowed_method_names={"attach_lifespan_to"}, + tracing_settings=settings.DYNAMIC_SCHEDULER_TRACING, ) @retry_on_errors() diff --git a/services/dynamic-sidecar/requirements/_base.txt b/services/dynamic-sidecar/requirements/_base.txt index 40c32b696ec..559440b03f0 100644 --- a/services/dynamic-sidecar/requirements/_base.txt +++ b/services/dynamic-sidecar/requirements/_base.txt @@ -76,7 +76,6 @@ async-timeout==4.0.3 # via # aiopg # asyncpg - # redis asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -243,6 +242,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -266,6 +266,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-aiopg==0.48b0 @@ -280,6 +281,8 @@ opentelemetry-instrumentation-dbapi==0.48b0 # via opentelemetry-instrumentation-aiopg opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via # -r requirements/../../../packages/service-library/requirements/_base.in @@ -305,6 +308,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-dbapi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -312,6 +316,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py index ce5f48a8b21..59547f40119 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py @@ -9,6 +9,7 @@ get_common_oas_options, override_fastapi_openapi_method, ) +from servicelib.fastapi.tracing import setup_tracing from servicelib.logging_utils import config_all_loggers from simcore_sdk.node_ports_common.exceptions import NodeNotFound @@ -190,6 +191,9 @@ def create_app(): if application_settings.are_prometheus_metrics_enabled: setup_prometheus_metrics(app) + if application_settings.DYNAMIC_SIDECAR_TRACING: + setup_tracing(app, application_settings.DYNAMIC_SIDECAR_TRACING, PROJECT_NAME) + # ERROR HANDLERS ------------ app.add_exception_handler(NodeNotFound, node_not_found_error_handler) app.add_exception_handler(BaseDynamicSidecarError, http_error_handler) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py index 214d51ad11b..024465913bd 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py @@ -23,6 +23,7 @@ from settings_library.resource_usage_tracker import ( DEFAULT_RESOURCE_USAGE_HEARTBEAT_INTERVAL, ) +from settings_library.tracing import TracingSettings from settings_library.utils_logging import MixinLoggingSettings @@ -167,6 +168,10 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): SYSTEM_MONITOR_SETTINGS: SystemMonitorSettings = Field(auto_default_from_env=True) + DYNAMIC_SIDECAR_TRACING: TracingSettings | None = Field( + auto_default_from_env=True, description="settings for opentelemetry tracing" + ) + @property def are_prometheus_metrics_enabled(self) -> bool: return self.DY_SIDECAR_CALLBACKS_MAPPING.metrics is not None diff --git a/services/dynamic-sidecar/tests/conftest.py b/services/dynamic-sidecar/tests/conftest.py index 8b4760b26dd..a9ec557c6dc 100644 --- a/services/dynamic-sidecar/tests/conftest.py +++ b/services/dynamic-sidecar/tests/conftest.py @@ -199,6 +199,7 @@ def base_mock_envs( "REGISTRY_SSL": "false", } ), + "DYNAMIC_SIDECAR_TRACING": "null", } diff --git a/services/efs-guardian/requirements/_base.txt b/services/efs-guardian/requirements/_base.txt index 26a626f01db..8e46a857186 100644 --- a/services/efs-guardian/requirements/_base.txt +++ b/services/efs-guardian/requirements/_base.txt @@ -69,9 +69,7 @@ arrow==1.3.0 asgiref==3.8.1 # via opentelemetry-instrumentation-asgi async-timeout==4.0.3 - # via - # asyncpg - # redis + # via asyncpg asyncpg==0.29.0 # via sqlalchemy attrs==24.2.0 @@ -238,6 +236,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-propagator-aws-xray @@ -261,6 +260,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.48b0 @@ -271,6 +271,8 @@ opentelemetry-instrumentation-botocore==0.48b0 # via -r requirements/../../../packages/aws-library/requirements/_base.in opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via # -r requirements/../../../packages/aws-library/requirements/../../../packages/service-library/requirements/_base.in @@ -298,6 +300,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -305,6 +308,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.7 # via diff --git a/services/efs-guardian/requirements/_test.txt b/services/efs-guardian/requirements/_test.txt index 8bdc1ec8ebf..f188e8071de 100644 --- a/services/efs-guardian/requirements/_test.txt +++ b/services/efs-guardian/requirements/_test.txt @@ -23,10 +23,6 @@ anyio==4.6.2.post1 # httpx asgi-lifespan==2.1.0 # via -r requirements/_test.in -async-timeout==4.0.3 - # via - # -c requirements/_base.txt - # redis attrs==24.2.0 # via # -c requirements/_base.txt diff --git a/services/invitations/requirements/_base.txt b/services/invitations/requirements/_base.txt index c6e253b5e6a..732bac0872f 100644 --- a/services/invitations/requirements/_base.txt +++ b/services/invitations/requirements/_base.txt @@ -35,8 +35,6 @@ arrow==1.3.0 # -r requirements/../../../packages/service-library/requirements/_base.in asgiref==3.8.1 # via opentelemetry-instrumentation-asgi -async-timeout==4.0.3 - # via redis attrs==23.2.0 # via # aiohttp @@ -153,6 +151,7 @@ opentelemetry-api==1.26.0 # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -171,12 +170,15 @@ opentelemetry-instrumentation==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.47b0 # via opentelemetry-instrumentation-fastapi opentelemetry-instrumentation-fastapi==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.47b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.47b0 @@ -195,6 +197,7 @@ opentelemetry-semantic-conventions==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -202,6 +205,7 @@ opentelemetry-util-http==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/payments/requirements/_base.txt b/services/payments/requirements/_base.txt index 88aae6375d4..c38b7880c1d 100644 --- a/services/payments/requirements/_base.txt +++ b/services/payments/requirements/_base.txt @@ -43,9 +43,7 @@ arrow==1.3.0 asgiref==3.8.1 # via opentelemetry-instrumentation-asgi async-timeout==4.0.3 - # via - # asyncpg - # redis + # via asyncpg asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -201,6 +199,7 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -220,6 +219,7 @@ opentelemetry-instrumentation==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.48b0 @@ -228,6 +228,8 @@ opentelemetry-instrumentation-asyncpg==0.48b0 # via -r requirements/../../../packages/postgres-database/requirements/_base.in opentelemetry-instrumentation-fastapi==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.48b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.48b0 # via -r requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.48b0 @@ -247,6 +249,7 @@ opentelemetry-semantic-conventions==0.48b0 # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -254,6 +257,7 @@ opentelemetry-util-http==0.48b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.6 # via diff --git a/services/payments/src/simcore_service_payments/services/payments_gateway.py b/services/payments/src/simcore_service_payments/services/payments_gateway.py index 0b1097492c6..44c54b6108d 100644 --- a/services/payments/src/simcore_service_payments/services/payments_gateway.py +++ b/services/payments/src/simcore_service_payments/services/payments_gateway.py @@ -25,6 +25,7 @@ HealthMixinMixin, ) from servicelib.fastapi.httpx_utils import to_curl_command +from servicelib.fastapi.tracing import setup_httpx_client_tracing from simcore_service_payments.models.schemas.acknowledgements import ( AckPaymentWithPaymentMethod, ) @@ -216,5 +217,7 @@ def setup_payments_gateway(app: FastAPI): secret=settings.PAYMENTS_GATEWAY_API_SECRET.get_secret_value() ), ) + if settings.PAYMENTS_TRACING: + setup_httpx_client_tracing(api.client) api.attach_lifespan_to(app) api.set_to_app_state(app) diff --git a/services/payments/src/simcore_service_payments/services/resource_usage_tracker.py b/services/payments/src/simcore_service_payments/services/resource_usage_tracker.py index e66f650fe7b..3f114540f99 100644 --- a/services/payments/src/simcore_service_payments/services/resource_usage_tracker.py +++ b/services/payments/src/simcore_service_payments/services/resource_usage_tracker.py @@ -25,6 +25,7 @@ BaseHTTPApi, HealthMixinMixin, ) +from servicelib.fastapi.tracing import setup_httpx_client_tracing from ..core.settings import ApplicationSettings @@ -73,5 +74,7 @@ def setup_resource_usage_tracker(app: FastAPI): api = ResourceUsageTrackerApi.from_client_kwargs( base_url=settings.PAYMENTS_RESOURCE_USAGE_TRACKER.base_url, ) + if settings.PAYMENTS_TRACING: + setup_httpx_client_tracing(api.client) api.set_to_app_state(app) api.attach_lifespan_to(app) diff --git a/services/payments/src/simcore_service_payments/services/stripe.py b/services/payments/src/simcore_service_payments/services/stripe.py index 38cc21fab0e..3f3fa933bb6 100644 --- a/services/payments/src/simcore_service_payments/services/stripe.py +++ b/services/payments/src/simcore_service_payments/services/stripe.py @@ -19,6 +19,7 @@ BaseHTTPApi, HealthMixinMixin, ) +from servicelib.fastapi.tracing import setup_httpx_client_tracing from ..core.errors import StripeRuntimeError from ..core.settings import ApplicationSettings @@ -91,6 +92,8 @@ def setup_stripe(app: FastAPI): base_url=settings.PAYMENTS_STRIPE_URL, auth=_StripeBearerAuth(settings.PAYMENTS_STRIPE_API_SECRET.get_secret_value()), ) + if settings.PAYMENTS_TRACING: + setup_httpx_client_tracing(api.client) api.set_to_app_state(app) api.attach_lifespan_to(app) diff --git a/services/resource-usage-tracker/requirements/_base.txt b/services/resource-usage-tracker/requirements/_base.txt index 97a3bd129b7..bbd3cddf53d 100644 --- a/services/resource-usage-tracker/requirements/_base.txt +++ b/services/resource-usage-tracker/requirements/_base.txt @@ -69,9 +69,7 @@ arrow==1.3.0 asgiref==3.8.1 # via opentelemetry-instrumentation-asgi async-timeout==4.0.3 - # via - # asyncpg - # redis + # via asyncpg asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 @@ -260,6 +258,7 @@ opentelemetry-api==1.26.0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-propagator-aws-xray @@ -283,6 +282,7 @@ opentelemetry-instrumentation==0.47b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests opentelemetry-instrumentation-asgi==0.47b0 @@ -293,6 +293,8 @@ opentelemetry-instrumentation-botocore==0.47b0 # via -r requirements/../../../packages/aws-library/requirements/_base.in opentelemetry-instrumentation-fastapi==0.47b0 # via -r requirements/../../../packages/service-library/requirements/_fastapi.in +opentelemetry-instrumentation-httpx==0.47b0 + # via -r requirements/../../../packages/service-library/requirements/_fastapi.in opentelemetry-instrumentation-redis==0.47b0 # via # -r requirements/../../../packages/aws-library/requirements/../../../packages/service-library/requirements/_base.in @@ -320,6 +322,7 @@ opentelemetry-semantic-conventions==0.47b0 # opentelemetry-instrumentation-asyncpg # opentelemetry-instrumentation-botocore # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk @@ -327,6 +330,7 @@ opentelemetry-util-http==0.47b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-httpx # opentelemetry-instrumentation-requests orjson==3.10.0 # via diff --git a/services/resource-usage-tracker/requirements/_test.txt b/services/resource-usage-tracker/requirements/_test.txt index e70753feb19..4db08363ded 100644 --- a/services/resource-usage-tracker/requirements/_test.txt +++ b/services/resource-usage-tracker/requirements/_test.txt @@ -10,10 +10,6 @@ anyio==4.3.0 # httpx asgi-lifespan==2.1.0 # via -r requirements/_test.in -async-timeout==4.0.3 - # via - # -c requirements/_base.txt - # redis attrs==23.2.0 # via # -c requirements/_base.txt diff --git a/services/static-webserver/client/source/class/osparc/dashboard/ResourceBrowserBase.js b/services/static-webserver/client/source/class/osparc/dashboard/ResourceBrowserBase.js index a80672bd3cd..344507aad9a 100644 --- a/services/static-webserver/client/source/class/osparc/dashboard/ResourceBrowserBase.js +++ b/services/static-webserver/client/source/class/osparc/dashboard/ResourceBrowserBase.js @@ -95,7 +95,7 @@ qx.Class.define("osparc.dashboard.ResourceBrowserBase", { return isLogged; }, - startStudyById: function(studyId, openCB, cancelCB, isStudyCreation = false) { + startStudyById: function(studyId, openCB, cancelCB, showStudyOptions = false) { if (!osparc.dashboard.ResourceBrowserBase.checkLoggedIn()) { return; } @@ -117,7 +117,7 @@ qx.Class.define("osparc.dashboard.ResourceBrowserBase", { osparc.data.Resources.fetch("studies", "getWallet", params) .then(wallet => { if ( - isStudyCreation || + showStudyOptions || wallet === null || osparc.desktop.credits.Utils.getWallet(wallet["walletId"]) === null ) { diff --git a/services/static-webserver/client/source/class/osparc/dashboard/StudyBrowser.js b/services/static-webserver/client/source/class/osparc/dashboard/StudyBrowser.js index ceaee03b3ac..a2de2032524 100644 --- a/services/static-webserver/client/source/class/osparc/dashboard/StudyBrowser.js +++ b/services/static-webserver/client/source/class/osparc/dashboard/StudyBrowser.js @@ -1229,7 +1229,7 @@ qx.Class.define("osparc.dashboard.StudyBrowser", { folderId: this.getCurrentFolderId(), }; osparc.study.Utils.createStudyFromTemplate(templateCopyData, this._loadingPage, contextProps) - .then(studyId => this.__startStudyAfterCreating(studyId)) + .then(studyData => this.__startStudyAfterCreating(studyData["uuid"])) .catch(err => { this._hideLoadingPage(); osparc.FlashMessenger.getInstance().logAs(err.message, "ERROR"); diff --git a/services/static-webserver/client/source/class/osparc/dashboard/TemplateBrowser.js b/services/static-webserver/client/source/class/osparc/dashboard/TemplateBrowser.js index d597d8a438c..7f4f0362cab 100644 --- a/services/static-webserver/client/source/class/osparc/dashboard/TemplateBrowser.js +++ b/services/static-webserver/client/source/class/osparc/dashboard/TemplateBrowser.js @@ -137,27 +137,85 @@ qx.Class.define("osparc.dashboard.TemplateBrowser", { return; } - this._showLoadingPage(this.tr("Creating ") + (templateData.name || osparc.product.Utils.getStudyAlias({firstUpperCase: true}))); - osparc.study.Utils.createStudyFromTemplate(templateData, this._loadingPage) - .then(studyId => { - const openCB = () => this._hideLoadingPage(); - const cancelCB = () => { - this._hideLoadingPage(); - const params = { - url: { - studyId - } + const studyAlias = osparc.product.Utils.getStudyAlias({firstUpperCase: true}); + this._showLoadingPage(this.tr("Creating ") + (templateData.name || studyAlias)); + + const studyOptions = new osparc.study.StudyOptions(); + // they will be patched once the study is created + studyOptions.setPatchStudy(false); + studyOptions.setStudyData(templateData); + const win = osparc.study.StudyOptions.popUpInWindow(studyOptions); + win.moveItUp(); + const cancelStudyOptions = () => { + this._hideLoadingPage(); + win.close(); + } + win.addListener("cancel", () => cancelStudyOptions()); + studyOptions.addListener("cancel", () => cancelStudyOptions()); + studyOptions.addListener("startStudy", () => { + const newName = studyOptions.getChildControl("title-field").getValue(); + const walletSelection = studyOptions.getChildControl("wallet-selector").getSelection(); + const nodesPricingUnits = studyOptions.getChildControl("study-pricing-units").getNodePricingUnits(); + win.close(); + this._showLoadingPage(this.tr("Creating ") + (newName || studyAlias)); + osparc.study.Utils.createStudyFromTemplate(templateData, this._loadingPage) + .then(newStudyData => { + const studyId = newStudyData["uuid"]; + const openCB = () => { + this._hideLoadingPage(); }; - osparc.data.Resources.fetch("studies", "delete", params); - }; - const isStudyCreation = true; - this._startStudyById(studyId, openCB, cancelCB, isStudyCreation); - }) - .catch(err => { - this._hideLoadingPage(); - osparc.FlashMessenger.getInstance().logAs(err.message, "ERROR"); - console.error(err); - }); + const cancelCB = () => { + this._hideLoadingPage(); + const params = { + url: { + studyId + } + }; + osparc.data.Resources.fetch("studies", "delete", params); + }; + + const promises = []; + // patch the name + if (newStudyData["name"] !== newName) { + promises.push(osparc.study.StudyOptions.updateName(newStudyData, newName)); + } + // patch the wallet + if (walletSelection.length && walletSelection[0]["walletId"]) { + const walletId = walletSelection[0]["walletId"]; + promises.push(osparc.study.StudyOptions.updateWallet(newStudyData["uuid"], walletId)); + } + // patch the pricing units + // the nodeIds are coming from the original template, they need to be mapped to the newStudy + const workbench = newStudyData["workbench"]; + const nodesIdsListed = []; + Object.keys(workbench).forEach(nodeId => { + const node = workbench[nodeId]; + if (osparc.study.StudyPricingUnits.includeInList(node)) { + nodesIdsListed.push(nodeId); + } + }); + nodesPricingUnits.forEach((nodePricingUnits, idx) => { + const selectedPricingUnitId = nodePricingUnits.getPricingUnits().getSelectedUnitId(); + if (selectedPricingUnitId) { + const nodeId = nodesIdsListed[idx]; + const pricingPlanId = nodePricingUnits.getPricingPlanId(); + promises.push(osparc.study.NodePricingUnits.patchPricingUnitSelection(studyId, nodeId, pricingPlanId, selectedPricingUnitId)); + } + }); + + Promise.all(promises) + .then(() => { + win.close(); + const showStudyOptions = false; + this._startStudyById(studyId, openCB, cancelCB, showStudyOptions); + }); + }) + .catch(err => { + this._hideLoadingPage(); + osparc.FlashMessenger.getInstance().logAs(err.message, "ERROR"); + console.error(err); + }); + }); }, // LAYOUT // diff --git a/services/static-webserver/client/source/class/osparc/node/TierSelectionView.js b/services/static-webserver/client/source/class/osparc/node/TierSelectionView.js index 34dfc397b37..ffa1431a00e 100644 --- a/services/static-webserver/client/source/class/osparc/node/TierSelectionView.js +++ b/services/static-webserver/client/source/class/osparc/node/TierSelectionView.js @@ -105,7 +105,7 @@ qx.Class.define("osparc.node.TierSelectionView", { if (selection.length) { tierBox.setEnabled(false); const selectedUnitId = selection[0].getModel(); - osparc.study.NodePricingUnits.pricingUnitSelected(studyId, nodeId, pricingPlans["pricingPlanId"], selectedUnitId) + osparc.study.NodePricingUnits.patchPricingUnitSelection(studyId, nodeId, pricingPlans["pricingPlanId"], selectedUnitId) .finally(() => { tierBox.setEnabled(true); showSelectedTier(selectedUnitId); diff --git a/services/static-webserver/client/source/class/osparc/study/NodePricingUnits.js b/services/static-webserver/client/source/class/osparc/study/NodePricingUnits.js index d8caa28b68f..76918e12b3e 100644 --- a/services/static-webserver/client/source/class/osparc/study/NodePricingUnits.js +++ b/services/static-webserver/client/source/class/osparc/study/NodePricingUnits.js @@ -30,8 +30,10 @@ qx.Class.define("osparc.study.NodePricingUnits", { layout: new qx.ui.layout.VBox() }); - this.__studyId = studyId; - this.__nodeId = nodeId; + this.set({ + studyId, + nodeId, + }); if (node instanceof osparc.data.model.Node) { this.__nodeKey = node.getKey(); this.__nodeVersion = node.getVersion(); @@ -43,8 +45,35 @@ qx.Class.define("osparc.study.NodePricingUnits", { } }, + properties: { + studyId: { + check: "String", + init: null, + nullable: false, + }, + + nodeId: { + check: "String", + init: null, + nullable: false, + }, + + pricingPlanId: { + check: "Number", + init: null, + nullable: false, + }, + + patchNode: { + check: "Boolean", + init: true, + nullable: false, + event: "changePatchNode", + }, + }, + statics: { - pricingUnitSelected: function(studyId, nodeId, planId, selectedUnitId) { + patchPricingUnitSelection: function(studyId, nodeId, planId, selectedUnitId) { const params = { url: { studyId, @@ -58,19 +87,18 @@ qx.Class.define("osparc.study.NodePricingUnits", { }, members: { - __studyId: null, - __nodeId: null, __nodeKey: null, __nodeVersion: null, __nodeLabel: null, + __pricingUnits: null, showPricingUnits: function(inGroupBox = true) { return new Promise(resolve => { const nodeKey = this.__nodeKey; const nodeVersion = this.__nodeVersion; const nodeLabel = this.__nodeLabel; - const studyId = this.__studyId; - const nodeId = this.__nodeId; + const studyId = this.getStudyId(); + const nodeId = this.getNodeId(); const plansParams = { url: osparc.data.Resources.getServiceUrl( @@ -79,30 +107,36 @@ qx.Class.define("osparc.study.NodePricingUnits", { ) }; osparc.data.Resources.fetch("services", "pricingPlans", plansParams) - .then(pricingPlans => { - if (pricingPlans) { + .then(pricingPlan => { + if (pricingPlan) { const unitParams = { url: { studyId, nodeId } }; + this.set({ + pricingPlanId: pricingPlan["pricingPlanId"] + }); osparc.data.Resources.fetch("studies", "getPricingUnit", unitParams) .then(preselectedPricingUnit => { - if (pricingPlans && "pricingUnits" in pricingPlans && pricingPlans["pricingUnits"].length) { - const unitButtons = new osparc.study.PricingUnits(pricingPlans["pricingUnits"], preselectedPricingUnit); + if (pricingPlan && "pricingUnits" in pricingPlan && pricingPlan["pricingUnits"].length) { + const pricingUnitButtons = this.__pricingUnits = new osparc.study.PricingUnits(pricingPlan["pricingUnits"], preselectedPricingUnit); if (inGroupBox) { const pricingUnitsLayout = osparc.study.StudyOptions.createGroupBox(nodeLabel); - pricingUnitsLayout.add(unitButtons); + pricingUnitsLayout.add(pricingUnitButtons); this._add(pricingUnitsLayout); } else { - this._add(unitButtons); + this._add(pricingUnitButtons); } - unitButtons.addListener("changeSelectedUnitId", e => { - unitButtons.setEnabled(false); - const selectedPricingUnitId = e.getData(); - this.self().pricingUnitSelected(this.__studyId, this.__nodeId, pricingPlans["pricingPlanId"], selectedPricingUnitId) - .finally(() => unitButtons.setEnabled(true)); + pricingUnitButtons.addListener("changeSelectedUnitId", e => { + if (this.isPatchNode()) { + pricingUnitButtons.setEnabled(false); + const pricingPlanId = this.getPricingPlanId(); + const selectedPricingUnitId = e.getData(); + this.self().patchPricingUnitSelection(studyId, nodeId, pricingPlanId, selectedPricingUnitId) + .finally(() => pricingUnitButtons.setEnabled(true)); + } }); } }) @@ -110,6 +144,10 @@ qx.Class.define("osparc.study.NodePricingUnits", { } }); }); - } + }, + + getPricingUnits: function() { + return this.__pricingUnits; + }, } }); diff --git a/services/static-webserver/client/source/class/osparc/study/StudyOptions.js b/services/static-webserver/client/source/class/osparc/study/StudyOptions.js index 9922ec017e3..5b0fd30cadb 100644 --- a/services/static-webserver/client/source/class/osparc/study/StudyOptions.js +++ b/services/static-webserver/client/source/class/osparc/study/StudyOptions.js @@ -22,8 +22,11 @@ qx.Class.define("osparc.study.StudyOptions", { this.base(arguments); this._setLayout(new qx.ui.layout.VBox(15)); + this.__buildLayout(); - this.setStudyId(studyId); + if (studyId) { + this.setStudyId(studyId); + } }, properties: { @@ -40,7 +43,14 @@ qx.Class.define("osparc.study.StudyOptions", { nullable: true, event: "changeWallet", apply: "__applyWallet" - } + }, + + patchStudy: { + check: "Boolean", + init: true, + nullable: false, + event: "changePatchStudy", + }, }, events: { @@ -78,7 +88,31 @@ qx.Class.define("osparc.study.StudyOptions", { }); box.setLayout(new qx.ui.layout.VBox(5)); return box; - } + }, + + updateName: function(studyData, name) { + return osparc.info.StudyUtils.patchStudyData(studyData, "name", name) + .catch(err => { + console.error(err); + const msg = err.message || qx.locale.Manager.tr("Something went wrong Renaming"); + osparc.FlashMessenger.logAs(msg, "ERROR"); + }); + }, + + updateWallet: function(studyId, walletId) { + const params = { + url: { + studyId, + walletId, + } + }; + return osparc.data.Resources.fetch("studies", "selectWallet", params) + .catch(err => { + console.error(err); + const msg = err.message || qx.locale.Manager.tr("Error selecting Credit Account"); + osparc.FlashMessenger.getInstance().logAs(msg, "ERROR"); + }); + }, }, members: { @@ -147,6 +181,27 @@ qx.Class.define("osparc.study.StudyOptions", { control = this.self().createGroupBox(this.tr("Tiers")); this.getChildControl("options-layout").add(control); break; + case "study-pricing-units": { + control = new osparc.study.StudyPricingUnits(); + const loadingImage = this.getChildControl("loading-units-spinner"); + const unitsBoxesLayout = this.getChildControl("services-resources-layout"); + const unitsLoading = () => { + loadingImage.show(); + unitsBoxesLayout.exclude(); + }; + const unitsReady = () => { + loadingImage.exclude(); + unitsBoxesLayout.show(); + control.getNodePricingUnits().forEach(nodePricingUnits => { + this.bind("patchStudy", nodePricingUnits, "patchNode"); + }); + }; + unitsLoading(); + control.addListener("loadingUnits", () => unitsLoading()); + control.addListener("unitsReady", () => unitsReady()); + unitsBoxesLayout.add(control); + break; + } case "buttons-layout": control = new qx.ui.container.Composite(new qx.ui.layout.HBox(5).set({ alignX: "right" @@ -192,7 +247,7 @@ qx.Class.define("osparc.study.StudyOptions", { ]) .then(values => { const studyData = values[0]; - this.__studyData = osparc.data.model.Study.deepCloneStudyObject(studyData); + this.setStudyData(studyData); if (values[1] && "walletId" in values[1]) { this.__studyWalletId = values[1]["walletId"]; @@ -201,6 +256,16 @@ qx.Class.define("osparc.study.StudyOptions", { }); }, + setStudyData: function(studyData) { + this.__studyData = osparc.data.model.Study.deepCloneStudyObject(studyData); + + const titleField = this.getChildControl("title-field"); + titleField.setValue(this.__studyData["name"]); + + const studyPricingUnits = this.getChildControl("study-pricing-units"); + studyPricingUnits.setStudyData(this.__studyData); + }, + __applyWallet: function(wallet) { if (wallet) { const walletSelector = this.getChildControl("wallet-selector"); @@ -224,9 +289,6 @@ qx.Class.define("osparc.study.StudyOptions", { const store = osparc.store.Store.getInstance(); const titleField = this.getChildControl("title-field"); - if (this.__studyData) { - titleField.setValue(this.__studyData["name"]); - } titleField.addListener("appear", () => { titleField.focus(); titleField.activate(); @@ -261,21 +323,7 @@ qx.Class.define("osparc.study.StudyOptions", { }, __buildOptionsLayout: function() { - const loadingImage = this.getChildControl("loading-units-spinner"); - const unitsBoxesLayout = this.getChildControl("services-resources-layout"); - const unitsLoading = () => { - loadingImage.show(); - unitsBoxesLayout.exclude(); - }; - const unitsReady = () => { - loadingImage.exclude(); - unitsBoxesLayout.show(); - }; - unitsLoading(); - const studyPricingUnits = new osparc.study.StudyPricingUnits(this.__studyData); - studyPricingUnits.addListener("loadingUnits", () => unitsLoading()); - studyPricingUnits.addListener("unitsReady", () => unitsReady()); - unitsBoxesLayout.add(studyPricingUnits); + this.getChildControl("study-pricing-units"); }, __buildButtons: function() { @@ -291,48 +339,34 @@ qx.Class.define("osparc.study.StudyOptions", { const openButton = this.getChildControl("open-button"); openButton.setFetching(true); - // first, update the name if necessary - const titleSelection = this.getChildControl("title-field").getValue(); - if (this.__studyData && this.__studyData["name"] !== titleSelection) { - await this.__updateName(this.__studyData, titleSelection); - } + if (this.isPatchStudy()) { + // first, update the name if necessary + const titleSelection = this.getChildControl("title-field").getValue(); + if (this.__studyData["name"] !== titleSelection) { + await this.self().updateName(this.__studyData, titleSelection); + } - // second, update the wallet if necessary - const store = osparc.store.Store.getInstance(); - const walletSelection = this.getChildControl("wallet-selector").getSelection(); - const studyId = this.getStudyId(); - if (studyId && walletSelection.length && walletSelection[0]["walletId"]) { - const params = { - url: { - studyId, - "walletId": walletSelection[0]["walletId"] - } - }; - osparc.data.Resources.fetch("studies", "selectWallet", params) - .then(() => { - store.setActiveWallet(this.getWallet()); - this.fireEvent("startStudy"); - }) - .catch(err => { - console.error(err); - const msg = err.message || this.tr("Error selecting Credit Account"); - osparc.FlashMessenger.getInstance().logAs(msg, "ERROR"); - }) - .finally(() => openButton.setFetching(false)); + // second, update the wallet if necessary + const store = osparc.store.Store.getInstance(); + const walletSelection = this.getChildControl("wallet-selector").getSelection(); + if (walletSelection.length && walletSelection[0]["walletId"]) { + const studyId = this.getStudyId(); + const walletId = walletSelection[0]["walletId"]; + this.self().updateWallet(studyId, walletId) + .then(() => { + store.setActiveWallet(this.getWallet()); + this.fireEvent("startStudy"); + }) + .finally(() => openButton.setFetching(false)); + } else { + store.setActiveWallet(this.getWallet()); + this.fireEvent("startStudy"); + openButton.setFetching(false); + } } else { - store.setActiveWallet(this.getWallet()); this.fireEvent("startStudy"); openButton.setFetching(false); } }, - - __updateName: function(studyData, name) { - return osparc.info.StudyUtils.patchStudyData(studyData, "name", name) - .catch(err => { - console.error(err); - const msg = this.tr("Something went wrong Renaming"); - osparc.FlashMessenger.logAs(msg, "ERROR"); - }); - } } }); diff --git a/services/static-webserver/client/source/class/osparc/study/StudyPricingUnits.js b/services/static-webserver/client/source/class/osparc/study/StudyPricingUnits.js index 793fee5cb34..e3e8514fbaf 100644 --- a/services/static-webserver/client/source/class/osparc/study/StudyPricingUnits.js +++ b/services/static-webserver/client/source/class/osparc/study/StudyPricingUnits.js @@ -25,9 +25,11 @@ qx.Class.define("osparc.study.StudyPricingUnits", { layout: new qx.ui.layout.VBox(5) }); - this.__studyData = studyData; + this.__nodePricingUnits = []; - this.__showPricingUnits(); + if (studyData) { + this.setStudyData(studyData); + } }, events: { @@ -35,8 +37,20 @@ qx.Class.define("osparc.study.StudyPricingUnits", { "unitsReady": "qx.event.type.Event" }, + statics: { + includeInList: function(node) { + return !osparc.data.model.Node.isFrontend(node); + }, + }, + members: { __studyData: null, + __nodePricingUnits: null, + + setStudyData: function(studyData) { + this.__studyData = studyData; + this.__showPricingUnits(); + }, __showPricingUnits: function() { const unitsLoading = () => this.fireEvent("loadingUnits"); @@ -48,16 +62,20 @@ qx.Class.define("osparc.study.StudyPricingUnits", { const workbench = this.__studyData["workbench"]; Object.keys(workbench).forEach(nodeId => { const node = workbench[nodeId]; - if (osparc.data.model.Node.isFrontend(node)) { - return; + if (this.self().includeInList(node)) { + const nodePricingUnits = new osparc.study.NodePricingUnits(this.__studyData["uuid"], nodeId, node); + this.__nodePricingUnits.push(nodePricingUnits); + this._add(nodePricingUnits); + promises.push(nodePricingUnits.showPricingUnits()); } - const nodePricingUnits = new osparc.study.NodePricingUnits(this.__studyData["uuid"], nodeId, node); - this._add(nodePricingUnits); - promises.push(nodePricingUnits.showPricingUnits()); }); } Promise.all(promises) .then(() => unitsAdded()); - } + }, + + getNodePricingUnits: function() { + return this.__nodePricingUnits; + }, } }); diff --git a/services/static-webserver/client/source/class/osparc/study/Utils.js b/services/static-webserver/client/source/class/osparc/study/Utils.js index 0240d263e47..66ed40201f4 100644 --- a/services/static-webserver/client/source/class/osparc/study/Utils.js +++ b/services/static-webserver/client/source/class/osparc/study/Utils.js @@ -255,7 +255,7 @@ qx.Class.define("osparc.study.Utils", { }, this); task.addListener("resultReceived", e => { const studyData = e.getData(); - resolve(studyData["uuid"]); + resolve(studyData); }, this); task.addListener("pollingError", e => { const err = e.getData(); diff --git a/services/storage/requirements/_base.txt b/services/storage/requirements/_base.txt index edadd851b65..c73f10b2ef0 100644 --- a/services/storage/requirements/_base.txt +++ b/services/storage/requirements/_base.txt @@ -78,7 +78,6 @@ async-timeout==4.0.3 # via # aiopg # asyncpg - # redis asyncpg==0.29.0 # via sqlalchemy attrs==23.2.0 diff --git a/services/storage/requirements/_test.txt b/services/storage/requirements/_test.txt index 1e33824a7c0..f0132fe4c7c 100644 --- a/services/storage/requirements/_test.txt +++ b/services/storage/requirements/_test.txt @@ -13,10 +13,6 @@ aiosignal==1.3.1 # aiohttp antlr4-python3-runtime==4.13.2 # via moto -async-timeout==4.0.3 - # via - # -c requirements/_base.txt - # redis attrs==23.2.0 # via # -c requirements/_base.txt diff --git a/services/web/server/requirements/_base.txt b/services/web/server/requirements/_base.txt index 5b42c95fffd..01c8859912d 100644 --- a/services/web/server/requirements/_base.txt +++ b/services/web/server/requirements/_base.txt @@ -89,7 +89,6 @@ async-timeout==4.0.3 # via # aiohttp # aiopg - # redis asyncpg==0.27.0 # via # -r requirements/_base.in diff --git a/services/web/server/requirements/_test.txt b/services/web/server/requirements/_test.txt index 67fcd247fda..3aab7cde47d 100644 --- a/services/web/server/requirements/_test.txt +++ b/services/web/server/requirements/_test.txt @@ -18,7 +18,6 @@ async-timeout==4.0.3 # via # -c requirements/_base.txt # aiohttp - # redis asyncpg==0.27.0 # via # -c requirements/_base.txt diff --git a/tests/swarm-deploy/requirements/_test.txt b/tests/swarm-deploy/requirements/_test.txt index 2f4dc983011..dad3c42339d 100644 --- a/tests/swarm-deploy/requirements/_test.txt +++ b/tests/swarm-deploy/requirements/_test.txt @@ -197,6 +197,10 @@ opentelemetry-api==1.27.0 # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-exporter-otlp-proto-http # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiopg + # opentelemetry-instrumentation-asyncpg + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk # opentelemetry-semantic-conventions @@ -213,7 +217,22 @@ opentelemetry-exporter-otlp-proto-grpc==1.27.0 opentelemetry-exporter-otlp-proto-http==1.27.0 # via opentelemetry-exporter-otlp opentelemetry-instrumentation==0.48b0 - # via opentelemetry-instrumentation-requests + # via + # opentelemetry-instrumentation-aiopg + # opentelemetry-instrumentation-asyncpg + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-redis + # opentelemetry-instrumentation-requests +opentelemetry-instrumentation-aiopg==0.48b0 + # via -r requirements/../../../packages/simcore-sdk/requirements/_base.in +opentelemetry-instrumentation-asyncpg==0.48b0 + # via -r requirements/../../../packages/simcore-sdk/requirements/../../../packages/postgres-database/requirements/_base.in +opentelemetry-instrumentation-dbapi==0.48b0 + # via opentelemetry-instrumentation-aiopg +opentelemetry-instrumentation-redis==0.48b0 + # via + # -r requirements/../../../packages/service-library/requirements/_base.in + # -r requirements/../../../packages/simcore-sdk/requirements/../../../packages/service-library/requirements/_base.in opentelemetry-instrumentation-requests==0.48b0 # via # -r requirements/../../../packages/service-library/requirements/_base.in @@ -231,6 +250,9 @@ opentelemetry-sdk==1.27.0 # opentelemetry-exporter-otlp-proto-http opentelemetry-semantic-conventions==0.48b0 # via + # opentelemetry-instrumentation-asyncpg + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-redis # opentelemetry-instrumentation-requests # opentelemetry-sdk opentelemetry-util-http==0.48b0 @@ -494,6 +516,9 @@ wrapt==1.16.0 # via # deprecated # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiopg + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-redis yarl==1.12.1 # via # -r requirements/../../../packages/simcore-sdk/requirements/../../../packages/postgres-database/requirements/_base.in