adap · danieljanes · Feb 28, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
@@ -14,8 +14,8 @@
 # ==============================================================================
 """Flower server app."""
 
-
 import argparse
+import asyncio
 import importlib.util
 import sys
 import threading
@@ -362,13 +362,15 @@ def run_superlink() -> None:
         )
         grpc_servers.append(fleet_server)
     elif args.fleet_api_type == TRANSPORT_TYPE_VCE:
+        f_stop = asyncio.Event()  # Does nothing
         _run_fleet_api_vce(
             num_supernodes=args.num_supernodes,
             client_app_module_name=args.client_app,
             backend_name=args.backend,
             backend_config_json_stream=args.backend_config,
             working_dir=args.dir,
             state_factory=state_factory,
+            f_stop=f_stop,
         )
     else:
         raise ValueError(f"Unknown fleet_api_type: {args.fleet_api_type}")
@@ -468,6 +470,7 @@ def _run_fleet_api_vce(
     backend_config_json_stream: str,
     working_dir: str,
     state_factory: StateFactory,
+    f_stop: asyncio.Event,
 ) -> None:
     log(INFO, "Flower VCE: Starting Fleet API (VirtualClientEngine)")
 
@@ -478,6 +481,7 @@ def _run_fleet_api_vce(
         backend_config_json_stream=backend_config_json_stream,
         state_factory=state_factory,
         working_dir=working_dir,
+        f_stop=f_stop,
     )
 
 

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Fleet VirtualClientEngine side."""
+"""Fleet Simulation Engine side."""
 
 from .vce_api import start_vce
 

@@ -141,7 +141,7 @@ async def process_message(
 
         Return output message and updated context.
         """
-        node_id = message.metadata.dst_node_id
+        node_id = message.metadata.partition_id
 
         try:
             # Submit a task to the pool
@@ -163,10 +163,9 @@ async def process_message(
         except LoadClientAppError as load_ex:
             log(
                 ERROR,
-                "An exception was raised when processing a message. Terminating %s",
+                "An exception was raised when processing a message by %s",
                 self.__class__.__name__,
             )
-            await self.terminate()
             raise load_ex
 
     async def terminate(self) -> None:

@@ -20,6 +20,8 @@
 from typing import Callable, Dict, Optional, Tuple, Union
 from unittest import IsolatedAsyncioTestCase
 
+import ray
+
 from flwr.client import Client, NumPyClient
 from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app
 from flwr.common import (
@@ -119,6 +121,11 @@ def _create_message_and_context() -> Tuple[Message, Context, float]:
 class AsyncTestRayBackend(IsolatedAsyncioTestCase):
     """A basic class that allows runnig multliple asyncio tests."""
 
+    async def on_cleanup(self) -> None:
+        """Shut Ray down if it is still initialized; otherwise do nothing."""
+        if not ray.is_initialized():
+            return
+        ray.shutdown()
+
     def test_backend_creation_and_termination(self) -> None:
         """Test creation of RayBackend and its termination."""
         backend = RayBackend(backend_config={}, work_dir="")
@@ -171,6 +178,7 @@ def test_backend_creation_submit_and_termination_non_existing_client_app(
             self.test_backend_creation_submit_and_termination(
                 client_app_loader=_load_from_module("a_non_existing_module:app")
             )
+        self.addAsyncCleanup(self.on_cleanup)
 
     def test_backend_creation_submit_and_termination_existing_client_app(
         self,
@@ -198,3 +206,4 @@ def test_backend_creation_submit_and_termination_existing_client_app_unsetworkdi
                 client_app_loader=_load_from_module("raybackend_test:client_app"),
                 workdir="/?&%$^#%@$!",
             )
+        self.addAsyncCleanup(self.on_cleanup)
@@ -12,19 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Fleet VirtualClientEngine API."""
+"""Fleet Simulation Engine API."""
+
 
 import asyncio
 import json
-from logging import ERROR, INFO
-from typing import Dict, Optional
+import traceback
+from logging import DEBUG, ERROR, INFO, WARN
+from typing import Callable, Dict, List, Optional
 
-from flwr.client.client_app import ClientApp, load_client_app
+from flwr.client.client_app import ClientApp, LoadClientAppError, load_client_app
 from flwr.client.node_state import NodeState
 from flwr.common.logger import log
+from flwr.common.serde import message_from_taskins, message_to_taskres
+from flwr.proto.task_pb2 import TaskIns  # pylint: disable=E0611
 from flwr.server.superlink.state import StateFactory
 
-from .backend import error_messages_backends, supported_backends
+from .backend import Backend, error_messages_backends, supported_backends
 
 NodeToPartitionMapping = Dict[int, int]
 
@@ -42,21 +46,217 @@ def _register_nodes(
     return nodes_mapping
 
 
-# pylint: disable=too-many-arguments,unused-argument
+# pylint: disable=too-many-arguments,too-many-locals
+async def worker(
+    app: Callable[[], ClientApp],
+    queue: "asyncio.Queue[TaskIns]",
+    node_states: Dict[int, NodeState],
+    state_factory: StateFactory,
+    nodes_mapping: NodeToPartitionMapping,
+    backend: Backend,
+) -> None:
+    """Consume TaskIns from the queue and have the backend process each one.
+
+    For every TaskIns pulled from `queue`, the context of the consumer node is
+    registered and retrieved, the TaskIns is converted to a Message, the
+    message's `partition_id` is set from `nodes_mapping`, and the message is
+    handed to `backend.process_message`. The returned context is written back
+    to the node's state and the reply is stored as a TaskRes.
+
+    The coroutine loops until it is cancelled or an exception occurs:
+    cancellation and generic exceptions are logged and end the worker
+    quietly, whereas a `LoadClientAppError` is logged and re-raised.
+    """
+    task_store = state_factory.state()
+    while True:
+        try:
+            task_ins: TaskIns = await queue.get()
+            run_id = task_ins.run_id
+            node_id = task_ins.task.consumer.node_id
+            node_state = node_states[node_id]
+
+            # Make sure a context exists for this run, then fetch it
+            node_state.register_context(run_id=run_id)
+            context = node_state.retrieve_context(run_id=run_id)
+
+            # TaskIns -> Message, tagged with the node's data partition ID
+            message = message_from_taskins(task_ins)
+            message.metadata.partition_id = nodes_mapping[node_id]
+
+            # Hand the message over to the backend
+            reply, new_context = await backend.process_message(app, message, context)
+
+            # Persist the context returned by the backend
+            node_state.update_context(run_id, context=new_context)
+
+            # Message -> TaskRes, then store it in the state
+            task_store.store_task_res(message_to_taskres(reply))
+
+        except asyncio.CancelledError as cancel_ex:
+            # Cancellation ends the worker without propagating
+            log(DEBUG, "Async worker: %s", cancel_ex)
+            return
+
+        except LoadClientAppError as app_ex:
+            # The ClientApp could not be loaded: log and let the task fail
+            log(ERROR, "Async worker: %s", app_ex)
+            log(ERROR, traceback.format_exc())
+            raise
+
+        except Exception as ex:  # pylint: disable=broad-exception-caught
+            # Any other failure is logged and ends this worker only
+            log(ERROR, ex)
+            log(ERROR, traceback.format_exc())
+            return
+
+
+async def add_taskins_to_queue(
+    queue: "asyncio.Queue[TaskIns]",
+    state_factory: StateFactory,
+    nodes_mapping: NodeToPartitionMapping,
+    backend: Backend,
+    consumers: List["asyncio.Task[None]"],
+    f_stop: asyncio.Event,
+) -> None:
+    """Pull TaskIns from the state and enqueue them until `f_stop` is set.
+
+    On every pass, at most one TaskIns is requested per node ID in
+    `nodes_mapping` and put on `queue`. Afterwards the number of consumer
+    tasks still running is checked: a warning is logged when fewer than half
+    of the initial consumers remain, and a `RuntimeError` is raised when none
+    remain. The loop then logs a few stats and sleeps for one second.
+
+    Raises
+    ------
+    RuntimeError
+        If every task in `consumers` has finished.
+    """
+    # NOTE(review): `queue.put` is awaited before worker liveness is checked;
+    # if the queue is bounded and full while no worker is draining it, this
+    # coroutine would presumably wait there indefinitely -- confirm.
+    task_store = state_factory.state()
+    total_consumers = len(consumers)
+    while True:
+        if f_stop.is_set():
+            break
+
+        for node_id in nodes_mapping:
+            pulled = state_pull = task_store.get_task_ins(node_id=node_id, limit=1)
+            del state_pull
+            if pulled:
+                await queue.put(pulled[0])
+
+        # How many consumers are still running?
+        alive = sum(1 for consumer in consumers if not consumer.done())
+
+        # Warn once fewer than half of the initial consumers remain
+        if alive < total_consumers // 2:
+            log(
+                WARN,
+                "Number of active workers has more than halved: (%i/%i active)",
+                alive,
+                total_consumers,
+            )
+
+        # Nothing left to consume the queue: abort
+        if alive == 0:
+            raise RuntimeError("All workers have died. Ending Simulation.")
+
+        # Periodic stats
+        log(
+            DEBUG,
+            "Simulation Engine stats: "
+            "Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)",
+            alive,
+            total_consumers,
+            backend.__class__.__name__,
+            backend.num_workers,
+            queue.qsize(),
+        )
+        await asyncio.sleep(1.0)
+    log(DEBUG, "Async producer: Stopped pulling from StateFactory.")
+
+
+async def run(
+    app: Callable[[], ClientApp],
+    backend: Backend,
+    nodes_mapping: NodeToPartitionMapping,
+    state_factory: StateFactory,
+    node_states: Dict[int, NodeState],
+    f_stop: asyncio.Event,
+) -> None:
+    """Run the VCE async.
+
+    Builds the backend, starts `backend.num_workers` worker tasks plus a
+    single producer task, and waits for the producer to finish. On the way
+    out (success or failure) all worker tasks are cancelled and awaited, and
+    the backend is terminated.
+
+    Raises
+    ------
+    RuntimeError
+        If building the backend or running the producer raises an
+        `Exception`. `f_stop` is set before the error is re-raised.
+    """
+    queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128)
+
+    # Bound before the `try` so the `finally` clause can always iterate it,
+    # even when `backend.build()` fails before any worker is created.
+    worker_tasks: List["asyncio.Task[None]"] = []
+
+    try:
+        # Build backend
+        await backend.build()
+
+        # Add workers (they submit Messages to Backend)
+        worker_tasks = [
+            asyncio.create_task(
+                worker(app, queue, node_states, state_factory, nodes_mapping, backend)
+            )
+            for _ in range(backend.num_workers)
+        ]
+        # Create producer (adds TaskIns into Queue)
+        producer = asyncio.create_task(
+            add_taskins_to_queue(
+                queue, state_factory, nodes_mapping, backend, worker_tasks, f_stop
+            )
+        )
+
+        # Wait for producer to finish
+        # The producer runs forever until f_stop is set or until
+        # all worker (consumer) coroutines are completed. Workers
+        # also run forever and only end if an exception is raised.
+        await asyncio.gather(producer)
+
+    except Exception as ex:  # pylint: disable=broad-exception-caught
+
+        log(ERROR, "An exception occured!! %s", ex)
+        log(ERROR, traceback.format_exc())
+        log(WARN, "Stopping Simulation Engine.")
+
+        # Manually trigger stopping event
+        f_stop.set()
+
+        # Raise exception
+        raise RuntimeError("Simulation Engine crashed.") from ex
+
+    finally:
+        # Producer task terminated, now cancel worker tasks
+        for w_t in worker_tasks:
+            w_t.cancel()
+
+        if worker_tasks:
+            log(DEBUG, "Terminating async workers...")
+            # Wait for every worker to finish. `return_exceptions=True` also
+            # retrieves the exception of any worker that crashed earlier, so
+            # it cannot escape from this cleanup step.
+            await asyncio.gather(*worker_tasks, return_exceptions=True)
+
+        # Terminate backend
+        await backend.terminate()
+
+
+# pylint: disable=too-many-arguments,unused-argument,too-many-locals
 def start_vce(
-    num_supernodes: int,
     client_app_module_name: str,
     backend_name: str,
     backend_config_json_stream: str,
-    state_factory: StateFactory,
     working_dir: str,
-    f_stop: Optional[asyncio.Event] = None,
+    f_stop: asyncio.Event,
+    num_supernodes: Optional[int] = None,
+    state_factory: Optional[StateFactory] = None,
+    existing_nodes_mapping: Optional[NodeToPartitionMapping] = None,
 ) -> None:
-    """Start Fleet API with the VirtualClientEngine (VCE)."""
-    # Register SuperNodes
-    nodes_mapping = _register_nodes(
-        num_nodes=num_supernodes, state_factory=state_factory
-    )
+    """Start Fleet API with the Simulation Engine."""
+    if num_supernodes is not None and existing_nodes_mapping is not None:
+        raise ValueError(
+            "Both `num_supernodes` and `existing_nodes_mapping` are provided, "
+            "but only one is allowed."
+        )
+    if num_supernodes is None:
+        if state_factory is None or existing_nodes_mapping is None:
+            raise ValueError(
+                "If not passing an existing `state_factory` and associated "
+                "`existing_nodes_mapping` you must supply `num_supernodes` to indicate "
+                "how many nodes to insert into a new StateFactory that will be created."
+            )
+    if existing_nodes_mapping:
+        if state_factory is None:
+            raise ValueError(
+                "You passed `existing_nodes_mapping` but no `state_factory` was passed."
+            )
+        log(INFO, "Using exiting NodeToPartitionMapping and StateFactory.")
+        # Use mapping constructed externally. This also means nodes
+        # have previously been registered.
+        nodes_mapping = existing_nodes_mapping
+
+    if not state_factory:
+        log(INFO, "A StateFactory was not supplied to the SimulationEngine.")
+        # Create an empty in-memory state factory
+        state_factory = StateFactory(":flwr-in-memory-state:")
+        log(INFO, "Created new %s.", state_factory.__class__.__name__)
+
+    if num_supernodes:
+        # Register SuperNodes
+        nodes_mapping = _register_nodes(
+            num_nodes=num_supernodes, state_factory=state_factory
+        )
 
     # Construct mapping of NodeStates
     node_states: Dict[int, NodeState] = {}
@@ -69,7 +269,7 @@ def start_vce(
 
     try:
         backend_type = supported_backends[backend_name]
-        _ = backend_type(backend_config, work_dir=working_dir)
+        backend = backend_type(backend_config, work_dir=working_dir)
     except KeyError as ex:
         log(
             ERROR,
@@ -89,4 +289,15 @@ def _load() -> ClientApp:
         app: ClientApp = load_client_app(client_app_module_name)
         return app
 
-    # start backend
+    app = _load
+
+    asyncio.run(
+        run(
+            app,
+            backend,
+            nodes_mapping,
+            state_factory,
+            node_states,
+            f_stop,
+        )
+    )