Remove recursion in task spec #14402
2 errors, 223 fail, 179 skipped, 3,519 pass in 1h 55m 26s
Annotations
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '88efd7ed4eeba2e77f68f8a2f7cad3f4'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37071', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
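Note: the traceback above boils down to a single scheduler-side pattern: the shuffle_get handler calls _get(), which looks the shuffle up in active_shuffles; because the id is no longer registered, the KeyError is re-raised as a P2PConsistencyError. The following is a minimal, self-contained sketch of that lookup-and-wrap pattern, assuming nothing beyond the plugin source shown above; the names mirror that source, but this is not the actual distributed implementation.

# Sketch only: reproduces the wrapping of KeyError into P2PConsistencyError seen in
# distributed/shuffle/_scheduler_plugin.py above, using a stand-in plugin class.
class P2PConsistencyError(RuntimeError):
    pass

class SchedulerPluginSketch:
    def __init__(self) -> None:
        # An empty mapping models a shuffle that has already been removed/archived.
        self.active_shuffles: dict[str, object] = {}

    def _get(self, id: str) -> object:
        # Raises KeyError when the shuffle id is unknown to the scheduler.
        return self.active_shuffles[id]

    def get(self, id: str) -> object:
        try:
            return self._get(id)
        except KeyError as e:
            # The KeyError becomes the __cause__, which is why pytest reports
            # "The above exception was the direct cause of the following exception".
            raise P2PConsistencyError(f"No active shuffle with {id=!r} found") from e

plugin = SchedulerPluginSketch()
try:
    plugin.get("88efd7ed4eeba2e77f68f8a2f7cad3f4")
except P2PConsistencyError as err:
    print(err)  # -> No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found

In the failing tests this error is then shipped back to the worker plugin and re-raised there, which is why the client-side failure surfaces from p2p_barrier.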
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='fa9a14514d4b429907ecb4ca222694d3' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'fa9a14514d4b429907ecb4ca222694d3'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:34425', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='fa9a14514d4b429907ecb4ca222694d3' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
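The worker-side frames in the short traceback (barrier -> _barrier -> get_most_recent -> get_with_run_id -> _refresh -> _fetch) end in "raise exc.with_traceback(tb)", i.e. the scheduler's P2PConsistencyError is re-raised on the worker with its original traceback attached. A generic sketch of that re-raise pattern, in plain Python and not using any distributed API:

import traceback

def remote_lookup():
    # Stand-in for the scheduler-side lookup that fails.
    raise KeyError("fa9a14514d4b429907ecb4ca222694d3")

def capture():
    try:
        remote_lookup()
    except KeyError as e:
        # Pretend this pair was serialized and shipped back to the caller.
        return e, e.__traceback__

def reraise_locally():
    exc, tb = capture()
    # Keeps the original frames visible in the final traceback.
    raise exc.with_traceback(tb)

try:
    reraise_locally()
except KeyError:
    traceback.print_exc()  # includes the remote_lookup() frame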
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '7ffbc51837584688d9dd80c2a12c24ae'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37301', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='55c8f67094461ff624233f674037e5c8' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '55c8f67094461ff624233f674037e5c8'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45827', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='55c8f67094461ff624233f674037e5c8' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
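Every failure on this page bottoms out in the same two frames: `_get` raises KeyError because the shuffle id is no longer in `active_shuffles`, and `get` re-raises it as a P2PConsistencyError. A minimal sketch of that wrapping pattern (standalone toy code, not the distributed module):

class P2PConsistencyError(Exception):
    pass

active_shuffles: dict[str, dict] = {}  # stand-in for the plugin's ShuffleId -> state mapping

def get_run_spec(shuffle_id: str) -> dict:
    try:
        return active_shuffles[shuffle_id]
    except KeyError as e:
        # "raise ... from e" keeps the KeyError as __cause__, which is why the report
        # prints "The above exception was the direct cause of the following exception".
        raise P2PConsistencyError(f"No active shuffle with id={shuffle_id!r} found") from e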
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_p2p_shuffle_reused_dataframe_with_different_parameters (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='86b13a64d50dcd6bc32f511f59254dc9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '86b13a64d50dcd6bc32f511f59254dc9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:40355', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:41533', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:39121', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_different_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = (
ddf1.merge(ddf2, left_on="a", right_on="x")
# Vary the number of output partitions for the shuffles of dd2
.repartition(npartitions=20).merge(ddf2, left_on="b", right_on="x")
)
# Generate unique shuffle IDs if the input frame is the same but
# parameters differ. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed/shuffle/tests/test_merge.py:126:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='86b13a64d50dcd6bc32f511f59254dc9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
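The intermediate traceback frames above walk the worker-side lookup chain (`barrier` -> `get_most_recent` -> `get_with_run_id` -> `_refresh` -> `_fetch`): the worker keeps a local cache of shuffle runs and falls back to asking the scheduler when the requested run is newer than anything cached, which is exactly where the missing-shuffle error surfaces. A simplified, illustrative sketch of that chain follows; the class, method bodies, and `fetch_from_scheduler` callable are assumptions for illustration, not distributed's internals.

from __future__ import annotations
from typing import Callable

class ShuffleRunCache:
    def __init__(self, fetch_from_scheduler: Callable[[str], tuple[int, object]]):
        # fetch_from_scheduler(shuffle_id) returns (run_id, run) or raises if the
        # scheduler no longer knows the shuffle.
        self._fetch = fetch_from_scheduler
        self._runs: dict[str, tuple[int, object]] = {}

    def get_with_run_id(self, shuffle_id: str, run_id: int) -> object:
        cached = self._runs.get(shuffle_id)
        if cached is None or cached[0] < run_id:
            # Cache miss or stale entry: refresh from the scheduler.  A missing
            # shuffle on the scheduler side surfaces here as an error.
            cached = self._fetch(shuffle_id)
            self._runs[shuffle_id] = cached
        if cached[0] != run_id:
            raise RuntimeError(
                f"run_id mismatch for {shuffle_id!r}: have {cached[0]}, want {run_id}"
            )
        return cached[1]

    def get_most_recent(self, shuffle_id: str, run_ids: list[int]) -> object:
        return self.get_with_run_id(shuffle_id, max(run_ids))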
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_p2p_shuffle_reused_dataframe_with_same_parameters (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='75290a8c4f23c2d04c2280727d1b7cb9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '75290a8c4f23c2d04c2280727d1b7cb9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45157', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36195', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44547', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_same_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
# This performs two shuffles:
# * ddf1 is shuffled on `a`
# * ddf2 is shuffled on `x`
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
ddf3 = ddf1.merge(
ddf2,
left_on="a",
right_on="x",
)
# This performs one shuffle:
# * ddf3 is shuffled on `b`
# We can reuse the shuffle of dd2 on `x` from the previous merge.
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = ddf2.merge(
ddf3,
left_on="x",
right_on="b",
)
# Generate unique shuffle IDs if the input frame is the same and all its
# parameters match. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed/shuffle/tests/test_merge.py:163:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='75290a8c4f23c2d04c2280727d1b7cb9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
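For comparison, the `restrict_task` handler near the top of this page distinguishes two failure modes by comparing run ids: a request carrying an older run id than the active run is stale, while a newer one means the scheduler's own state is behind. A tiny standalone sketch of that check (illustrative only, not the handler itself):

class P2PConsistencyError(Exception):
    pass

def check_run_id(active_run_id: int, request_run_id: int) -> None:
    # Request older than the active run: the caller is working from stale state.
    if active_run_id > request_run_id:
        raise P2PConsistencyError(f"Request stale, expected run_id={active_run_id}")
    # Request newer than the active run: the scheduler has not seen this run yet.
    if active_run_id < request_run_id:
        raise P2PConsistencyError(f"Request invalid, expected run_id={active_run_id}")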
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='129db57dd9cf6a130cec5c2223b03058' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '129db57dd9cf6a130cec5c2223b03058'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:42569', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='129db57dd9cf6a130cec5c2223b03058' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd02c927e1c2407360c1725e5504c5b5b'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37419', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'a17398d1df433992275b759dbf6b3345'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45443', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d37e533e69214d1b7877feed132b7cff' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd37e533e69214d1b7877feed132b7cff'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:33123', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d37e533e69214d1b7877feed132b7cff' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
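Every failure in this report follows the same chain: the worker plugin asks the scheduler for the run spec of a shuffle whose id is not (or no longer) in active_shuffles, the scheduler-side lookup raises KeyError, and get() wraps that into the P2PConsistencyError that ultimately surfaces on the client. A minimal standalone sketch of that wrapping pattern, purely illustrative and not the distributed codebase (the class and dict below are stand-ins):

import traceback

class P2PConsistencyError(RuntimeError):
    """Stand-in for distributed.shuffle._exceptions.P2PConsistencyError."""

active_shuffles: dict[str, object] = {}  # empty: the shuffle id is not registered

def get(shuffle_id: str) -> object:
    try:
        # Raises KeyError because the id is missing from the mapping ...
        return active_shuffles[shuffle_id]
    except KeyError as e:
        # ... and is re-raised with context, matching the "direct cause" chain above.
        raise P2PConsistencyError(
            f"No active shuffle with id={shuffle_id!r} found"
        ) from e

try:
    get("88efd7ed4eeba2e77f68f8a2f7cad3f4")
except P2PConsistencyError:
    traceback.print_exc()  # prints the KeyError as the direct cause, then the wrapper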
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '88efd7ed4eeba2e77f68f8a2f7cad3f4'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:38277', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd02c927e1c2407360c1725e5504c5b5b'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:41549', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
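Because every parametrization of test_merge in this run fails with the same P2PConsistencyError, a local rerun of just the reported ids is usually the quickest way to confirm the behavior. A hedged sketch of such a rerun, assuming a development checkout of distributed with pytest, pandas, and dask installed (the node ids are taken verbatim from this report):

import pytest

# Re-run only the parametrizations that failed in this report; drop "-x" to see all of them.
pytest.main([
    "distributed/shuffle/tests/test_merge.py::test_merge[False-inner]",
    "distributed/shuffle/tests/test_merge.py::test_merge[False-outer]",
    "distributed/shuffle/tests/test_merge.py::test_merge[False-left]",
    "-x",  # stop at the first failure
])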
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'a17398d1df433992275b759dbf6b3345'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:35799', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
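A minimal standalone sketch that follows the same merge path as this failing parametrization (how='left', disk=False). It is illustrative only: it assumes dask, distributed, and pandas are installed, and the cluster size is arbitrary.
# Hypothetical reproduction sketch; mirrors the failing test's data and config.
import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster
if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        a = dd.repartition(A, [0, 4, 5])
        b = dd.repartition(B, [0, 2, 5])
        with dask.config.set(
            {
                "dataframe.shuffle.method": "p2p",
                "distributed.p2p.storage.disk": False,
            }
        ):
            # Same join as the failing assertion: hash join on the shared "y" column.
            joined = dd.merge(a, b, on="y", how="left")
            print(joined.compute())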
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '7ffbc51837584688d9dd80c2a12c24ae'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43261', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3f19fbff373827e995deef2e4efc6b71' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '3f19fbff373827e995deef2e4efc6b71'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43283', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:44127', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:41115', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3f19fbff373827e995deef2e4efc6b71' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
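To iterate on these failures locally, a small sketch (assuming a development checkout of distributed with its test dependencies installed) that narrows pytest to the affected merge tests; the selection expression is illustrative only.
# Hypothetical helper to re-run only the failing P2P merge tests.
import sys
import pytest
if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "distributed/shuffle/tests/test_merge.py",
                "-k", "test_merge or test_merge_by_multiple_columns",
                "-x",   # stop at the first failure
                "-rA",  # show a summary for all tests
            ]
        )
    )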
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='ff48afe84a60c2995a3961f2580060e6' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def restrict_task(
self, id: ShuffleId, run_id: int, key: Key, worker: str
) -> OKMessage | ErrorMessage:
try:
shuffle = self.active_shuffles[id]
if shuffle.run_id > run_id:
raise P2PConsistencyError(
f"Request stale, expected {run_id=} for {shuffle}"
)
elif shuffle.run_id < run_id:
raise P2PConsistencyError(
f"Request invalid, expected {run_id=} for {shuffle}"
)
ts = self.scheduler.tasks[key]
self._set_restriction(ts, worker)
return {"status": "OK"}
except P2PConsistencyError as e:
return error_message(e)
def heartbeat(self, ws: WorkerState, data: dict) -> None:
for shuffle_id, d in data.items():
if shuffle_id in self.shuffle_ids():
self.heartbeats[shuffle_id][ws.address].update(d)
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
except P2PConsistencyError as e:
return error_message(e)
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'ff48afe84a60c2995a3961f2580060e6'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:42005', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:35681', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:38593', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='ff48afe84a60c2995a3961f2580060e6' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
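The failing step above is the hash join executed under the p2p shuffle method. For local triage, the following is a minimal sketch, not taken from the test suite: it assumes a throwaway LocalCluster and small illustrative frames, and simply drives the same dd.merge(..., left_on="a", right_on="d") path with dataframe.shuffle.method set to "p2p".

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster

if __name__ == "__main__":
    # Throwaway two-worker cluster; worker counts and frame contents are illustrative assumptions.
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        pdl = pd.DataFrame({"a": list("abcdeabcde"), "c": list(range(10))})
        pdr = pd.DataFrame({"d": list("edcbaedcba"), "f": list(range(10))})
        ddl = dd.from_pandas(pdl, npartitions=2)
        ddr = dd.from_pandas(pdr, npartitions=3)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            # Joining on non-index columns forces the hash-join/shuffle path the test exercises.
            result = dd.merge(ddl, ddr, how="outer", left_on="a", right_on="d").compute()
        print(result.head())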
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='43555b120d9e1413e94141cd84fb0ebc' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '43555b120d9e1413e94141cd84fb0ebc'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:33803', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43739', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:36925', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='43555b120d9e1413e94141cd84fb0ebc' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
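These failures share the same scheduler-side symptom: the get handler cannot find the shuffle id in active_shuffles. As a hedged triage aid (assuming a live Client named client connected to the affected scheduler), the plugin registered under the name "shuffle" in the source above can be asked which shuffle ids it still considers active:

def _active_shuffle_ids(dask_scheduler):
    # The plugin instance is stored under "shuffle" via add_plugin(..., name="shuffle") above.
    plugin = dask_scheduler.plugins["shuffle"]
    return sorted(plugin.shuffle_ids())

print(client.run_on_scheduler(_active_shuffle_ids))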
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5e5498128fe80c17101b42e21c7bab99' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '5e5498128fe80c17101b42e21c7bab99'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:35303', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:44723', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44417', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5e5498128fe80c17101b42e21c7bab99' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d5690dfd886c8bf31a64d23407e4e6b0' found
def restrict_task(
self, id: ShuffleId, run_id: int, key: Key, worker: str
) -> OKMessage | ErrorMessage:
try:
shuffle = self.active_shuffles[id]
if shuffle.run_id > run_id:
raise P2PConsistencyError(
f"Request stale, expected {run_id=} for {shuffle}"
)
elif shuffle.run_id < run_id:
raise P2PConsistencyError(
f"Request invalid, expected {run_id=} for {shuffle}"
)
ts = self.scheduler.tasks[key]
self._set_restriction(ts, worker)
return {"status": "OK"}
except P2PConsistencyError as e:
return error_message(e)
def heartbeat(self, ws: WorkerState, data: dict) -> None:
for shuffle_id, d in data.items():
if shuffle_id in self.shuffle_ids():
self.heartbeats[shuffle_id][ws.address].update(d)
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd5690dfd886c8bf31a64d23407e4e6b0'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:38425', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43171', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:34615', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d5690dfd886c8bf31a64d23407e4e6b0' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3eb8a5df59b96e9fb32c674d07990632' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '3eb8a5df59b96e9fb32c674d07990632'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:36613', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:45137', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:36283', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3eb8a5df59b96e9fb32c674d07990632' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='24911519b4ce6c4b7de008abfae05db9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '24911519b4ce6c4b7de008abfae05db9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45739', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36979', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:35315', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='24911519b4ce6c4b7de008abfae05db9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '4a70d6e91519e0e1e8f20b9daf2fc606'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:34969', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:42535', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:42853', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
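Every failure in this group ends with the same signature: the worker plugin's barrier asks the scheduler for the shuffle run spec, the scheduler plugin no longer has the id in active_shuffles, and the resulting KeyError is re-raised as a P2PConsistencyError that propagates back through the worker to the client. The snippet below is a minimal, self-contained sketch of that signature for illustration only; it simplifies the quoted plugin source and is not the actual distributed code path.

class P2PConsistencyError(RuntimeError):
    """Stand-in for distributed.shuffle._exceptions.P2PConsistencyError."""

# Scheduler-side registry, empty here as if the shuffle was never registered
# or has already been cleaned up.
active_shuffles: dict[str, object] = {}

def get_run_spec(shuffle_id: str) -> object:
    # Mirrors the quoted get()/_get() handlers: a missing id becomes a
    # P2PConsistencyError chained from the original KeyError.
    try:
        return active_shuffles[shuffle_id]
    except KeyError as e:
        raise P2PConsistencyError(
            f"No active shuffle with id={shuffle_id!r} found"
        ) from e

try:
    get_run_spec("4a70d6e91519e0e1e8f20b9daf2fc606")
except P2PConsistencyError as err:
    print(err)  # No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found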
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[4] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='2866af62945ea73255d6124f0c2ab890' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '2866af62945ea73255d6124f0c2ab890'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:39931', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36323', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:35263', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 4
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='2866af62945ea73255d6124f0c2ab890' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
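To reproduce a single parametrized case outside CI, the failing tests can be selected by pytest node id. This is a sketch only; it assumes a local checkout and an environment comparable to the mindeps-pandas job named in the artifact paths above.

# From a shell:
#   python -m pytest "distributed/shuffle/tests/test_merge.py::test_merge_with_npartitions[4]" -x
# or, equivalently, from Python:
import pytest

pytest.main(["-x", "distributed/shuffle/tests/test_merge.py::test_merge_with_npartitions[4]"])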
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[5] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5b7becb6715abf4944a35eb5c8397f3f' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '5b7becb6715abf4944a35eb5c8397f3f'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:40897', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:40535', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44035', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 5
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5b7becb6715abf4944a35eb5c8397f3f' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[10] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='0683b1a498812aa6d19c68e52e068983' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '0683b1a498812aa6d19c68e52e068983'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37385', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43351', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:37621', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 10
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='0683b1a498812aa6d19c68e52e068983' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
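For context, the operation exercised by test_merge_with_npartitions above can be sketched outside the gen_cluster test harness roughly as follows. This is an illustrative reproduction only: the LocalCluster setup, worker counts, and the npartitions value are assumptions chosen to mirror the test source shown in the traceback, not part of the recorded output, and running it is not guaranteed to trigger the P2PConsistencyError.

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster

if __name__ == "__main__":
    # Two workers, mirroring the gen_cluster default used by the test (assumption).
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster):
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
        # Force the P2P shuffle backend, as the test does.
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            result = left.merge(right, npartitions=10).compute()
        print(len(result))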