Remove recursion in task spec #14402
2 errors, 223 fail, 179 skipped, 3,519 pass in 1h 55m 26s
Annotations
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '88efd7ed4eeba2e77f68f8a2f7cad3f4'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37071', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
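Note: the traceback above boils down to a single scheduler-side pattern: the shuffle_get handler calls _get(), which looks the shuffle up in active_shuffles; because the id is no longer registered, the KeyError is re-raised as a P2PConsistencyError. The following is a minimal, self-contained sketch of that lookup-and-wrap pattern, assuming nothing beyond the plugin source shown above; the names mirror that source, but this is not the actual distributed implementation.

# Sketch only: reproduces the wrapping of KeyError into P2PConsistencyError seen in
# distributed/shuffle/_scheduler_plugin.py above, using a stand-in plugin class.
class P2PConsistencyError(RuntimeError):
    pass

class SchedulerPluginSketch:
    def __init__(self) -> None:
        # An empty mapping models a shuffle that has already been removed/archived.
        self.active_shuffles: dict[str, object] = {}

    def _get(self, id: str) -> object:
        # Raises KeyError when the shuffle id is unknown to the scheduler.
        return self.active_shuffles[id]

    def get(self, id: str) -> object:
        try:
            return self._get(id)
        except KeyError as e:
            # The KeyError becomes the __cause__, which is why pytest reports
            # "The above exception was the direct cause of the following exception".
            raise P2PConsistencyError(f"No active shuffle with {id=!r} found") from e

plugin = SchedulerPluginSketch()
try:
    plugin.get("88efd7ed4eeba2e77f68f8a2f7cad3f4")
except P2PConsistencyError as err:
    print(err)  # -> No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found

In the failing tests this error is then shipped back to the worker plugin and re-raised there, which is why the client-side failure surfaces from p2p_barrier.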
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='fa9a14514d4b429907ecb4ca222694d3' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'fa9a14514d4b429907ecb4ca222694d3'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:34425', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='fa9a14514d4b429907ecb4ca222694d3' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
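The worker-side frames in the short traceback (barrier -> _barrier -> get_most_recent -> get_with_run_id -> _refresh -> _fetch) end in "raise exc.with_traceback(tb)", i.e. the scheduler's P2PConsistencyError is re-raised on the worker with its original traceback attached. A generic sketch of that re-raise pattern, in plain Python and not using any distributed API:

import traceback

def remote_lookup():
    # Stand-in for the scheduler-side lookup that fails.
    raise KeyError("fa9a14514d4b429907ecb4ca222694d3")

def capture():
    try:
        remote_lookup()
    except KeyError as e:
        # Pretend this pair was serialized and shipped back to the caller.
        return e, e.__traceback__

def reraise_locally():
    exc, tb = capture()
    # Keeps the original frames visible in the final traceback.
    raise exc.with_traceback(tb)

try:
    reraise_locally()
except KeyError:
    traceback.print_exc()  # includes the remote_lookup() frame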
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '7ffbc51837584688d9dd80c2a12c24ae'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37301', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_basic_merge[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='55c8f67094461ff624233f674037e5c8' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '55c8f67094461ff624233f674037e5c8'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45827', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_basic_merge(c, s, a, b, how):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
joined = a.merge(b, left_on="y", right_on="y", how=how)
if dd._dask_expr_enabled():
# Ensure we're using a hash join
from dask_expr._merge import HashJoinP2P
assert any(
isinstance(expr, HashJoinP2P) for expr in joined.optimize()._expr.walk()
)
expected = pd.merge(A, B, how, "y")
> await list_eq(joined, expected)
distributed/shuffle/tests/test_merge.py:91:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='55c8f67094461ff624233f674037e5c8' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
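Every failure on this page bottoms out in the same two frames: `_get` raises KeyError because the shuffle id is no longer in `active_shuffles`, and `get` re-raises it as a P2PConsistencyError. A minimal sketch of that wrapping pattern (standalone toy code, not the distributed module):

class P2PConsistencyError(Exception):
    pass

active_shuffles: dict[str, dict] = {}  # stand-in for the plugin's ShuffleId -> state mapping

def get_run_spec(shuffle_id: str) -> dict:
    try:
        return active_shuffles[shuffle_id]
    except KeyError as e:
        # "raise ... from e" keeps the KeyError as __cause__, which is why the report
        # prints "The above exception was the direct cause of the following exception".
        raise P2PConsistencyError(f"No active shuffle with id={shuffle_id!r} found") from e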
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_p2p_shuffle_reused_dataframe_with_different_parameters (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='86b13a64d50dcd6bc32f511f59254dc9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '86b13a64d50dcd6bc32f511f59254dc9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:40355', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:41533', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:39121', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_different_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = (
ddf1.merge(ddf2, left_on="a", right_on="x")
# Vary the number of output partitions for the shuffles of dd2
.repartition(npartitions=20).merge(ddf2, left_on="b", right_on="x")
)
# Generate unique shuffle IDs if the input frame is the same but
# parameters differ. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed/shuffle/tests/test_merge.py:126:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='86b13a64d50dcd6bc32f511f59254dc9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
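The intermediate traceback frames above walk the worker-side lookup chain (`barrier` -> `get_most_recent` -> `get_with_run_id` -> `_refresh` -> `_fetch`): the worker keeps a local cache of shuffle runs and falls back to asking the scheduler when the requested run is newer than anything cached, which is exactly where the missing-shuffle error surfaces. A simplified, illustrative sketch of that chain follows; the class, method bodies, and `fetch_from_scheduler` callable are assumptions for illustration, not distributed's internals.

from __future__ import annotations
from typing import Callable

class ShuffleRunCache:
    def __init__(self, fetch_from_scheduler: Callable[[str], tuple[int, object]]):
        # fetch_from_scheduler(shuffle_id) returns (run_id, run) or raises if the
        # scheduler no longer knows the shuffle.
        self._fetch = fetch_from_scheduler
        self._runs: dict[str, tuple[int, object]] = {}

    def get_with_run_id(self, shuffle_id: str, run_id: int) -> object:
        cached = self._runs.get(shuffle_id)
        if cached is None or cached[0] < run_id:
            # Cache miss or stale entry: refresh from the scheduler.  A missing
            # shuffle on the scheduler side surfaces here as an error.
            cached = self._fetch(shuffle_id)
            self._runs[shuffle_id] = cached
        if cached[0] != run_id:
            raise RuntimeError(
                f"run_id mismatch for {shuffle_id!r}: have {cached[0]}, want {run_id}"
            )
        return cached[1]

    def get_most_recent(self, shuffle_id: str, run_ids: list[int]) -> object:
        return self.get_with_run_id(shuffle_id, max(run_ids))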
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_p2p_shuffle_reused_dataframe_with_same_parameters (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='75290a8c4f23c2d04c2280727d1b7cb9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '75290a8c4f23c2d04c2280727d1b7cb9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45157', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36195', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44547', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
@gen_cluster(client=True)
async def test_merge_p2p_shuffle_reused_dataframe_with_same_parameters(c, s, a, b):
pdf1 = pd.DataFrame({"a": range(100), "b": range(0, 200, 2)})
pdf2 = pd.DataFrame({"x": range(200), "y": [1, 2, 3, 4] * 50})
ddf1 = dd.from_pandas(pdf1, npartitions=5)
ddf2 = dd.from_pandas(pdf2, npartitions=10)
# This performs two shuffles:
# * ddf1 is shuffled on `a`
# * ddf2 is shuffled on `x`
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
ddf3 = ddf1.merge(
ddf2,
left_on="a",
right_on="x",
)
# This performs one shuffle:
# * ddf3 is shuffled on `b`
# We can reuse the shuffle of dd2 on `x` from the previous merge.
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
out = ddf2.merge(
ddf3,
left_on="x",
right_on="b",
)
# Generate unique shuffle IDs if the input frame is the same and all its
# parameters match. Reusing shuffles in merges is dangerous because of the
# required coordination and complexity introduced through dynamic clusters.
assert sum(id_from_key(k) is not None for k in out.dask) == 4
> result = await c.compute(out)
distributed/shuffle/tests/test_merge.py:163:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='75290a8c4f23c2d04c2280727d1b7cb9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
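For comparison, the `restrict_task` handler near the top of this page distinguishes two failure modes by comparing run ids: a request carrying an older run id than the active run is stale, while a newer one means the scheduler's own state is behind. A tiny standalone sketch of that check (illustrative only, not the handler itself):

class P2PConsistencyError(Exception):
    pass

def check_run_id(active_run_id: int, request_run_id: int) -> None:
    # Request older than the active run: the caller is working from stale state.
    if active_run_id > request_run_id:
        raise P2PConsistencyError(f"Request stale, expected run_id={active_run_id}")
    # Request newer than the active run: the scheduler has not seen this run yet.
    if active_run_id < request_run_id:
        raise P2PConsistencyError(f"Request invalid, expected run_id={active_run_id}")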
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='129db57dd9cf6a130cec5c2223b03058' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '129db57dd9cf6a130cec5c2223b03058'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:42569', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='129db57dd9cf6a130cec5c2223b03058' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd02c927e1c2407360c1725e5504c5b5b'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37419', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'a17398d1df433992275b759dbf6b3345'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45443', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[True-right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d37e533e69214d1b7877feed132b7cff' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd37e533e69214d1b7877feed132b7cff'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:33123', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right', disk = True
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d37e533e69214d1b7877feed132b7cff' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
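Every failure in this report follows the same chain: the worker plugin asks the scheduler for the run spec of a shuffle whose id is not (or no longer) in active_shuffles, the scheduler-side lookup raises KeyError, and get() wraps that into the P2PConsistencyError that ultimately surfaces on the client. A minimal standalone sketch of that wrapping pattern, purely illustrative and not the distributed codebase (the class and dict below are stand-ins):

import traceback

class P2PConsistencyError(RuntimeError):
    """Stand-in for distributed.shuffle._exceptions.P2PConsistencyError."""

active_shuffles: dict[str, object] = {}  # empty: the shuffle id is not registered

def get(shuffle_id: str) -> object:
    try:
        # Raises KeyError because the id is missing from the mapping ...
        return active_shuffles[shuffle_id]
    except KeyError as e:
        # ... and is re-raised with context, matching the "direct cause" chain above.
        raise P2PConsistencyError(
            f"No active shuffle with id={shuffle_id!r} found"
        ) from e

try:
    get("88efd7ed4eeba2e77f68f8a2f7cad3f4")
except P2PConsistencyError:
    traceback.print_exc()  # prints the KeyError as the direct cause, then the wrapper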
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '88efd7ed4eeba2e77f68f8a2f7cad3f4'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:38277', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'inner', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='88efd7ed4eeba2e77f68f8a2f7cad3f4' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd02c927e1c2407360c1725e5504c5b5b'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:41549', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'outer', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d02c927e1c2407360c1725e5504c5b5b' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
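Because every parametrization of test_merge in this run fails with the same P2PConsistencyError, a local rerun of just the reported ids is usually the quickest way to confirm the behavior. A hedged sketch of such a rerun, assuming a development checkout of distributed with pytest, pandas, and dask installed (the node ids are taken verbatim from this report):

import pytest

# Re-run only the parametrizations that failed in this report; drop "-x" to see all of them.
pytest.main([
    "distributed/shuffle/tests/test_merge.py::test_merge[False-inner]",
    "distributed/shuffle/tests/test_merge.py::test_merge[False-outer]",
    "distributed/shuffle/tests/test_merge.py::test_merge[False-left]",
    "-x",  # stop at the first failure
])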
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'a17398d1df433992275b759dbf6b3345'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:35799', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'left', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='a17398d1df433992275b759dbf6b3345' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
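A minimal standalone sketch that follows the same merge path as this failing parametrization (how='left', disk=False). It is illustrative only: it assumes dask, distributed, and pandas are installed, and the cluster size is arbitrary.
# Hypothetical reproduction sketch; mirrors the failing test's data and config.
import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster
if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
        B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
        a = dd.repartition(A, [0, 4, 5])
        b = dd.repartition(B, [0, 2, 5])
        with dask.config.set(
            {
                "dataframe.shuffle.method": "p2p",
                "distributed.p2p.storage.disk": False,
            }
        ):
            # Same join as the failing assertion: hash join on the shared "y" column.
            joined = dd.merge(a, b, on="y", how="left")
            print(joined.compute())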
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge[False-right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '7ffbc51837584688d9dd80c2a12c24ae'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43261', workers: 0, cores: 0, tasks: 0>
a = Dask DataFrame Structure:
x y
npartitions=2
0 int64 int64
4 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
b = Dask DataFrame Structure:
y z
npartitions=2
0 int64 int64
2 ... ...
5 ... ...
Dask Name: repartition-dataframe, 1 graph layer
how = 'right', disk = False
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
@pytest.mark.parametrize("disk", [True, False])
@gen_cluster(client=True)
async def test_merge(c, s, a, b, how, disk):
A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": [1, 1, 2, 2, 3, 4]})
a = dd.repartition(A, [0, 4, 5])
B = pd.DataFrame({"y": [1, 3, 4, 4, 5, 6], "z": [6, 5, 4, 3, 2, 1]})
b = dd.repartition(B, [0, 2, 5])
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
with dask.config.set({"distributed.p2p.storage.disk": disk}):
joined = dd.merge(a, b, left_index=True, right_index=True, how=how)
res = await c.compute(joined)
assert_eq(
res,
pd.merge(A, B, left_index=True, right_index=True, how=how),
)
joined = dd.merge(a, b, on="y", how=how)
> await list_eq(joined, pd.merge(A, B, on="y", how=how))
distributed/shuffle/tests/test_merge.py:189:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='7ffbc51837584688d9dd80c2a12c24ae' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3f19fbff373827e995deef2e4efc6b71' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '3f19fbff373827e995deef2e4efc6b71'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:43283', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:44127', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:41115', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3f19fbff373827e995deef2e4efc6b71' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
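To iterate on these failures locally, a small sketch (assuming a development checkout of distributed with its test dependencies installed) that narrows pytest to the affected merge tests; the selection expression is illustrative only.
# Hypothetical helper to re-run only the failing P2P merge tests.
import sys
import pytest
if __name__ == "__main__":
    sys.exit(
        pytest.main(
            [
                "distributed/shuffle/tests/test_merge.py",
                "-k", "test_merge or test_merge_by_multiple_columns",
                "-x",   # stop at the first failure
                "-rA",  # show a summary for all tests
            ]
        )
    )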
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='ff48afe84a60c2995a3961f2580060e6' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def restrict_task(
self, id: ShuffleId, run_id: int, key: Key, worker: str
) -> OKMessage | ErrorMessage:
try:
shuffle = self.active_shuffles[id]
if shuffle.run_id > run_id:
raise P2PConsistencyError(
f"Request stale, expected {run_id=} for {shuffle}"
)
elif shuffle.run_id < run_id:
raise P2PConsistencyError(
f"Request invalid, expected {run_id=} for {shuffle}"
)
ts = self.scheduler.tasks[key]
self._set_restriction(ts, worker)
return {"status": "OK"}
except P2PConsistencyError as e:
return error_message(e)
def heartbeat(self, ws: WorkerState, data: dict) -> None:
for shuffle_id, d in data.items():
if shuffle_id in self.shuffle_ids():
self.heartbeats[shuffle_id][ws.address].update(d)
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
except P2PConsistencyError as e:
return error_message(e)
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'ff48afe84a60c2995a3961f2580060e6'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:42005', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:35681', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:38593', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# warnings here from pandas
pdf1l = pd.DataFrame(
{
"a": list("abcdefghij"),
"b": list("abcdefghij"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf1r = pd.DataFrame(
{
"d": list("abcdefghij"),
"e": list("abcdefghij"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("abcdefghij"),
)
pdf2l = pd.DataFrame(
{
"a": list("abcdeabcde"),
"b": list("abcabcabca"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf2r = pd.DataFrame(
{
"d": list("edcbaedcba"),
"e": list("aaabbbcccd"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("fghijklmno"),
)
pdf3l = pd.DataFrame(
{
"a": list("aaaaaaaaaa"),
"b": list("aaaaaaaaaa"),
"c": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
},
index=list("abcdefghij"),
)
pdf3r = pd.DataFrame(
{
"d": list("aaabbbccaa"),
"e": list("abbbbbbbbb"),
"f": [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
},
index=list("ABCDEFGHIJ"),
)
for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
ddl = dd.from_pandas(pdl, lpart)
ddr = dd.from_pandas(pdr, rpart)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
expected = pdl.join(pdr, how=how)
assert_eq(
await c.compute(ddl.join(ddr, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pdr.join(pdl, how=how)
assert_eq(
await c.compute(ddr.join(ddl, how=how)),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdl, pdr, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddl,
ddr,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
expected = pd.merge(
pdr, pdl, how=how, left_index=True, right_index=True
)
assert_eq(
await c.compute(
dd.merge(
ddr,
ddl,
how=how,
left_index=True,
right_index=True,
)
),
expected,
# FIXME: There's an discrepancy with an empty index for
# pandas=2.0 (xref https://github.com/dask/dask/issues/9957).
# Temporarily avoid index check until the discrepancy is fixed.
check_index=not expected.index.empty,
)
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='ff48afe84a60c2995a3961f2580060e6' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
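The failing step above is the hash join executed under the p2p shuffle method. For local triage, the following is a minimal sketch, not taken from the test suite: it assumes a throwaway LocalCluster and small illustrative frames, and simply drives the same dd.merge(..., left_on="a", right_on="d") path with dataframe.shuffle.method set to "p2p".

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster

if __name__ == "__main__":
    # Throwaway two-worker cluster; worker counts and frame contents are illustrative assumptions.
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster) as client:
        pdl = pd.DataFrame({"a": list("abcdeabcde"), "c": list(range(10))})
        pdr = pd.DataFrame({"d": list("edcbaedcba"), "f": list(range(10))})
        ddl = dd.from_pandas(pdl, npartitions=2)
        ddr = dd.from_pandas(pdr, npartitions=3)
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            # Joining on non-index columns forces the hash-join/shuffle path the test exercises.
            result = dd.merge(ddl, ddr, how="outer", left_on="a", right_on="d").compute()
        print(result.head())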
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='43555b120d9e1413e94141cd84fb0ebc' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '43555b120d9e1413e94141cd84fb0ebc'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:33803', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43739', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:36925', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='43555b120d9e1413e94141cd84fb0ebc' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
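These failures share the same scheduler-side symptom: the get handler cannot find the shuffle id in active_shuffles. As a hedged triage aid (assuming a live Client named client connected to the affected scheduler), the plugin registered under the name "shuffle" in the source above can be asked which shuffle ids it still considers active:

def _active_shuffle_ids(dask_scheduler):
    # The plugin instance is stored under "shuffle" via add_plugin(..., name="shuffle") above.
    plugin = dask_scheduler.plugins["shuffle"]
    return sorted(plugin.shuffle_ids())

print(client.run_on_scheduler(_active_shuffle_ids))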
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_by_multiple_columns[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5e5498128fe80c17101b42e21c7bab99' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '5e5498128fe80c17101b42e21c7bab99'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:35303', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:44723', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44417', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.slow
@gen_cluster(client=True, timeout=120)
@pytest.mark.parametrize("how", ["inner", "outer", "left", "right"])
async def test_merge_by_multiple_columns(c, s, a, b, how):
# hash join
> await list_eq(
dd.merge(ddl, ddr, how=how, left_on="a", right_on="d"),
pd.merge(pdl, pdr, how=how, left_on="a", right_on="d"),
)
distributed/shuffle/tests/test_merge.py:351:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/shuffle/tests/test_merge.py:35: in list_eq
a = await c.compute(a) if isinstance(a, dd.DataFrame) else a
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5e5498128fe80c17101b42e21c7bab99' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[inner] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d5690dfd886c8bf31a64d23407e4e6b0' found
def restrict_task(
self, id: ShuffleId, run_id: int, key: Key, worker: str
) -> OKMessage | ErrorMessage:
try:
shuffle = self.active_shuffles[id]
if shuffle.run_id > run_id:
raise P2PConsistencyError(
f"Request stale, expected {run_id=} for {shuffle}"
)
elif shuffle.run_id < run_id:
raise P2PConsistencyError(
f"Request invalid, expected {run_id=} for {shuffle}"
)
ts = self.scheduler.tasks[key]
self._set_restriction(ts, worker)
return {"status": "OK"}
except P2PConsistencyError as e:
return error_message(e)
def heartbeat(self, ws: WorkerState, data: dict) -> None:
for shuffle_id, d in data.items():
if shuffle_id in self.shuffle_ids():
self.heartbeats[shuffle_id][ws.address].update(d)
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: 'd5690dfd886c8bf31a64d23407e4e6b0'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:38425', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43171', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:34615', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'inner'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='d5690dfd886c8bf31a64d23407e4e6b0' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[left] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3eb8a5df59b96e9fb32c674d07990632' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '3eb8a5df59b96e9fb32c674d07990632'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:36613', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:45137', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:36283', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'left'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='3eb8a5df59b96e9fb32c674d07990632' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[right] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='24911519b4ce6c4b7de008abfae05db9' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '24911519b4ce6c4b7de008abfae05db9'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:45739', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36979', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:35315', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'right'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='24911519b4ce6c4b7de008abfae05db9' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_index_merge_p2p[outer] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '4a70d6e91519e0e1e8f20b9daf2fc606'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:34969', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:42535', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:42853', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
how = 'outer'
@pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
@gen_cluster(client=True)
async def test_index_merge_p2p(c, s, a, b, how):
pdf_left = pd.DataFrame({"a": [4, 2, 3] * 10, "b": 1}).set_index("a")
pdf_right = pd.DataFrame({"a": [4, 2, 3] * 10, "c": 1})
left = dd.from_pandas(pdf_left, npartitions=5, sort=False)
right = dd.from_pandas(pdf_right, npartitions=6)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
assert_eq(
> await c.compute(left.merge(right, how=how, left_index=True, right_on="a")),
pdf_left.merge(pdf_right, how=how, left_index=True, right_on="a"),
)
distributed/shuffle/tests/test_merge.py:388:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
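Every failure in this group ends with the same signature: the worker plugin's barrier asks the scheduler for the shuffle run spec, the scheduler plugin no longer has the id in active_shuffles, and the resulting KeyError is re-raised as a P2PConsistencyError that propagates back through the worker to the client. The snippet below is a minimal, self-contained sketch of that signature for illustration only; it simplifies the quoted plugin source and is not the actual distributed code path.

class P2PConsistencyError(RuntimeError):
    """Stand-in for distributed.shuffle._exceptions.P2PConsistencyError."""

# Scheduler-side registry, empty here as if the shuffle was never registered
# or has already been cleaned up.
active_shuffles: dict[str, object] = {}

def get_run_spec(shuffle_id: str) -> object:
    # Mirrors the quoted get()/_get() handlers: a missing id becomes a
    # P2PConsistencyError chained from the original KeyError.
    try:
        return active_shuffles[shuffle_id]
    except KeyError as e:
        raise P2PConsistencyError(
            f"No active shuffle with id={shuffle_id!r} found"
        ) from e

try:
    get_run_spec("4a70d6e91519e0e1e8f20b9daf2fc606")
except P2PConsistencyError as err:
    print(err)  # No active shuffle with id='4a70d6e91519e0e1e8f20b9daf2fc606' found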
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[4] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='2866af62945ea73255d6124f0c2ab890' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '2866af62945ea73255d6124f0c2ab890'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:39931', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:36323', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:35263', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 4
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='2866af62945ea73255d6124f0c2ab890' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
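To reproduce a single parametrized case outside CI, the failing tests can be selected by pytest node id. This is a sketch only; it assumes a local checkout and an environment comparable to the mindeps-pandas job named in the artifact paths above.

# From a shell:
#   python -m pytest "distributed/shuffle/tests/test_merge.py::test_merge_with_npartitions[4]" -x
# or, equivalently, from Python:
import pytest

pytest.main(["-x", "distributed/shuffle/tests/test_merge.py::test_merge_with_npartitions[4]"])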
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[5] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5b7becb6715abf4944a35eb5c8397f3f' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '5b7becb6715abf4944a35eb5c8397f3f'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:40897', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:40535', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:44035', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 5
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='5b7becb6715abf4944a35eb5c8397f3f' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
Check warning on line 0 in distributed.shuffle.tests.test_merge
github-actions / Unit Test Results
test_merge_with_npartitions[10] (distributed.shuffle.tests.test_merge) failed
artifacts/ubuntu-latest-mindeps-pandas-ci1/pytest.xml [took 0s]
Raw output
distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='0683b1a498812aa6d19c68e52e068983' found
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
> run_spec = self._get(id, worker)
distributed/shuffle/_scheduler_plugin.py:175:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def _get(self, id: ShuffleId, worker: str) -> ShuffleRunSpec:
if worker not in self.scheduler.workers:
# This should never happen
raise P2PConsistencyError(
f"Scheduler is unaware of this worker {worker!r}"
) # pragma: nocover
> state = self.active_shuffles[id]
E KeyError: '0683b1a498812aa6d19c68e52e068983'
distributed/shuffle/_scheduler_plugin.py:190: KeyError
The above exception was the direct cause of the following exception:
c = <Client: No scheduler connected>
s = <Scheduler 'tcp://127.0.0.1:37385', workers: 0, cores: 0, tasks: 0>
a = <Worker 'tcp://127.0.0.1:43351', name: 0, status: closed, stored: 0, running: 0/1, ready: 0, comm: 0, waiting: 0>
b = <Worker 'tcp://127.0.0.1:37621', name: 1, status: closed, stored: 0, running: 0/2, ready: 0, comm: 0, waiting: 0>
npartitions = 10
@pytest.mark.parametrize("npartitions", [4, 5, 10, 20])
@gen_cluster(client=True)
async def test_merge_with_npartitions(c, s, a, b, npartitions):
pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
left = dd.from_pandas(pdf, npartitions=10)
right = dd.from_pandas(pdf, npartitions=5)
expected = pdf.merge(pdf)
with dask.config.set({"dataframe.shuffle.method": "p2p"}):
> result = await c.compute(left.merge(right, npartitions=npartitions))
distributed/shuffle/tests/test_merge.py:408:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
distributed/client.py:410: in _result
raise exc.with_traceback(tb)
distributed/shuffle/_core.py:574: in p2p_barrier
return get_worker_plugin().barrier(id, run_ids)
distributed/shuffle/_worker_plugin.py:391: in barrier
result = sync(self.worker.loop, self._barrier, shuffle_id, run_ids)
distributed/utils.py:439: in sync
raise error
distributed/utils.py:413: in f
result = yield future
../../../miniconda3/envs/dask-distributed/lib/python3.10/site-packages/tornado/gen.py:769: in run
value = future.result()
distributed/shuffle/_worker_plugin.py:362: in _barrier
shuffle_run = await self.shuffle_runs.get_most_recent(shuffle_id, run_ids)
distributed/shuffle/_worker_plugin.py:177: in get_most_recent
return await self.get_with_run_id(shuffle_id=shuffle_id, run_id=max(run_ids))
distributed/shuffle/_worker_plugin.py:119: in get_with_run_id
shuffle_run = await self._refresh(shuffle_id=shuffle_id)
distributed/shuffle/_worker_plugin.py:222: in _refresh
result = await self._fetch(shuffle_id=shuffle_id, key=key)
distributed/shuffle/_worker_plugin.py:200: in _fetch
raise exc.with_traceback(tb)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def get(self, id: ShuffleId, worker: str) -> RunSpecMessage | ErrorMessage:
try:
try:
run_spec = self._get(id, worker)
return {"status": "OK", "run_spec": ToPickle(run_spec)}
except KeyError as e:
> raise P2PConsistencyError(
f"No active shuffle with {id=!r} found"
) from e
E distributed.shuffle._exceptions.P2PConsistencyError: No active shuffle with id='0683b1a498812aa6d19c68e52e068983' found
distributed/shuffle/_scheduler_plugin.py:178: P2PConsistencyError
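For context, the operation exercised by test_merge_with_npartitions above can be sketched outside the gen_cluster test harness roughly as follows. This is an illustrative reproduction only: the LocalCluster setup, worker counts, and the npartitions value are assumptions chosen to mirror the test source shown in the traceback, not part of the recorded output, and running it is not guaranteed to trigger the P2PConsistencyError.

import dask
import dask.dataframe as dd
import pandas as pd
from distributed import Client, LocalCluster

if __name__ == "__main__":
    # Two workers, mirroring the gen_cluster default used by the test (assumption).
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster, Client(cluster):
        pdf = pd.DataFrame({"a": [1, 2, 3, 4] * 10, "b": 1})
        left = dd.from_pandas(pdf, npartitions=10)
        right = dd.from_pandas(pdf, npartitions=5)
        # Force the P2P shuffle backend, as the test does.
        with dask.config.set({"dataframe.shuffle.method": "p2p"}):
            result = left.merge(right, npartitions=10).compute()
        print(len(result))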