From ae35cc87aa0b697cb941a2798e8f22126a3c529d Mon Sep 17 00:00:00 2001 From: Pierre Marcenac Date: Mon, 2 Dec 2024 10:05:07 +0000 Subject: [PATCH] Raise a ValueError early for num_shards==0 in the Beam pipeline. --- .../mlcroissant/mlcroissant/_src/operation_graph/execute.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/execute.py b/python/mlcroissant/mlcroissant/_src/operation_graph/execute.py index 1c7a484a..d25a22d4 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/execute.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/execute.py @@ -195,6 +195,10 @@ def execute_operations_in_beam( enumerate(files) ) num_shards = len(files) + if not num_shards: + raise ValueError( + f"Empty {record_set=}. No file found for filters={json.dumps(filters)}" + ) # We don't know in advance the number of records per shards. So we just allocate the # maximum number which is `sys.maxsize // num_shards`. Taking the practical case of