From 9369b81328f4ff3cc7c2136ad5a7f8930f1e7fb8 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Wed, 17 Apr 2024 14:27:45 +0200 Subject: [PATCH 01/16] Improve speed of NaturalIdPartitioner --- .../partitioner/natural_id_partitioner.py | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 8bad0668595b..38698c58da56 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -14,10 +14,12 @@ # ============================================================================== """Natural id partitioner class that works with Hugging Face Datasets.""" - from typing import Dict +import numpy as np + import datasets +from flwr_datasets.common.typing import NDArrayInt from flwr_datasets.partitioner.partitioner import Partitioner @@ -30,6 +32,8 @@ def __init__( ): super().__init__() self._partition_id_to_natural_id: Dict[int, str] = {} + self._natural_id_to_partition_id: Dict[str, int] = {} + self._partition_id_to_indices: Dict[int, NDArrayInt] = {} self._partition_by = partition_by def _create_int_partition_id_to_natural_id(self) -> None: @@ -42,6 +46,26 @@ def _create_int_partition_id_to_natural_id(self) -> None: zip(range(len(unique_natural_ids)), unique_natural_ids) ) + def _create_natural_id_to_int_partition_id(self) -> None: + """Create a mapping from unique client ids from dataset to int indices. + + Natural ids come from the column specified in `partition_by`. This object is + inverse of the `self._partition_id_to_natural_id`. This method assumes that + `self._partition_id_to_natural_id` already exist. + """ + self._natural_id_to_partition_id = { + value: key for key, value in self._partition_id_to_natural_id.items() + } + + def _create_partition_id_to_indices(self) -> None: + """Create an assignment of indices to the partition indices.""" + natural_ids = self.dataset[self._partition_by] + unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) + + for i, natural_id in enumerate(unique_natural_ids): + partition_id = self._natural_id_to_partition_id[natural_id] + self._partition_id_to_indices[partition_id] = np.where(inverse == i)[0] + def load_partition(self, partition_id: int) -> datasets.Dataset: """Load a single partition corresponding to a single `partition_id`. 
@@ -60,17 +84,19 @@ def load_partition(self, partition_id: int) -> datasets.Dataset: """ if len(self._partition_id_to_natural_id) == 0: self._create_int_partition_id_to_natural_id() + self._create_natural_id_to_int_partition_id() - return self.dataset.filter( - lambda row: row[self._partition_by] - == self._partition_id_to_natural_id[partition_id] - ) + if len(self._partition_id_to_indices) == 0: + self._create_partition_id_to_indices() + + return self.dataset.select(self._partition_id_to_indices[partition_id]) @property def num_partitions(self) -> int: """Total number of partitions.""" if len(self._partition_id_to_natural_id) == 0: self._create_int_partition_id_to_natural_id() + self._create_natural_id_to_int_partition_id() return len(self._partition_id_to_natural_id) @property From 363d932054c3437b6e6c75f50dbdfb36470d78f0 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Wed, 17 Apr 2024 14:37:24 +0200 Subject: [PATCH 02/16] Add new line --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 38698c58da56..c87dda0b342c 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -14,6 +14,7 @@ # ============================================================================== """Natural id partitioner class that works with Hugging Face Datasets.""" + from typing import Dict import numpy as np From c9ad68ebe9aa3d1fd389ac97742df91f3c469f08 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 19 Apr 2024 11:57:55 +0200 Subject: [PATCH 03/16] Fix the None case --- .../partitioner/natural_id_partitioner.py | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index c87dda0b342c..4fbad928f2d3 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -15,7 +15,7 @@ """Natural id partitioner class that works with Hugging Face Datasets.""" -from typing import Dict +from typing import Dict, Union import numpy as np @@ -61,9 +61,44 @@ def _create_natural_id_to_int_partition_id(self) -> None: def _create_partition_id_to_indices(self) -> None: """Create an assignment of indices to the partition indices.""" natural_ids = self.dataset[self._partition_by] - unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) + unique_natural_ids = self.dataset.unique(self._partition_by) + + none_present = False + if None in unique_natural_ids: + none_present = True + dtype = self.dataset.features[self._partition_by].dtype + none_replacement: Union[int, str] + if dtype == "string": + none_replacement = "None" + # Ensure the replacement is not in the dataset + while True: + if none_replacement not in unique_natural_ids: + break + none_replacement += "1" + elif "unit" in dtype: + none_replacement = max(natural_ids) + 1 + elif "int" in dtype: + none_replacement = -1 + if none_replacement in unique_natural_ids: + none_replacement = max(natural_ids) + 1 + else: + raise ValueError( + "The type of values in the `partition_by` column needs " + "to be int or string" + ) + + # Replace the None by the none_replacement (in order to be able to use the + # np.unique(value, return_inverse) that requires no 
None and same val types + is_none = np.vectorize(lambda x: x is None) + mask = is_none(natural_ids) + natural_ids[mask] = none_replacement + + unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) for i, natural_id in enumerate(unique_natural_ids): + if none_present and natural_id == none_replacement: + # Use the natural_id that is present in the dataset (not replacement) + natural_id = None partition_id = self._natural_id_to_partition_id[natural_id] self._partition_id_to_indices[partition_id] = np.where(inverse == i)[0] @@ -84,6 +119,7 @@ def load_partition(self, partition_id: int) -> datasets.Dataset: single dataset partition """ if len(self._partition_id_to_natural_id) == 0: + self._check_supported_type_of_value_in_partition_by() self._create_int_partition_id_to_natural_id() self._create_natural_id_to_int_partition_id() @@ -96,6 +132,7 @@ def load_partition(self, partition_id: int) -> datasets.Dataset: def num_partitions(self) -> int: """Total number of partitions.""" if len(self._partition_id_to_natural_id) == 0: + self._check_supported_type_of_value_in_partition_by() self._create_int_partition_id_to_natural_id() self._create_natural_id_to_int_partition_id() return len(self._partition_id_to_natural_id) @@ -114,3 +151,17 @@ def partition_id_to_natural_id(self, value: Dict[int, str]) -> None: raise AttributeError( "Setting the partition_id_to_natural_id dictionary is not allowed." ) + + def _check_supported_type_of_value_in_partition_by(self) -> None: + values = self.dataset[self._partition_by] + values_np = np.array(values) + dtype = values_np.dtype + if not ( + np.issubdtype(dtype, np.object_) + or np.issubdtype(dtype, np.integer) + or np.issubdtype(dtype, np.str_) + ): + raise ValueError( + f"The specified column in {self._partition_by} is of type {dtype} " + f"however only ints (with None) and strings (with None) are acceptable" + ) From 3506f31a8123c3f437043d9f0e91ef3e537ea2f2 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 19 Apr 2024 12:10:30 +0200 Subject: [PATCH 04/16] Fix the indent on the np.unique --- .../partitioner/natural_id_partitioner.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 4fbad928f2d3..1c6d859915de 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -60,7 +60,7 @@ def _create_natural_id_to_int_partition_id(self) -> None: def _create_partition_id_to_indices(self) -> None: """Create an assignment of indices to the partition indices.""" - natural_ids = self.dataset[self._partition_by] + natural_ids = np.array(self.dataset[self._partition_by]) unique_natural_ids = self.dataset.unique(self._partition_by) none_present = False @@ -93,7 +93,7 @@ def _create_partition_id_to_indices(self) -> None: mask = is_none(natural_ids) natural_ids[mask] = none_replacement - unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) + unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) for i, natural_id in enumerate(unique_natural_ids): if none_present and natural_id == none_replacement: @@ -165,3 +165,13 @@ def _check_supported_type_of_value_in_partition_by(self) -> None: f"The specified column in {self._partition_by} is of type {dtype} " f"however only ints (with None) and strings (with None) are acceptable" ) + +if __name__ == "__main__": + import datasets + + 
dataset = datasets.load_dataset("speech_commands", "v0.01") + from flwr_datasets.partitioner import NaturalIdPartitioner + + nip = NaturalIdPartitioner("speaker_id") + nip.dataset = dataset["train"] + ps = [nip.load_partition(i) for i in range(nip.num_partitions)] From 961979cb2d3ada6ac693ff808b51ff78d44643e0 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 19 Apr 2024 12:19:04 +0200 Subject: [PATCH 05/16] Remove main --- .../partitioner/natural_id_partitioner.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 1c6d859915de..63a4c8bc0f3f 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -163,15 +163,6 @@ def _check_supported_type_of_value_in_partition_by(self) -> None: ): raise ValueError( f"The specified column in {self._partition_by} is of type {dtype} " - f"however only ints (with None) and strings (with None) are acceptable" + f"however only ints (with None) and strings (with None) are acceptable." ) -if __name__ == "__main__": - import datasets - - dataset = datasets.load_dataset("speech_commands", "v0.01") - from flwr_datasets.partitioner import NaturalIdPartitioner - - nip = NaturalIdPartitioner("speaker_id") - nip.dataset = dataset["train"] - ps = [nip.load_partition(i) for i in range(nip.num_partitions)] From d6ef2bf6e0186838980ec64c3aace93511645336 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 19 Apr 2024 13:50:28 +0200 Subject: [PATCH 06/16] Fix formatting --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 63a4c8bc0f3f..d258ebc32790 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -165,4 +165,3 @@ def _check_supported_type_of_value_in_partition_by(self) -> None: f"The specified column in {self._partition_by} is of type {dtype} " f"however only ints (with None) and strings (with None) are acceptable." 
) - From 152c5e0898975e0efc2a342746cc35d402032240 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 19 Apr 2024 14:03:44 +0200 Subject: [PATCH 07/16] Update error type --- .../flwr_datasets/partitioner/natural_id_partitioner_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py index f447634ad9ed..0839f8ead972 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py @@ -95,7 +95,7 @@ def test_partitioner_with_non_existing_column_partition_by(self) -> None: dataset = _create_dataset(10, 2) partitioner = NaturalIdPartitioner(partition_by="not-existing") partitioner.dataset = dataset - with self.assertRaises(ValueError): + with self.assertRaises(KeyError): partitioner.load_partition(0) @parameterized.expand( # type: ignore From 86b7d7e213071ca8bb68976d242bd91bde473e0a Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 26 Apr 2024 09:44:17 +0200 Subject: [PATCH 08/16] Check dtype based on only the first object --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index d258ebc32790..f6a44cd8e622 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -153,7 +153,7 @@ def partition_id_to_natural_id(self, value: Dict[int, str]) -> None: ) def _check_supported_type_of_value_in_partition_by(self) -> None: - values = self.dataset[self._partition_by] + values = self.dataset[0][self._partition_by] values_np = np.array(values) dtype = values_np.dtype if not ( From c0314209efce4dbcd8651eb9cec1d309662d4c4c Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 26 Apr 2024 10:04:41 +0200 Subject: [PATCH 09/16] Change the method to counter the None str + object presents --- .../flwr_datasets/partitioner/natural_id_partitioner.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index f6a44cd8e622..86827c748be9 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -70,11 +70,15 @@ def _create_partition_id_to_indices(self) -> None: none_replacement: Union[int, str] if dtype == "string": none_replacement = "None" + new_term = "" + counter = 0 # Ensure the replacement is not in the dataset while True: - if none_replacement not in unique_natural_ids: + if none_replacement + new_term not in unique_natural_ids: + none_replacement = none_replacement + new_term break - none_replacement += "1" + counter += 1 + new_term = f"{counter}" elif "unit" in dtype: none_replacement = max(natural_ids) + 1 elif "int" in dtype: From 9a61ba79d2af1ca35dfd6be681ebf4bcf3235e0f Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Mon, 29 Apr 2024 13:49:09 +0200 Subject: [PATCH 10/16] Add tqdm --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 5 ++++- datasets/pyproject.toml | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py 
b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 86827c748be9..5223c964a55e 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -18,6 +18,7 @@ from typing import Dict, Union import numpy as np +from tqdm import tqdm import datasets from flwr_datasets.common.typing import NDArrayInt @@ -99,7 +100,9 @@ def _create_partition_id_to_indices(self) -> None: unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) - for i, natural_id in enumerate(unique_natural_ids): + for i, natural_id in tqdm( + enumerate(unique_natural_ids), desc="Generating partition_id_to_indices" + ): if none_present and natural_id == none_replacement: # Use the natural_id that is present in the dataset (not replacement) natural_id = None diff --git a/datasets/pyproject.toml b/datasets/pyproject.toml index 7dfa60138582..c16389e1529b 100644 --- a/datasets/pyproject.toml +++ b/datasets/pyproject.toml @@ -58,6 +58,7 @@ datasets = "^2.14.6" pillow = { version = ">=6.2.1", optional = true } soundfile = { version = ">=0.12.1", optional = true } librosa = { version = ">=0.10.0.post2", optional = true } +tqdm ="^4.66.1" [tool.poetry.dev-dependencies] isort = "==5.13.2" From 2288bc02447293774d8d6fc820fa57ed4b99265a Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 30 Apr 2024 10:07:50 +0200 Subject: [PATCH 11/16] Check different indices creation method --- .../partitioner/natural_id_partitioner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 5223c964a55e..9ff0c970e2af 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -60,6 +60,20 @@ def _create_natural_id_to_int_partition_id(self) -> None: } def _create_partition_id_to_indices(self) -> None: + natural_id_to_indices = {} + natural_ids = np.array(self.dataset[self._partition_by]) + + for index, natural_id in enumerate(natural_ids): + if natural_id not in natural_id_to_indices: + natural_id_to_indices[natural_id] = [] + natural_id_to_indices[natural_id].append(index) + + self._partition_id_to_indices = { + self._natural_id_to_partition_id[natural_id]: indices + for natural_id, indices in natural_id_to_indices.items() + } + + def _create_partition_id_to_indices_2(self) -> None: """Create an assignment of indices to the partition indices.""" natural_ids = np.array(self.dataset[self._partition_by]) unique_natural_ids = self.dataset.unique(self._partition_by) From 4fbe467e46fc648971cf1bf329acdb718ab96e23 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 30 Apr 2024 11:18:21 +0200 Subject: [PATCH 12/16] Use a more efficient method --- .../partitioner/natural_id_partitioner.py | 55 +------------------ 1 file changed, 2 insertions(+), 53 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 9ff0c970e2af..4c684f236a26 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -15,10 +15,9 @@ """Natural id partitioner class that works with Hugging Face Datasets.""" -from typing import Dict, Union +from typing import Dict import numpy as np -from tqdm import tqdm import datasets from flwr_datasets.common.typing import NDArrayInt @@ 
-60,7 +59,7 @@ def _create_natural_id_to_int_partition_id(self) -> None: } def _create_partition_id_to_indices(self) -> None: - natural_id_to_indices = {} + natural_id_to_indices = {} # type: ignore natural_ids = np.array(self.dataset[self._partition_by]) for index, natural_id in enumerate(natural_ids): @@ -73,56 +72,6 @@ def _create_partition_id_to_indices(self) -> None: for natural_id, indices in natural_id_to_indices.items() } - def _create_partition_id_to_indices_2(self) -> None: - """Create an assignment of indices to the partition indices.""" - natural_ids = np.array(self.dataset[self._partition_by]) - unique_natural_ids = self.dataset.unique(self._partition_by) - - none_present = False - if None in unique_natural_ids: - none_present = True - dtype = self.dataset.features[self._partition_by].dtype - none_replacement: Union[int, str] - if dtype == "string": - none_replacement = "None" - new_term = "" - counter = 0 - # Ensure the replacement is not in the dataset - while True: - if none_replacement + new_term not in unique_natural_ids: - none_replacement = none_replacement + new_term - break - counter += 1 - new_term = f"{counter}" - elif "unit" in dtype: - none_replacement = max(natural_ids) + 1 - elif "int" in dtype: - none_replacement = -1 - if none_replacement in unique_natural_ids: - none_replacement = max(natural_ids) + 1 - else: - raise ValueError( - "The type of values in the `partition_by` column needs " - "to be int or string" - ) - - # Replace the None by the none_replacement (in order to be able to use the - # np.unique(value, return_inverse) that requires no None and same val types - is_none = np.vectorize(lambda x: x is None) - mask = is_none(natural_ids) - natural_ids[mask] = none_replacement - - unique_natural_ids, inverse = np.unique(natural_ids, return_inverse=True) - - for i, natural_id in tqdm( - enumerate(unique_natural_ids), desc="Generating partition_id_to_indices" - ): - if none_present and natural_id == none_replacement: - # Use the natural_id that is present in the dataset (not replacement) - natural_id = None - partition_id = self._natural_id_to_partition_id[natural_id] - self._partition_id_to_indices[partition_id] = np.where(inverse == i)[0] - def load_partition(self, partition_id: int) -> datasets.Dataset: """Load a single partition corresponding to a single `partition_id`. From 61818107f7963b3fe669d58f6167dc969b30e6f0 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 30 Apr 2024 11:25:56 +0200 Subject: [PATCH 13/16] Add docs --- .../partitioner/natural_id_partitioner.py | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 4c684f236a26..7b80b6172fad 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -18,6 +18,7 @@ from typing import Dict import numpy as np +from tqdm import tqdm import datasets from flwr_datasets.common.typing import NDArrayInt @@ -25,7 +26,34 @@ class NaturalIdPartitioner(Partitioner): - """Partitioner for dataset that can be divided by a reference to id in dataset.""" + """Partitioner for dataset that can be divided by a reference to id in dataset. + + Parameters + ---------- + partition_by: str + The name of the column that contains the unique values of partitions. 
+ + + Examples + -------- + "flwrlabs/shakespeare" dataset + >>> from flwr_datasets import FederatedDataset + >>> from flwr_datasets.partitioner import NaturalIdPartitioner + >>> + >>> partitioner = NaturalIdPartitioner(partition_by="character_id") + >>> fds = FederatedDataset(dataset="flwrlabs/shakespeare", + >>> partitioners={"train": partitioner}) + >>> partition = fds.load_partition(0) + + "sentiment140" (aka Twitter) dataset + >>> from flwr_datasets import FederatedDataset + >>> from flwr_datasets.partitioner import NaturalIdPartitioner + >>> + >>> partitioner = NaturalIdPartitioner(partition_by="character_id") + >>> fds = FederatedDataset(dataset="sentiment140", + >>> partitioners={"train": partitioner}) + >>> partition = fds.load_partition(0) + """ def __init__( self, @@ -62,7 +90,9 @@ def _create_partition_id_to_indices(self) -> None: natural_id_to_indices = {} # type: ignore natural_ids = np.array(self.dataset[self._partition_by]) - for index, natural_id in enumerate(natural_ids): + for index, natural_id in tqdm( + enumerate(natural_ids), desc="Generating partition_id_to_indices" + ): if natural_id not in natural_id_to_indices: natural_id_to_indices[natural_id] = [] natural_id_to_indices[natural_id].append(index) @@ -89,7 +119,6 @@ def load_partition(self, partition_id: int) -> datasets.Dataset: single dataset partition """ if len(self._partition_id_to_natural_id) == 0: - self._check_supported_type_of_value_in_partition_by() self._create_int_partition_id_to_natural_id() self._create_natural_id_to_int_partition_id() @@ -102,7 +131,6 @@ def load_partition(self, partition_id: int) -> datasets.Dataset: def num_partitions(self) -> int: """Total number of partitions.""" if len(self._partition_id_to_natural_id) == 0: - self._check_supported_type_of_value_in_partition_by() self._create_int_partition_id_to_natural_id() self._create_natural_id_to_int_partition_id() return len(self._partition_id_to_natural_id) @@ -121,17 +149,3 @@ def partition_id_to_natural_id(self, value: Dict[int, str]) -> None: raise AttributeError( "Setting the partition_id_to_natural_id dictionary is not allowed." ) - - def _check_supported_type_of_value_in_partition_by(self) -> None: - values = self.dataset[0][self._partition_by] - values_np = np.array(values) - dtype = values_np.dtype - if not ( - np.issubdtype(dtype, np.object_) - or np.issubdtype(dtype, np.integer) - or np.issubdtype(dtype, np.str_) - ): - raise ValueError( - f"The specified column in {self._partition_by} is of type {dtype} " - f"however only ints (with None) and strings (with None) are acceptable." 
- ) From be1d63fff2a6bbc6424a1f16cb63ee4cda205068 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 30 Apr 2024 11:33:20 +0200 Subject: [PATCH 14/16] Fix tests --- .../flwr_datasets/partitioner/natural_id_partitioner_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py index 0839f8ead972..f447634ad9ed 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner_test.py @@ -95,7 +95,7 @@ def test_partitioner_with_non_existing_column_partition_by(self) -> None: dataset = _create_dataset(10, 2) partitioner = NaturalIdPartitioner(partition_by="not-existing") partitioner.dataset = dataset - with self.assertRaises(KeyError): + with self.assertRaises(ValueError): partitioner.load_partition(0) @parameterized.expand( # type: ignore From 283388d2b5e875b5b8e9526d3812de75b7a64cd9 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Mon, 6 May 2024 10:15:57 +0200 Subject: [PATCH 15/16] Update datasets/flwr_datasets/partitioner/natural_id_partitioner.py Co-authored-by: Javier --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 7b80b6172fad..3c7db0328fe3 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -49,7 +49,7 @@ class NaturalIdPartitioner(Partitioner): >>> from flwr_datasets import FederatedDataset >>> from flwr_datasets.partitioner import NaturalIdPartitioner >>> - >>> partitioner = NaturalIdPartitioner(partition_by="character_id") + >>> partitioner = NaturalIdPartitioner(partition_by="user") >>> fds = FederatedDataset(dataset="sentiment140", >>> partitioners={"train": partitioner}) >>> partition = fds.load_partition(0) From 6f050bf7e7e9cb81f660af503010312058d98bb8 Mon Sep 17 00:00:00 2001 From: Javier Date: Mon, 6 May 2024 12:25:53 +0100 Subject: [PATCH 16/16] Update datasets/flwr_datasets/partitioner/natural_id_partitioner.py --- datasets/flwr_datasets/partitioner/natural_id_partitioner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py index 3c7db0328fe3..85f1b3af43c2 100644 --- a/datasets/flwr_datasets/partitioner/natural_id_partitioner.py +++ b/datasets/flwr_datasets/partitioner/natural_id_partitioner.py @@ -26,7 +26,7 @@ class NaturalIdPartitioner(Partitioner): - """Partitioner for dataset that can be divided by a reference to id in dataset. + """Partitioner for a dataset that can be divided by a column with partition ids. Parameters ----------
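
For readers comparing the starting point with where the series ends up, the following is a minimal standalone sketch — not code from the PR — contrasting the original per-partition `Dataset.filter` calls with the single-pass index map plus `Dataset.select` that patches 11 and 12 settle on. The column name "speaker_id" echoes the speech_commands example that briefly appeared in patch 04, but the toy data and the `setdefault` call are invented for illustration; the PR itself builds each index list with an explicit membership check.

from typing import Dict, List

from datasets import Dataset

# Toy data; a None value is included because handling null natural ids is what
# patches 03 and 09 work around before the final approach makes it a non-issue.
partition_by = "speaker_id"
dataset = Dataset.from_dict(
    {"speaker_id": ["a", "b", "a", None, "b", "a"], "x": [0, 1, 2, 3, 4, 5]}
)

# Natural ids in order of appearance; partition ids are their positions.
unique_natural_ids = dataset.unique(partition_by)
natural_id_to_partition_id = {nid: pid for pid, nid in enumerate(unique_natural_ids)}

# Old approach (pre-series): one full filter scan of the dataset per partition.
old_partitions = [
    dataset.filter(lambda row, nid=nid: row[partition_by] == nid)
    for nid in unique_natural_ids
]

# New approach: a single pass collects row indices per natural id, after which
# each partition is a cheap select; None keys need no special replacement value.
natural_id_to_indices: Dict[object, List[int]] = {}
for index, natural_id in enumerate(dataset[partition_by]):
    natural_id_to_indices.setdefault(natural_id, []).append(index)

partition_id_to_indices = {
    natural_id_to_partition_id[nid]: idx for nid, idx in natural_id_to_indices.items()
}
new_partitions = [
    dataset.select(partition_id_to_indices[pid])
    for pid in range(len(unique_natural_ids))
]

# Both strategies yield the same partitions, row for row.
assert all(
    old[partition_by] == new[partition_by] and old["x"] == new["x"]
    for old, new in zip(old_partitions, new_partitions)
)

The speed-up the series is after comes from replacing num_partitions full passes over the dataset (one Arrow-backed filter per natural id) with a single Python pass over the column followed by lightweight select calls; the dictionary-based pass also removes the need for the None-replacement and dtype-check machinery introduced in patches 03, 08, and 09.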