From cb69493c9214864774fc52462be0cf0a52ceb0ad Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Thu, 29 Aug 2024 15:38:07 +0200 Subject: [PATCH 1/6] Add tests for pacs, cinic10, caltech101, office-home --- .../flwr_datasets/federated_dataset_test.py | 63 ++++++++++++++----- datasets/flwr_datasets/mock_utils_test.py | 2 + 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 64d75a7a7a5a..6ab9ad0ebda1 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -35,12 +35,29 @@ mocked_datasets = ["cifar100", "svhn", "sentiment140", "speech_commands"] +mocked_by_partial_download_datasets = [ + "flwrlabs/pacs", + "flwrlabs/cinic10", + "flwrlabs/caltech101", + "flwrlabs/office-home", +] + +natural_id_datasets = [ + "flwrlabs/femnist", +] + +mocked_natural_id_datasets = [ + "flwrlabs/ucf101", + "flwrlabs/ambient-acoustic-context", + "LIUM/tedlium", +] + @parameterized_class( ("dataset_name", "test_split", "subset"), [ - # Downloaded - # #Image datasets + ### Downloaded ### + # Image ("mnist", "test", ""), ("cifar10", "test", ""), ("fashion_mnist", "test", ""), @@ -52,8 +69,8 @@ ("scikit-learn/adult-census-income", None, ""), ("jlh/uci-mushrooms", None, ""), ("scikit-learn/iris", None, ""), - # Mocked - # #Image + ### Mocked by local recreation ### + # Image ("cifar100", "test", ""), # Note: there's also the extra split and full_numbers subset ("svhn", "test", "cropped_digits"), @@ -61,6 +78,12 @@ ("sentiment140", "test", ""), # aka twitter # Audio ("speech_commands", "test", "v0.01"), + ### Mocked by partial download ### + # Image + ("flwrlabs/pacs", None, ""), + ("flwrlabs/cinic10", "test", ""), + ("flwrlabs/caltech101", None, ""), + ("flwrlabs/office-home", None, ""), ], ) class BaseFederatedDatasetsTest(unittest.TestCase): @@ -86,10 +109,29 @@ def setUp(self) -> None: self.mock_load_dataset.return_value = _load_mocked_dataset( self.dataset_name, [200, 100], ["train", self.test_split], self.subset ) + elif self.dataset_name in mocked_by_partial_download_datasets: + split_names = ["train"] + skip_take_lists = [[(0, 30), (1000, 30), (2000, 40)]] + # If the dataset has split test update the mocking to include it + if self.test_split is not None: + split_names.append(self.test_split) + skip_take_lists.append([(0, 30), (100, 30), (200, 40)]) + mock_return_value = _load_mocked_dataset_dict_by_partial_download( + dataset_name=self.dataset_name, + split_names=split_names, + skip_take_lists=skip_take_lists, + subset_name=None if self.subset == "" else self.subset, + ) + self.patcher = patch("datasets.load_dataset") + self.mock_load_dataset = self.patcher.start() + self.mock_load_dataset.return_value = mock_return_value def tearDown(self) -> None: """Clean up after the dataset mocking.""" - if self.dataset_name in mocked_datasets: + if ( + self.dataset_name in mocked_datasets + or self.dataset_name in mocked_by_partial_download_datasets + ): patch.stopall() @parameterized.expand( # type: ignore @@ -403,17 +445,6 @@ def test_mixed_type_partitioners_creates_from_int(self) -> None: ) -natural_id_datasets = [ - "flwrlabs/femnist", -] - -mocked_natural_id_datasets = [ - "flwrlabs/ucf101", - "flwrlabs/ambient-acoustic-context", - "LIUM/tedlium", -] - - @parameterized_class( ("dataset_name", "test_split", "subset", "partition_by"), [ diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index 3324ad5e7f51..4cbcdc80e913 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -428,6 +428,8 @@ def _load_mocked_dataset_dict_by_partial_download( subset_name: Optional[str] = None, ) -> DatasetDict: """Like _load_mocked_dataset_by_partial_download but for many splits.""" + assert len(split_names) == len(skip_take_lists), "The split_names should be the" + "same length as the skip_take_lists." dataset_dict = {} for split_name, skip_take_list in zip(split_names, skip_take_lists): dataset_dict[split_name] = _load_mocked_dataset_by_partial_download( From d63ab020a0dad439aabdefa15249dde372771e64 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Fri, 30 Aug 2024 13:18:59 +0200 Subject: [PATCH 2/6] Fix formatting --- datasets/flwr_datasets/mock_utils_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index 4cbcdc80e913..bc1254eccbd0 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -428,8 +428,9 @@ def _load_mocked_dataset_dict_by_partial_download( subset_name: Optional[str] = None, ) -> DatasetDict: """Like _load_mocked_dataset_by_partial_download but for many splits.""" - assert len(split_names) == len(skip_take_lists), "The split_names should be the" - "same length as the skip_take_lists." + assert len(split_names) == len( + skip_take_lists + ), "The split_names should be thesame length as the skip_take_lists." dataset_dict = {} for split_name, skip_take_list in zip(split_names, skip_take_lists): dataset_dict[split_name] = _load_mocked_dataset_by_partial_download( From b5d0d35faa21689f4bcaa7b625111faf2ca89583 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 10 Sep 2024 11:06:00 +0200 Subject: [PATCH 3/6] Add fed_isic2019 --- datasets/flwr_datasets/federated_dataset_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 6ab9ad0ebda1..98dc65913d3e 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -40,6 +40,7 @@ "flwrlabs/cinic10", "flwrlabs/caltech101", "flwrlabs/office-home", + "flwrlabs/fed-isic2019", ] natural_id_datasets = [ @@ -84,6 +85,7 @@ ("flwrlabs/cinic10", "test", ""), ("flwrlabs/caltech101", None, ""), ("flwrlabs/office-home", None, ""), + ("flwrlabs/fed-isic2019", "test", ""), ], ) class BaseFederatedDatasetsTest(unittest.TestCase): From 445a89cfba997cbc1b0acb958d71619bf899d4d1 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 10 Sep 2024 11:06:11 +0200 Subject: [PATCH 4/6] Update tested datasets list --- datasets/flwr_datasets/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index 32904ded2861..b591d4d3e2ff 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -48,6 +48,11 @@ "Mike0307/MNIST-M", "flwrlabs/usps", "scikit-learn/iris", + "flwrlabs/pacs", + "flwrlabs/cinic10", + "flwrlabs/caltech101", + "flwrlabs/office-home", + "flwrlabs/fed-isic2019", ] From 1b9c5ce1c448e956b89516e9b8a5d5d0992e911a Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Tue, 10 Sep 2024 11:07:12 +0200 Subject: [PATCH 5/6] Remove more than 1 leading # --- datasets/flwr_datasets/federated_dataset_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 98dc65913d3e..846312acf468 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -57,7 +57,7 @@ @parameterized_class( ("dataset_name", "test_split", "subset"), [ - ### Downloaded ### + # Downloaded # Image ("mnist", "test", ""), ("cifar10", "test", ""), @@ -70,7 +70,7 @@ ("scikit-learn/adult-census-income", None, ""), ("jlh/uci-mushrooms", None, ""), ("scikit-learn/iris", None, ""), - ### Mocked by local recreation ### + # Mocked by local recreation # Image ("cifar100", "test", ""), # Note: there's also the extra split and full_numbers subset @@ -79,7 +79,7 @@ ("sentiment140", "test", ""), # aka twitter # Audio ("speech_commands", "test", "v0.01"), - ### Mocked by partial download ### + # Mocked by partial download # Image ("flwrlabs/pacs", None, ""), ("flwrlabs/cinic10", "test", ""), From 6f40e672ec4d4a9155d62204dafb9f83e7f4ad92 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Tue, 10 Sep 2024 19:05:12 +0100 Subject: [PATCH 6/6] format --- datasets/flwr_datasets/federated_dataset_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/federated_dataset_test.py b/datasets/flwr_datasets/federated_dataset_test.py index 846312acf468..6ccf06ccf073 100644 --- a/datasets/flwr_datasets/federated_dataset_test.py +++ b/datasets/flwr_datasets/federated_dataset_test.py @@ -70,7 +70,7 @@ ("scikit-learn/adult-census-income", None, ""), ("jlh/uci-mushrooms", None, ""), ("scikit-learn/iris", None, ""), - # Mocked by local recreation + # Mocked by local recreation # Image ("cifar100", "test", ""), # Note: there's also the extra split and full_numbers subset @@ -79,7 +79,7 @@ ("sentiment140", "test", ""), # aka twitter # Audio ("speech_commands", "test", "v0.01"), - # Mocked by partial download + # Mocked by partial download # Image ("flwrlabs/pacs", None, ""), ("flwrlabs/cinic10", "test", ""),