From 915a6f38cd4d1ef77d09e6c1c217a99fbe1dc82f Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Sat, 20 Jul 2024 13:51:20 +0200 Subject: [PATCH 1/4] Add function to perform partial download of dataset for tests --- datasets/flwr_datasets/mock_utils_test.py | 46 ++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index bd49de8033de..fe118ff19319 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -19,7 +19,7 @@ import random import string from datetime import datetime, timedelta -from typing import Any, Dict, List, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import numpy as np from PIL import Image @@ -375,3 +375,47 @@ def _load_mocked_dataset( for params in zip(num_rows, split_names): dataset_dict[params[1]] = dataset_creation_fnc(params[0]) return datasets.DatasetDict(dataset_dict) + + +def _download_partial_dataset( + dataset_name: str, + split_name: str, + skip_take_list: List[Tuple[int, int]], + subset_name: Optional[str] = None, +) -> Dataset: + """Download a partial dataset. + + This functionality is not supported in the datasets library. This is an informal + way of achieving this by using the `streaming=True` and creating a dataset.Dataset + from in-memory objects. + + Parameters + ---------- + dataset_name: str + Name of the dataset (passed to load_dataset). + split_name: str + Name of the split (passed to load_dataset) e.g. "train". + skip_take_list: List[Tuple[int, int]] + The streaming mode has a specific type of accessing the data, the first tuple + value is how many samples to skip, the second is how many samples to take. Due + to this mechanism, diverse samples can be taken (especially if the dataset is + sorted by the natural_id for NaturalIdPartitioner). + subset_name: Optional[str] + Name of the subset (passed to load_dataset) e.g. 
"v0.01" for speech_commands. + + Returns + ------- + dataset: Dataset + The dataset with the requested samples. + """ + dataset = datasets.load_dataset( + dataset_name, name=subset_name, split=split_name, streaming=True + ) + dataset_list = [] + # It's a list of dict such that each dict represents a single sample of the dataset + # The sample is exactly the same as if the full dataset was downloaded and indexed + for skip, take in skip_take_list: + # dataset.skip(n).take(m) in streaming mode is equivalent (in terms of return) + # to the fully downloaded dataset index: dataset[n: n + m] + dataset_list.extend(list(dataset.skip(skip).take(take))) + return Dataset.from_list(dataset_list) From e81680d1cbed28b892041142bc75fe0ec67dcec4 Mon Sep 17 00:00:00 2001 From: Adam Narozniak Date: Sat, 20 Jul 2024 15:26:03 +0200 Subject: [PATCH 2/4] Add a function that handles multiple splits --- datasets/flwr_datasets/mock_utils_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index fe118ff19319..93e771b5bb60 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -377,7 +377,7 @@ def _load_mocked_dataset( return datasets.DatasetDict(dataset_dict) -def _download_partial_dataset( +def _load_mocked_dataset_by_partial_download( dataset_name: str, split_name: str, skip_take_list: List[Tuple[int, int]], @@ -419,3 +419,18 @@ def _download_partial_dataset( # to the fully downloaded dataset index: dataset[n: n + m] dataset_list.extend(list(dataset.skip(skip).take(take))) return Dataset.from_list(dataset_list) + + +def _load_mocked_dataset_dict_by_partial_download( + dataset_name: str, + split_names: List[str], + skip_take_lists: List[List[Tuple[int, int]]], + subset_name: Optional[str] = None, +) -> DatasetDict: + """Like _load_mocked_dataset_by_partial_download but for many splits.""" + dataset_dict = 
{} + for split_name, skip_take_list in zip(split_names, skip_take_lists): + dataset_dict[split_name] = _load_mocked_dataset_by_partial_download( + dataset_name, split_name, skip_take_list, subset_name + ) + return DatasetDict(dataset_dict) From 93764337dce8ef4aa771ccefbd6b77c6823b0948 Mon Sep 17 00:00:00 2001 From: Adam Narozniak <51029327+adam-narozniak@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:33:13 +0200 Subject: [PATCH 3/4] Update datasets/flwr_datasets/mock_utils_test.py Co-authored-by: Javier --- datasets/flwr_datasets/mock_utils_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index 93e771b5bb60..80e699aed388 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -386,8 +386,8 @@ def _load_mocked_dataset_by_partial_download( """Download a partial dataset. This functionality is not supported in the datasets library. This is an informal - way of achieving this by using the `streaming=True` and creating a dataset.Dataset - from in-memory objects. + way of achieving partial dataset download by using the `streaming=True` and creating + a dataset.Dataset from in-memory objects. Parameters ---------- From ea413e83f840197066ff20f95968ae27d8fea113 Mon Sep 17 00:00:00 2001 From: jafermarq Date: Mon, 22 Jul 2024 14:35:32 +0100 Subject: [PATCH 4/4] format --- datasets/flwr_datasets/mock_utils_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/flwr_datasets/mock_utils_test.py b/datasets/flwr_datasets/mock_utils_test.py index 80e699aed388..7ee3bae890ff 100644 --- a/datasets/flwr_datasets/mock_utils_test.py +++ b/datasets/flwr_datasets/mock_utils_test.py @@ -386,7 +386,7 @@ def _load_mocked_dataset_by_partial_download( """Download a partial dataset. This functionality is not supported in the datasets library. 
This is an informal - way of achieving partial dataset download by using the `streaming=True` and creating + way of achieving partial dataset download by using the `streaming=True` and creating a dataset.Dataset from in-memory objects. Parameters