diff --git a/datasets/flwr_datasets/utils.py b/datasets/flwr_datasets/utils.py index a6e4fa8d0f0b..346d897ccdd6 100644 --- a/datasets/flwr_datasets/utils.py +++ b/datasets/flwr_datasets/utils.py @@ -133,6 +133,7 @@ def divide_dataset( >>> train_test = divide_dataset(dataset=partition, division=division) >>> train, test = train_test["train"], train_test["test"] """ + _check_division_config_correctness(division) dataset_length = len(dataset) ranges = _create_division_indices_ranges(dataset_length, division) if isinstance(division, (list, tuple)): @@ -162,7 +163,7 @@ def _create_division_indices_ranges( for fraction in division: end_idx += int(dataset_length * fraction) ranges.append(range(start_idx, end_idx)) - start_idx += end_idx + start_idx = end_idx elif isinstance(division, dict): ranges = [] start_idx = 0 @@ -170,7 +171,7 @@ def _create_division_indices_ranges( for fraction in division.values(): end_idx += int(dataset_length * fraction) ranges.append(range(start_idx, end_idx)) - start_idx += end_idx + start_idx = end_idx else: TypeError( f"The type of the `division` should be dict, " @@ -274,6 +275,7 @@ def concatenate_divisions( concatenated_divisions : Dataset A dataset created as concatenation of the divisions from all partitions. """ + _check_division_config_correctness(partition_division) divisions = [] zero_len_divisions = 0 for partition_id in range(partitioner.num_partitions): diff --git a/datasets/flwr_datasets/utils_test.py b/datasets/flwr_datasets/utils_test.py index 3bf5afddf978..4add9f88eeb5 100644 --- a/datasets/flwr_datasets/utils_test.py +++ b/datasets/flwr_datasets/utils_test.py @@ -31,13 +31,32 @@ "expected_concatenation_size", ), [ + # Create 1 division + ((1.0,), [40], 0, 40), + ({"train": 1.0}, [40], "train", 40), + # Create 2 divisions ((0.8, 0.2), [32, 8], 1, 8), - ([0.8, 0.2], [32, 8], 1, 8), ({"train": 0.8, "test": 0.2}, [32, 8], "test", 8), + # Create 3 divisions + ([0.6, 0.2, 0.2], [24, 8, 8], 1, 8), + ({"train": 0.6, "valid": 0.2, "test": 0.2}, [24, 8, 8], "test", 8), + # Create 4 divisions + ([0.4, 0.2, 0.2, 0.2], [16, 8, 8, 8], 1, 8), + ({"0": 0.4, "1": 0.2, "2": 0.2, "3": 0.2}, [16, 8, 8, 8], "1", 8), # Not full dataset + # Create 1 division + ([0.8], [32], 0, 32), + ({"train": 0.8}, [32], "train", 32), + # Create 2 divisions ([0.2, 0.1], [8, 4], 1, 4), ((0.2, 0.1), [8, 4], 0, 8), ({"train": 0.2, "test": 0.1}, [8, 4], "test", 4), + # Create 3 divisions + ([0.6, 0.2, 0.1], [24, 8, 4], 2, 4), + ({"train": 0.6, "valid": 0.2, "test": 0.1}, [24, 8, 4], "test", 4), + # Create 4 divisions + ([0.4, 0.2, 0.1, 0.2], [16, 8, 4, 8], 2, 4), + ({"0": 0.4, "1": 0.2, "2": 0.1, "3": 0.2}, [16, 8, 4, 8], "2", 4), ], ) class UtilsTests(unittest.TestCase): @@ -60,7 +79,7 @@ def test_correct_sizes(self) -> None: else: lengths = [len(split) for split in divided_dataset.values()] - self.assertEqual(lengths, self.sizes) + self.assertEqual(self.sizes, lengths) def test_correct_return_types(self) -> None: """Test correct types of the divided dataset based on the config.""" diff --git a/examples/custom-mods/README.md b/examples/custom-mods/README.md index b0ad668c2dec..6b03abcfbfe0 100644 --- a/examples/custom-mods/README.md +++ b/examples/custom-mods/README.md @@ -288,7 +288,7 @@ $ tree . pip install -r requirements.txt ``` -For [W&B](wandb.ai) you will also need a valid account. +For [W&B](https://wandb.ai) you will also need a valid account. 
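A note on the `flwr_datasets` change above: with the old `start_idx += end_idx`, only one- and two-way divisions came out right. The start offset drifts from the third range onward; for a 40-row dataset and `[0.6, 0.2, 0.2]`, the old code produced `range(0, 24)`, `range(24, 32)`, and then the empty `range(56, 40)`. Here is a minimal standalone sketch of the corrected loop (the function name is illustrative, not the library's private helper):

```python
def create_division_ranges(dataset_length, fractions):
    """Illustrative sketch of the fixed index-range construction."""
    ranges = []
    start_idx = 0
    end_idx = 0
    for fraction in fractions:
        end_idx += int(dataset_length * fraction)
        ranges.append(range(start_idx, end_idx))
        start_idx = end_idx  # the fix: assign, do not accumulate
    return ranges

print(create_division_ranges(40, [0.6, 0.2, 0.2]))
# [range(0, 24), range(24, 32), range(32, 40)]
```

This also explains why the test matrix grows the way it does: the pre-existing 2-division cases passed by coincidence, so the new 3- and 4-division cases are the ones that actually exercise the fix.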
### Start the long-running Flower server (SuperLink) @@ -328,7 +328,7 @@ flower-server-app server:app --insecure ### Check the results -For W&B, you will need to login to the [website](wandb.ai). +For W&B, you will need to login to the [website](https://wandb.ai). For TensorBoard, you will need to run the following command in your terminal: diff --git a/examples/vertical-fl/README.md b/examples/vertical-fl/README.md index 78588180d3d6..d8c599d617c4 100644 --- a/examples/vertical-fl/README.md +++ b/examples/vertical-fl/README.md @@ -123,7 +123,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data: 'Adult' for ages between 11 and 40, and 'Elderly' for those over 40. If the age isn't listed, we'll label it as 'Unknown'. - ```python3 + ```python def _bin_age(age_series): bins = [-np.inf, 10, 40, np.inf] labels = ["Child", "Adult", "Elderly"] @@ -138,7 +138,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data: understand social status and family roles, simplifying rare titles into a single 'Rare' category and converting any French titles to their English equivalents. - ```python3 + ```python def _extract_title(name_series): titles = name_series.str.extract(" ([A-Za-z]+)\.", expand=False) rare_titles = { @@ -170,7 +170,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data: 'Pclass', 'Embarked', 'Title', 'Cabin', and the binned 'Age' into One-Hot encodings. - ```python3 + ```python def _create_features(df): # Convert 'Age' to numeric, coercing errors to NaN df["Age"] = pd.to_numeric(df["Age"], errors="coerce") @@ -190,7 +190,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data: In `task.py`, we also partition our data for our 3 clients to mirror real-life collaborations where different organizations hold different feature sets: -```python3 +```python def _partition_data(df, all_keywords): partitions = [] keywords_sets = [{"Parch", "Cabin", "Pclass"}, {"Sex", "Title"}] @@ -236,7 +236,7 @@ collective intelligence without sharing sensitive information. Note that our final data processing function looks like that: -```python3 +```python def get_partitions_and_label(): df = pd.read_csv("_static/data/train.csv") processed_df = df.dropna(subset=["Embarked", "Fare"]).copy() @@ -259,7 +259,7 @@ Each client's model is a neural network designed to operate on a distinct subset of features held by a client. In this example we will use simple linear regression models. -```python3 +```python class ClientModel(nn.Module): def __init__(self, input_size): super(ClientModel, self).__init__() @@ -281,7 +281,7 @@ The server's model acts as the central aggregator in the VFL system. It's also a neural network but with a slightly different architecture tailored to its role in aggregating the client models' outputs. -```python3 +```python class ServerModel(nn.Module): def __init__(self): super(ServerModel, self).__init__() @@ -305,7 +305,7 @@ a probability score indicative of the likelihood of survival. The strategy we will write to perform the aggregation will inherit from `FedAvg` and set the following additional attributes: -```python3 +```python self.model = ServerModel(12) self.initial_parameters = ndarrays_to_parameters( [val.cpu().numpy() for _, val in self.model.state_dict().items()] @@ -319,7 +319,7 @@ With `labels` given as an argument to the strategy. 
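To make the shapes concrete before looking at `aggregate_fit`: the server model consumes the concatenation of the client embeddings, so `ServerModel(12)` lines up with 3 clients if each emits a 4-dimensional embedding. That width is an assumption chosen for illustration (the example's actual client models may differ); a minimal sketch of the concatenation and the server-side forward pass:

```python
import torch
import torch.nn as nn

# Hypothetical stand-ins: three clients, each producing a 4-dim
# embedding for a batch of 8 passengers (widths are assumptions).
client_outputs = [torch.randn(8, 4) for _ in range(3)]

# A simplified server head mapping the 12-dim concatenation to a
# survival probability, mirroring ServerModel(12) above.
server_head = nn.Sequential(nn.Linear(12, 1), nn.Sigmoid())

embeddings = torch.cat(client_outputs, dim=1)  # shape (8, 12)
probs = server_head(embeddings)                # shape (8, 1)
print(probs.shape)
```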
We then redefine the `aggregate_fit` method: -```python3 +```python def aggregate_fit( self, rnd, @@ -406,7 +406,7 @@ The last thing we have to do is to redefine the `aggregate_evaluate` function to disable distributed evaluation (as the clients do not hold any labels to test their local models). -```python3 +```python def aggregate_evaluate( self, rnd, @@ -420,7 +420,7 @@ def aggregate_evaluate( Our `FlowerClient` class is going to be quite straight forward. -```python3 +```python class FlowerClient(fl.client.NumPyClient): def __init__(self, cid, data): self.cid = cid @@ -487,7 +487,7 @@ the `aggregate_evaluate` function of the strategy. Putting everything together, to start our simulation we use the following function: -```python3 +```python hist = fl.simulation.start_simulation( client_fn=client_fn, num_clients=3,
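The part of `aggregate_fit` elided above is where the vertical-FL mechanics live: the server concatenates the clients' embeddings, runs its forward and backward pass against the labels only it holds, and returns to each client just the gradient slice for that client's embedding, so raw features never leave their owner. A sketch of that idea under the same assumed shapes as before, not the example's exact code:

```python
import torch
import torch.nn as nn

# Assumed shapes: 3 clients x 4-dim embeddings, batch of 8 labels.
labels = torch.randint(0, 2, (8, 1)).float()
client_embeddings = [
    torch.randn(8, 4, requires_grad=True) for _ in range(3)
]

server_head = nn.Sequential(nn.Linear(12, 1), nn.Sigmoid())
output = server_head(torch.cat(client_embeddings, dim=1))
loss = nn.BCELoss()(output, labels)
loss.backward()

# Each client would receive the gradient w.r.t. its own embedding only.
grads_per_client = [e.grad for e in client_embeddings]
print([g.shape for g in grads_per_client])  # three (8, 4) tensors
```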