Commit 982d1d4

Merge branch 'main' of https://github.com/adap/flower into cpp-better-comms
charlesbvll committed Apr 4, 2024
2 parents ecf0aef + 9826ad9 commit 982d1d4
Showing 4 changed files with 39 additions and 18 deletions.
6 changes: 4 additions & 2 deletions datasets/flwr_datasets/utils.py
@@ -133,6 +133,7 @@ def divide_dataset(
    >>> train_test = divide_dataset(dataset=partition, division=division)
    >>> train, test = train_test["train"], train_test["test"]
    """
+   _check_division_config_correctness(division)
    dataset_length = len(dataset)
    ranges = _create_division_indices_ranges(dataset_length, division)
    if isinstance(division, (list, tuple)):
@@ -162,15 +163,15 @@ def _create_division_indices_ranges(
        for fraction in division:
            end_idx += int(dataset_length * fraction)
            ranges.append(range(start_idx, end_idx))
-           start_idx += end_idx
+           start_idx = end_idx
    elif isinstance(division, dict):
        ranges = []
        start_idx = 0
        end_idx = 0
        for fraction in division.values():
            end_idx += int(dataset_length * fraction)
            ranges.append(range(start_idx, end_idx))
-           start_idx += end_idx
+           start_idx = end_idx
    else:
        raise TypeError(
            f"The type of the `division` should be dict, "
@@ -274,6 +275,7 @@ def concatenate_divisions(
    concatenated_divisions : Dataset
        A dataset created as concatenation of the divisions from all partitions.
    """
+   _check_division_config_correctness(partition_division)
    divisions = []
    zero_len_divisions = 0
    for partition_id in range(partitioner.num_partitions):
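For context, the change above replaces `start_idx += end_idx` with `start_idx = end_idx`, which matters once there are three or more divisions. A minimal sketch, not part of the diff, using plain Python and a 40-example dataset:

```python
# Reproduce the fixed index arithmetic from _create_division_indices_ranges.
dataset_length = 40
division = [0.6, 0.2, 0.2]

ranges = []
start_idx = 0
end_idx = 0
for fraction in division:
    end_idx += int(dataset_length * fraction)
    ranges.append(range(start_idx, end_idx))
    start_idx = end_idx  # each range starts exactly where the previous ended

print(ranges)  # [range(0, 24), range(24, 32), range(32, 40)]
# With the old `start_idx += end_idx`, start_idx would reach 24 + 32 = 56
# after the second division, making the third range the empty range(56, 40).
```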
23 changes: 21 additions & 2 deletions datasets/flwr_datasets/utils_test.py
@@ -31,13 +31,32 @@
"expected_concatenation_size",
),
[
# Create 1 division
((1.0,), [40], 0, 40),
({"train": 1.0}, [40], "train", 40),
# Create 2 divisions
((0.8, 0.2), [32, 8], 1, 8),
([0.8, 0.2], [32, 8], 1, 8),
({"train": 0.8, "test": 0.2}, [32, 8], "test", 8),
# Create 3 divisions
([0.6, 0.2, 0.2], [24, 8, 8], 1, 8),
({"train": 0.6, "valid": 0.2, "test": 0.2}, [24, 8, 8], "test", 8),
# Create 4 divisions
([0.4, 0.2, 0.2, 0.2], [16, 8, 8, 8], 1, 8),
({"0": 0.4, "1": 0.2, "2": 0.2, "3": 0.2}, [16, 8, 8, 8], "1", 8),
# Not full dataset
# Create 1 division
([0.8], [32], 0, 32),
({"train": 0.8}, [32], "train", 32),
# Create 2 divisions
([0.2, 0.1], [8, 4], 1, 4),
((0.2, 0.1), [8, 4], 0, 8),
({"train": 0.2, "test": 0.1}, [8, 4], "test", 4),
# Create 3 divisions
([0.6, 0.2, 0.1], [24, 8, 4], 2, 4),
({"train": 0.6, "valid": 0.2, "test": 0.1}, [24, 8, 4], "test", 4),
# Create 4 divisions
([0.4, 0.2, 0.1, 0.2], [16, 8, 4, 8], 2, 4),
({"0": 0.4, "1": 0.2, "2": 0.1, "3": 0.2}, [16, 8, 4, 8], "2", 4),
],
)
class UtilsTests(unittest.TestCase):
@@ -60,7 +79,7 @@ def test_correct_sizes(self) -> None:
        else:
            lengths = [len(split) for split in divided_dataset.values()]

-       self.assertEqual(lengths, self.sizes)
+       self.assertEqual(self.sizes, lengths)

    def test_correct_return_types(self) -> None:
        """Test correct types of the divided dataset based on the config."""
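The parametrized cases above pair each `division` config with the expected split sizes for a 40-example dataset. A minimal usage sketch, assuming `divide_dataset` is importable from `flwr_datasets.utils` and the data is a Hugging Face `Dataset`:

```python
from datasets import Dataset
from flwr_datasets.utils import divide_dataset

dataset = Dataset.from_dict({"x": list(range(40))})

# A dict division returns named splits, as in the docstring example above.
train_test = divide_dataset(dataset=dataset, division={"train": 0.8, "test": 0.2})
print(len(train_test["train"]), len(train_test["test"]))  # 32 8

# A list division returns positionally indexed splits.
splits = divide_dataset(dataset=dataset, division=[0.6, 0.2, 0.2])
print([len(s) for s in splits])  # [24, 8, 8]
```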
4 changes: 2 additions & 2 deletions examples/custom-mods/README.md
@@ -288,7 +288,7 @@ $ tree .
pip install -r requirements.txt
```

- For [W&B](wandb.ai) you will also need a valid account.
+ For [W&B](https://wandb.ai) you will also need a valid account.

### Start the long-running Flower server (SuperLink)

@@ -328,7 +328,7 @@ flower-server-app server:app --insecure

### Check the results

- For W&B, you will need to login to the [website](wandb.ai).
+ For W&B, you will need to login to the [website](https://wandb.ai).

For TensorBoard, you will need to run the following command in your terminal:

24 changes: 12 additions & 12 deletions examples/vertical-fl/README.md
@@ -123,7 +123,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data:
'Adult' for ages between 11 and 40, and 'Elderly' for those over 40. If the age
isn't listed, we'll label it as 'Unknown'.

- ```python3
+ ```python
def _bin_age(age_series):
    bins = [-np.inf, 10, 40, np.inf]
    labels = ["Child", "Adult", "Elderly"]
@@ -138,7 +138,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data:
understand social status and family roles, simplifying rare titles into a single
'Rare' category and converting any French titles to their English equivalents.

- ```python3
+ ```python
def _extract_title(name_series):
    titles = name_series.str.extract(" ([A-Za-z]+)\.", expand=False)
    rare_titles = {
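        # Continuation sketch, not part of the diff: the actual entries are
        # elided by the diff view; these are typical rare Titanic titles,
        # listed for illustration only.
        "Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir",
    }
    # Collapse rare titles and map French titles to their English
    # equivalents, as described in the text above (sketch, assuming
    # pandas' Series.replace).
    titles = titles.replace(list(rare_titles), "Rare")
    titles = titles.replace({"Mlle": "Miss", "Mme": "Mrs"})
    return titles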
@@ -170,7 +170,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data:
'Pclass', 'Embarked', 'Title', 'Cabin', and the binned 'Age' into One-Hot
encodings.

- ```python3
+ ```python
def _create_features(df):
    # Convert 'Age' to numeric, coercing errors to NaN
    df["Age"] = pd.to_numeric(df["Age"], errors="coerce")
@@ -190,7 +190,7 @@ In `task.py`, you'll find the preprocessing functions we'll apply to our data:
In `task.py`, we also partition our data for our 3 clients to mirror real-life
collaborations where different organizations hold different feature sets:

- ```python3
+ ```python
def _partition_data(df, all_keywords):
    partitions = []
    keywords_sets = [{"Parch", "Cabin", "Pclass"}, {"Sex", "Title"}]
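    # Continuation sketch, not part of the diff (an assumption consistent
    # with the 3-client setup, and assuming `all_keywords` is a set): give
    # each keyword set's columns to one client and everything left over to
    # the third.
    keywords_sets.append(all_keywords - keywords_sets[0] - keywords_sets[1])
    for keywords in keywords_sets:
        columns = [c for c in df.columns if any(k in c for k in keywords)]
        partitions.append(df[columns])
    return partitions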
@@ -236,7 +236,7 @@ collective intelligence without sharing sensitive information.

Note that our final data processing function looks like this:

- ```python3
+ ```python
def get_partitions_and_label():
    df = pd.read_csv("_static/data/train.csv")
    processed_df = df.dropna(subset=["Embarked", "Fare"]).copy()
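    # Continuation sketch, not part of the diff (assumptions): run the
    # feature pipeline, split off the "Survived" labels, and partition the
    # remaining features across the three clients.
    processed_df = _create_features(processed_df)
    labels = processed_df["Survived"].values
    features = processed_df.drop(columns=["Survived"])
    partitions = _partition_data(features, set(features.columns))
    return partitions, labels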
@@ -259,7 +259,7 @@ Each client's model is a neural network designed to operate on a distinct subset
of features held by a client. In this example we will use simple linear
regression models.

- ```python3
+ ```python
class ClientModel(nn.Module):
    def __init__(self, input_size):
        super(ClientModel, self).__init__()
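        # Continuation sketch, not part of the diff: a single linear layer,
        # consistent with "simple linear regression models" above and with
        # ServerModel(12) further down (3 clients x 4-dim outputs = 12).
        self.fc = nn.Linear(input_size, 4)

    def forward(self, x):
        return self.fc(x)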
@@ -281,7 +281,7 @@ The server's model acts as the central aggregator in the VFL system. It's also a
neural network but with a slightly different architecture tailored to its role
in aggregating the client models' outputs.

- ```python3
+ ```python
class ServerModel(nn.Module):
    def __init__(self):
        super(ServerModel, self).__init__()
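        # Continuation sketch, not part of the diff (an assumption): map the
        # concatenated 12-dim client outputs to a single survival
        # probability, matching the description below.
        self.fc = nn.Linear(12, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        return self.sigmoid(self.fc(x))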
@@ -305,7 +305,7 @@ a probability score indicative of the likelihood of survival.
The strategy we will write to perform the aggregation will inherit from `FedAvg`
and set the following additional attributes:

- ```python3
+ ```python
self.model = ServerModel(12)
self.initial_parameters = ndarrays_to_parameters(
    [val.cpu().numpy() for _, val in self.model.state_dict().items()]
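)
# Continuation sketch, not part of the diff (assumptions): an optimizer and
# loss for the server model, plus the labels handed to the strategy,
# reshaped to match the (N, 1) model output.
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)
self.criterion = nn.BCELoss()
self.label = torch.tensor(labels).float().unsqueeze(1)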
@@ -319,7 +319,7 @@ With `labels` given as an argument to the strategy.

We then redefine the `aggregate_fit` method:

- ```python3
+ ```python
def aggregate_fit(
    self,
    rnd,
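    results,
    failures,
):
    # Continuation sketch, not part of the diff; the body is elided. In a
    # vertical-FL round it would, roughly: collect each client's embedding
    # from `results`, concatenate them, run the server model forward,
    # compute the loss against the stored labels, backpropagate, and return
    # per-client embedding gradients so each client can update locally.
    ...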
@@ -406,7 +406,7 @@ The last thing we have to do is to redefine the `aggregate_evaluate` function to
disable distributed evaluation (as the clients do not hold any labels to test
their local models).

- ```python3
+ ```python
def aggregate_evaluate(
    self,
    rnd,
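    results,
    failures,
):
    # Continuation sketch, not part of the diff: the clients hold no labels,
    # so there is nothing to aggregate server-side and the method can simply
    # report no loss and no metrics (an assumption based on the text above).
    return None, {}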
@@ -420,7 +420,7 @@

Our `FlowerClient` class is going to be quite straightforward.

- ```python3
+ ```python
class FlowerClient(fl.client.NumPyClient):
    def __init__(self, cid, data):
        self.cid = cid
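        # Continuation sketch, not part of the diff (assumptions): each
        # client wraps its own feature partition in a tensor and sizes its
        # ClientModel to that partition; fit() would then return embeddings
        # rather than model weights.
        self.train = torch.tensor(data.values).float()
        self.model = ClientModel(input_size=self.train.shape[1])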
@@ -487,7 +487,7 @@ the `aggregate_evaluate` function of the strategy.
Putting everything together, to start our simulation we use the following
function:

- ```python3
+ ```python
hist = fl.simulation.start_simulation(
    client_fn=client_fn,
    num_clients=3,
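    # Continuation sketch, not part of the diff: plausible remaining
    # arguments (assumptions) -- a round budget and the custom strategy.
    config=fl.server.ServerConfig(num_rounds=10),
    strategy=strategy,  # an instance of the custom strategy defined above
)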
