From 6235372e93cf549428ce31ebac3547f663317d3d Mon Sep 17 00:00:00 2001 From: Chong Shen Ng Date: Wed, 17 Jul 2024 13:37:44 +0100 Subject: [PATCH] Add expression for distribution partitioner --- .../partitioner/distribution_partitioner.py | 26 ++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/datasets/flwr_datasets/partitioner/distribution_partitioner.py b/datasets/flwr_datasets/partitioner/distribution_partitioner.py index d98b7d62a1e2..f6567846f6ff 100644 --- a/datasets/flwr_datasets/partitioner/distribution_partitioner.py +++ b/datasets/flwr_datasets/partitioner/distribution_partitioner.py @@ -28,9 +28,29 @@ class DistributionPartitioner(Partitioner): # pylint: disable=R0902 """Partitioner based on a distribution. - Inspired from implementations of Li et al. - Federated Optimization in Heterogeneous Networks (2020) - https://arxiv.org/abs/1812.06127. + Inspired by implementations of Li et al. Federated Optimization in + Heterogeneous Networks (2020) https://arxiv.org/abs/1812.06127. + + Given a user-specified distribution, the algorithm splits the dataset for each + unique label per partition where each label is assigned to the partitions in a + deterministic pathological manner. That is, given a distribution array of shape + ( `num_unique_labels`, `num_columns` ), where `num_columns` = + `num_unique_labels_per_partition` * `num_partitions` / `num_unique_labels`, + the label_id at the i'th row is assigned to the partition_id based on the formula: + partition_id = alpha + beta + where, + alpha* = (i - num_unique_labels_per_partition + 1) \ + + (j % num_unique_labels_per_partition) + alpha = alpha* + (alpha* >= 0 ? 0 : num_unique_labels) + beta = num_unique_labels * (j // num_unique_labels_per_partition) + and j in {0, 1, 2, ..., `num_columns` - 1}. Each list representing the partition_ids for + the i'th row is sorted in ascending order. 
So, for a dataset with 10 unique labels + and a configuration with 20 partitions and 2 unique labels per partition, the 0'th + row of the distribution array (corresponding to class 0) will be assigned to + partitions [0, 9, 10, 19], 1st row (class 1) to [0, 1, 10, 11], 2nd row (class 2) + to [1, 2, 11, 12], 3rd row (class 3) to [2, 3, 12, 13], and so on. + The list representing the unique labels is sorted in ascending order. Parameters ----------