From e8cb7abbcd216d9f662da03d2fb426d95797ef2f Mon Sep 17 00:00:00 2001
From: KarhouTam
Date: Fri, 21 Jun 2024 00:30:22 +0800
Subject: [PATCH] Reformat

---
 .../flwr_datasets/partitioner/__init__.py     |  2 +-
 .../partitioner/semantic_partitioner.py       | 48 +++++++++----------
 2 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/datasets/flwr_datasets/partitioner/__init__.py b/datasets/flwr_datasets/partitioner/__init__.py
index 72e696a606c6..59435d072c60 100644
--- a/datasets/flwr_datasets/partitioner/__init__.py
+++ b/datasets/flwr_datasets/partitioner/__init__.py
@@ -35,7 +35,7 @@
     "LinearPartitioner",
     "NaturalIdPartitioner",
     "Partitioner",
-    "SemanticPartitioner"
+    "SemanticPartitioner",
     "ShardPartitioner",
     "SizePartitioner",
     "SquarePartitioner",
diff --git a/datasets/flwr_datasets/partitioner/semantic_partitioner.py b/datasets/flwr_datasets/partitioner/semantic_partitioner.py
index f43a38e148d6..4f64cf3ee965 100644
--- a/datasets/flwr_datasets/partitioner/semantic_partitioner.py
+++ b/datasets/flwr_datasets/partitioner/semantic_partitioner.py
@@ -44,31 +44,31 @@ class SemanticPartitioner(Partitioner):
     References:
         What Do We Mean by Generalization in Federated Learning?
         (accepted by ICLR 2022)
-        https://arxiv.org/abs/2110.14216 
+        https://arxiv.org/abs/2110.14216
         (Cited from section 4.1 in the paper)
-
-    Semantic partitioner's goal is to reverse-engineer the federated dataset-generating 
-    process so that each client possesses semantically similar data. For example, for 
-    the EMNIST dataset, we expect every client (writer) to (i) write in a consistent style 
-    for each digit (intra-client intra-label similarity) and (ii) use a consistent writing 
-    style across all digits (intra-client inter-label similarity). A simple approach might 
-    be to cluster similar examples together and sample client data from clusters. However, 
-    if one directly clusters the entire dataset, the resulting clusters may end up largely 
-    correlated to labels. 
To disentangle the effect of label heterogeneity and semantic 
-    heterogeneity, we propose the following algorithm to enforce intra-client intra-label 
+
+    Semantic partitioner's goal is to reverse-engineer the federated dataset-generating
+    process so that each client possesses semantically similar data. For example, for
+    the EMNIST dataset, we expect every client (writer) to (i) write in a consistent style
+    for each digit (intra-client intra-label similarity) and (ii) use a consistent writing
+    style across all digits (intra-client inter-label similarity). A simple approach might
+    be to cluster similar examples together and sample client data from clusters. However,
+    if one directly clusters the entire dataset, the resulting clusters may end up largely
+    correlated to labels. To disentangle the effect of label heterogeneity and semantic
+    heterogeneity, we propose the following algorithm to enforce intra-client intra-label
     similarity and intra-client inter-label similarity in two separate stages.
-
-    • Stage 1: For each label, we embed examples using a pretrained neural network 
-    (extracting semantic features), and fit a Gaussian Mixture Model to cluster pretrained 
-    embeddings into groups. Note that this results in multiple groups per label. 
+
+    • Stage 1: For each label, we embed examples using a pretrained neural network
+    (extracting semantic features), and fit a Gaussian Mixture Model to cluster pretrained
+    embeddings into groups. Note that this results in multiple groups per label.
     This stage enforces intra-client intra-label consistency.
-
-    • Stage 2: To package the clusters from different labels into clients, we aim to compute 
-    an optimal multi-partite matching with cost-matrix defined by KL-divergence between 
-    the Gaussian clusters. To reduce complexity, we heuristically solve the optimal multi-partite 
-    matching by progressively solving the optimal bipartite matching at each time for 
-    randomly-chosen label pairs. 

+
+    • Stage 2: To package the clusters from different labels into clients, we aim to compute
+    an optimal multi-partite matching with cost-matrix defined by KL-divergence between
+    the Gaussian clusters. To reduce complexity, we heuristically solve the optimal multi-partite
+    matching by progressively solving the optimal bipartite matching at each time for
+    randomly-chosen label pairs.
     This stage enforces intra-client inter-label consistency.
 
     Parameters
     ----------
@@ -338,9 +338,7 @@ def _determine_partition_id_to_indices_if_needed(self) -> None:
         self._partition_id_to_indices_determined = True
     def _preprocess_dataset_images(self):
-        images = np.array(
-            self.dataset[self._data_column_name], dtype=np.float32
-        )
+        images = np.array(self.dataset[self._data_column_name], dtype=np.float32)
         if len(images.shape) == 3:  # 1D
             images = np.reshape(
                 images, (images.shape[0], 1, images.shape[1], images.shape[2])
             )
@@ -401,7 +399,7 @@ def _check_data_validation_if_needed(self):
                 )
         elif len(data.shape) == 3:
             x, y, z = data.shape
-            if not ((x < y and x < z) or (z < x and z < y)) :
+            if not ((x < y and x < z) or (z < x and z < y)):
                 raise ValueError(
                     "The 3D image shape should be [C, H, W] or [H, W, C]. "
                     f"Now: {data.shape}. "