Add compute_counts and frequencies fnc

adap · Jun 5, 2024 · 80131b0 · 80131b0
1 parent 9e7d28d
commit 80131b0
Show file tree

Hide file tree

Showing 3 changed files with 154 additions and 66 deletions.
diff --git a/datasets/flwr_datasets/metrics/__init__.py b/datasets/flwr_datasets/metrics/__init__.py
@@ -15,9 +15,9 @@
 """Metrics package."""
 
 
-from flwr_datasets.metrics.utils import compute_counts, compute_frequency
+from flwr_datasets.metrics.utils import compute_counts, compute_frequencies
 
 __all__ = [
     "compute_counts",
-    "compute_frequency",
+    "compute_frequencies",
 ]
diff --git a/datasets/flwr_datasets/metrics/utils.py b/datasets/flwr_datasets/metrics/utils.py
@@ -13,14 +13,139 @@
 # limitations under the License.
 # ==============================================================================
 """Utils for metrics computation."""
-
-
-from typing import List, Union
+import warnings
+from typing import List, Optional, Union
 
 import pandas as pd
 
+from flwr_datasets.partitioner import Partitioner
+
 
 def compute_counts(
+    partitioner: Partitioner,
+    column_name: str,
+    verbose_names: bool = False,
+    max_num_partitions: Optional[int] = None,
+) -> pd.DataFrame:
+    """Compute the counts of unique values in a given column in the partitions.
+
+    Take into account all possible labels in dataset when computing count for each
+    partition (assign 0 as the size when there are no values for a label in the
+    partition).
+
+    Parameters
+    ----------
+    partitioner : Partitioner
+        Partitioner with an assigned dataset.
+    column_name : str
+        Column name identifying label based on which the count will be calculated.
+    verbose_names : bool
+        Whether to use verbose versions of the values in the column specified by
+        `column_name`. The verbose values can be extracted if the column is a
+        feature of type `ClassLabel`.
+    max_num_partitions : Optional[int]
+        The number of partitions that will be used. If left None, then all partitions
+        will be used.
+
+    Returns
+    -------
+    dataframe: pd.DataFrame
+        DataFrame where the rows represent the partition ids and the columns represent
+        the unique values found in the column specified by `column_name`.
+    """
+    if column_name not in partitioner.dataset.column_names:
+        raise ValueError(
+            f"The specified 'column_name': '{column_name}' is not present in the "
+            f"dataset. The dataset contains columns {partitioner.dataset.column_names}."
+        )
+
+    if max_num_partitions is None:
+        max_num_partitions = partitioner.num_partitions
+    else:
+        max_num_partitions = min(max_num_partitions, partitioner.num_partitions)
+    assert isinstance(max_num_partitions, int)
+    partitions = [partitioner.load_partition(i) for i in range(max_num_partitions)]
+
+    partition = partitions[0]
+    try:
+        # Unique labels are needed to represent the correct count of each class
+        # (some of the classes can have zero samples, which is why this
+        # adjustment is needed)
+        unique_labels = partition.features[column_name].str2int(
+            partition.features[column_name].names
+        )
+    except AttributeError:  # If the column_name is not formally a Label
+        unique_labels = partitioner.dataset.unique(column_name)
+
+    partition_id_to_label_absolute_size = {
+        pid: _compute_counts(partition[column_name], unique_labels)
+        for pid, partition in enumerate(partitions)
+    }
+
+    dataframe = pd.DataFrame.from_dict(
+        partition_id_to_label_absolute_size, orient="index"
+    )
+    dataframe.index.name = "Partition ID"
+
+    if verbose_names:
+        # Adjust the column name values of the dataframe
+        current_labels = dataframe.columns
+        try:
+            legend_names = partitioner.dataset.features[column_name].int2str(
+                [int(v) for v in current_labels]
+            )
+            dataframe.columns = legend_names
+        except AttributeError:
+            warnings.warn(
+                "The verbose names can not be established. "
+                "The column specified by 'column_name' needs to be of type "
+                "'ClassLabel' to create a verbose names. "
+                "The available names will used.",
+                stacklevel=1,
+            )
+    return dataframe
+
+
+def compute_frequencies(
+    partitioner: Partitioner,
+    column_name: str,
+    verbose_names: bool = False,
+    max_num_partitions: Optional[int] = None,
+) -> pd.DataFrame:
+    """Compute the frequencies of unique values in a given column in the partitions.
+
+    The frequencies sum up to 1 for a given partition id. Take into account all
+    possible labels in dataset when computing count for each partition (assign 0 as the
+    size when there are no values for a label in the partition).
+
+    Parameters
+    ----------
+    partitioner : Partitioner
+        Partitioner with an assigned dataset.
+    column_name : str
+        Column name identifying label based on which the count will be calculated.
+    verbose_names : bool
+        Whether to use verbose versions of the values in the column specified by
+        `column_name`. The verbose values can be extracted if the column is a
+        feature of type `ClassLabel`.
+    max_num_partitions : Optional[int]
+        The number of partitions that will be used. If left None, then all partitions
+        will be used.
+
+    Returns
+    -------
+    dataframe: pd.DataFrame
+        DataFrame where the rows represent the partition ids and the columns represent
+        the unique values found in the column specified by `column_name`.
+    """
+    dataframe = compute_counts(
+        partitioner, column_name, verbose_names, max_num_partitions
+    )
+    dataframe = dataframe.div(dataframe.sum(axis=1), axis=0)
+    return dataframe
+
+
+def _compute_counts(
     labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
 ) -> pd.Series:
     """Compute the count of labels when taking into account all possible labels.
@@ -51,7 +176,7 @@ def compute_counts(
     return label_counts_with_zeros
 
 
-def compute_frequency(
+def _compute_frequencies(
     labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
 ) -> pd.Series:
     """Compute the distribution of labels when taking into account all possible labels.
@@ -70,9 +195,9 @@ def compute_frequency(
     -------
         The pd.Series with label as indices and probabilities as values.
     """
-    counts = compute_counts(labels, unique_labels)
+    counts = _compute_counts(labels, unique_labels)
     if len(labels) == 0:
-        counts = counts.astype(float)
-        return counts
-    counts = counts.divide(len(labels))
-    return counts
+        frequencies = counts.astype(float)
+        return frequencies
+    frequencies = counts.divide(len(labels))
+    return frequencies
diff --git a/datasets/flwr_datasets/visualization/label_distribution.py b/datasets/flwr_datasets/visualization/label_distribution.py
@@ -15,15 +15,14 @@
 """Label distribution plotting."""
 
 
-import warnings
 from typing import Any, Dict, Optional, Tuple, Union
 
 import matplotlib.colors as mcolors
 import pandas as pd
 from matplotlib.axes import Axes
 from matplotlib.figure import Figure
 
-from flwr_datasets.metrics import compute_counts
+from flwr_datasets.metrics.utils import compute_counts, compute_frequencies
 from flwr_datasets.partitioner import Partitioner
 from flwr_datasets.visualization.bar_plot import _plot_bar
 from flwr_datasets.visualization.heatmap_plot import _plot_heatmap
@@ -81,7 +80,9 @@ def plot_label_distributions(
         Title for the legend. If None, the defaults will be takes based on the type of
         plot.
     verbose_labels : bool
-        Whether to use verbose versions of the labels.
+        Whether to use verbose versions of the labels. These values are used as columns
+        of the returned dataframe, as legend labels in a bar plot, and as column/row
+        tick labels in a heatmap plot.
     plot_kwargs: Optional[Dict[str, Any]]
         Any key value pair that can be passed to a plot function that are not supported
         directly. In case of the parameter doubling (e.g. specifying cmap here too) the
@@ -192,60 +193,22 @@ def plot_label_distributions(
     """
     _validate_parameters(plot_type, size_unit, partition_id_axis)
 
-    if label_name not in partitioner.dataset.column_names:
-        raise ValueError(
-            f"The specified 'label_name': '{label_name}' is not present in the "
-            f"dataset. The dataset contains columns {partitioner.dataset.column_names}."
+    dataframe = pd.DataFrame()
+    if size_unit == "absolute":
+        dataframe = compute_counts(
+            partitioner=partitioner,
+            column_name=label_name,
+            verbose_names=verbose_labels,
+            max_num_partitions=max_num_partitions,
         )
-
-    if max_num_partitions is None:
-        max_num_partitions = partitioner.num_partitions
-    else:
-        max_num_partitions = min(max_num_partitions, partitioner.num_partitions)
-    assert isinstance(max_num_partitions, int)
-    partitions = [partitioner.load_partition(i) for i in range(max_num_partitions)]
-
-    partition = partitions[0]
-    try:
-        # Unique labels are needed to represent the correct count of each class
-        # (some of the classes can have zero samples that's why this
-        # adjustment is needed)
-        unique_labels = partition.features[label_name].str2int(
-            partition.features[label_name].names
+    elif size_unit == "percent":
+        dataframe = compute_frequencies(
+            partitioner=partitioner,
+            column_name=label_name,
+            verbose_names=verbose_labels,
+            max_num_partitions=max_num_partitions,
         )
-    except AttributeError:  # If the label_name is not formally a Label
-        unique_labels = partitioner.dataset.unique(label_name)
-
-    partition_id_to_label_absolute_size = {
-        pid: compute_counts(partition[label_name], unique_labels)
-        for pid, partition in enumerate(partitions)
-    }
-
-    dataframe = pd.DataFrame.from_dict(
-        partition_id_to_label_absolute_size, orient="index"
-    )
-    dataframe.index.name = "Partition ID"
-
-    if size_unit == "percent":
-        dataframe = dataframe.div(dataframe.sum(axis=1), axis=0) * 100.0
-
-    if verbose_labels:
-        # Adjust the column name values of the dataframe
-        # (these values are used for as labels in bar plot and columns/rows ticks
-        # in heatmap)
-        current_labels = dataframe.columns
-        try:
-            legend_names = partition.features[label_name].int2str(
-                [int(v) for v in current_labels]
-            )
-            dataframe.columns = legend_names
-        except AttributeError:
-            warnings.warn(
-                "The verbose label names can not be established. "
-                "The column specified by 'label_name' needs to be of type "
-                "'ClassLabel'",
-                stacklevel=1,
-            )
+        dataframe = dataframe * 100.0
 
     if plot_type == "bar":
         axis = _plot_bar(