Skip to content

Commit

Permalink
Add compute_counts and frequencies fnc
Browse files Browse the repository at this point in the history
  • Loading branch information
adam-narozniak committed Jun 5, 2024
1 parent 9e7d28d commit 80131b0
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 66 deletions.
4 changes: 2 additions & 2 deletions datasets/flwr_datasets/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
"""Metrics package."""


from flwr_datasets.metrics.utils import compute_counts, compute_frequency
from flwr_datasets.metrics.utils import compute_counts, compute_frequencies

__all__ = [
"compute_counts",
"compute_frequency",
"compute_frequencies",
]
143 changes: 134 additions & 9 deletions datasets/flwr_datasets/metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,139 @@
# limitations under the License.
# ==============================================================================
"""Utils for metrics computation."""


from typing import List, Union
import warnings
from typing import List, Optional, Union

import pandas as pd

from flwr_datasets.partitioner import Partitioner


def compute_counts(
    partitioner: Partitioner,
    column_name: str,
    verbose_names: bool = False,
    max_num_partitions: Optional[int] = None,
) -> pd.DataFrame:
    """Compute the counts of unique values in a given column in the partitions.

    Take into account all possible labels in dataset when computing count for each
    partition (assign 0 as the size when there are no values for a label in the
    partition).

    Parameters
    ----------
    partitioner : Partitioner
        Partitioner with an assigned dataset.
    column_name : str
        Column name identifying label based on which the count will be calculated.
    verbose_names : bool
        Whether to use verbose versions of the values in the column specified by
        `column_name`. The verbose values can be extracted if the column is a
        feature of type `ClassLabel`; otherwise a warning is emitted and the
        values are left as they are.
    max_num_partitions : Optional[int]
        The number of partitions that will be used (partitions are loaded by id,
        starting from 0). The value is capped at `partitioner.num_partitions`.
        If left None, then all partitions will be used.

    Returns
    -------
    dataframe: pd.DataFrame
        DataFrame where the rows represent the partition id and the columns
        represent the unique values found in column specified by `column_name`.

    Raises
    ------
    ValueError
        If `column_name` is not a column of the partitioner's dataset, or if the
        number of partitions to use (after capping) is smaller than one.
    """
    if column_name not in partitioner.dataset.column_names:
        raise ValueError(
            f"The specified 'column_name': '{column_name}' is not present in the "
            f"dataset. The dataset contains columns {partitioner.dataset.column_names}."
        )

    if max_num_partitions is None:
        max_num_partitions = partitioner.num_partitions
    else:
        max_num_partitions = min(max_num_partitions, partitioner.num_partitions)
    assert isinstance(max_num_partitions, int)
    # Fail early with a clear message: with no partitions to load, the lookup of
    # the first partition below would otherwise raise a bare IndexError.
    if max_num_partitions < 1:
        raise ValueError(
            "At least one partition is needed to compute the counts, but the "
            f"number of partitions to use resolved to {max_num_partitions}. "
            "Check 'max_num_partitions' and the number of partitions of the "
            "partitioner."
        )
    partitions = [partitioner.load_partition(i) for i in range(max_num_partitions)]

    first_partition = partitions[0]
    try:
        # Unique labels are needed to represent the correct count of each class
        # (some of the classes can have zero samples that's why this
        # adjustment is needed)
        unique_labels = first_partition.features[column_name].str2int(
            first_partition.features[column_name].names
        )
    except AttributeError:  # If the column_name is not formally a Label
        # Fall back to the values that actually occur in the whole dataset.
        unique_labels = partitioner.dataset.unique(column_name)

    partition_id_to_label_absolute_size = {
        pid: _compute_counts(partition[column_name], unique_labels)
        for pid, partition in enumerate(partitions)
    }

    dataframe = pd.DataFrame.from_dict(
        partition_id_to_label_absolute_size, orient="index"
    )
    dataframe.index.name = "Partition ID"

    if verbose_names:
        # Adjust the column name values of the dataframe
        current_labels = dataframe.columns
        try:
            legend_names = partitioner.dataset.features[column_name].int2str(
                [int(v) for v in current_labels]
            )
            dataframe.columns = legend_names
        except AttributeError:
            # The feature has no `int2str`, so the columns keep their raw values.
            warnings.warn(
                "The verbose names can not be established. "
                "The column specified by 'column_name' needs to be of type "
                "'ClassLabel' to create a verbose names. "
                "The available names will used.",
                stacklevel=1,
            )
    return dataframe


def compute_frequencies(
    partitioner: Partitioner,
    column_name: str,
    verbose_names: bool = False,
    max_num_partitions: Optional[int] = None,
) -> pd.DataFrame:
    """Compute the frequencies of unique values in a given column in the partitions.

    The frequencies sum up to 1 for a given partition id. Take into account all
    possible labels in dataset when computing count for each partition (assign 0 as
    the size when there are no values for a label in the partition). A partition
    whose counts sum to zero gets a frequency of 0.0 for every label instead of NaN.

    Parameters
    ----------
    partitioner : Partitioner
        Partitioner with an assigned dataset.
    column_name : str
        Column name identifying label based on which the count will be calculated.
    verbose_names : bool
        Whether to use verbose versions of the values in the column specified by
        `column_name`. The verbose values can be extracted if the column is a
        feature of type `ClassLabel`.
    max_num_partitions : Optional[int]
        The number of partitions that will be used. If left None, then all
        partitions will be used.

    Returns
    -------
    dataframe: pd.DataFrame
        DataFrame where the rows represent the partition id and the columns
        represent the unique values found in column specified by `column_name`.
    """
    counts = compute_counts(
        partitioner, column_name, verbose_names, max_num_partitions
    )
    row_totals = counts.sum(axis=1)
    # A row total of zero would turn the whole row into NaN (0 / 0); dividing such
    # rows by 1 keeps them at 0.0, matching how an empty list of labels is handled
    # when frequencies are computed for a single partition.
    safe_totals = row_totals.where(row_totals != 0, 1)
    return counts.div(safe_totals, axis=0)


def _compute_counts(
labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
) -> pd.Series:
"""Compute the count of labels when taking into account all possible labels.
Expand Down Expand Up @@ -51,7 +176,7 @@ def compute_counts(
return label_counts_with_zeros


def compute_frequency(
def _compute_frequencies(
labels: Union[List[int], List[str]], unique_labels: Union[List[int], List[str]]
) -> pd.Series:
"""Compute the distribution of labels when taking into account all possible labels.
Expand All @@ -70,9 +195,9 @@ def compute_frequency(
-------
The pd.Series with label as indices and probabilities as values.
"""
counts = compute_counts(labels, unique_labels)
counts = _compute_counts(labels, unique_labels)
if len(labels) == 0:
counts = counts.astype(float)
return counts
counts = counts.divide(len(labels))
return counts
frequencies = counts.astype(float)
return frequencies
frequencies = counts.divide(len(labels))
return frequencies
73 changes: 18 additions & 55 deletions datasets/flwr_datasets/visualization/label_distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,14 @@
"""Label distribution plotting."""


import warnings
from typing import Any, Dict, Optional, Tuple, Union

import matplotlib.colors as mcolors
import pandas as pd
from matplotlib.axes import Axes
from matplotlib.figure import Figure

from flwr_datasets.metrics import compute_counts
from flwr_datasets.metrics.utils import compute_counts, compute_frequencies
from flwr_datasets.partitioner import Partitioner
from flwr_datasets.visualization.bar_plot import _plot_bar
from flwr_datasets.visualization.heatmap_plot import _plot_heatmap
Expand Down Expand Up @@ -81,7 +80,9 @@ def plot_label_distributions(
Title for the legend. If None, the defaults will be takes based on the type of
plot.
verbose_labels : bool
Whether to use verbose versions of the labels.
Whether to use verbose versions of the labels. These values are used as columns
of the returned dataframe and as labels on the legend in a bar plot and columns/
rows ticks in a heatmap plot.
plot_kwargs: Optional[Dict[str, Any]]
Any key value pair that can be passed to a plot function that are not supported
directly. In case of the parameter doubling (e.g. specifying cmap here too) the
Expand Down Expand Up @@ -192,60 +193,22 @@ def plot_label_distributions(
"""
_validate_parameters(plot_type, size_unit, partition_id_axis)

if label_name not in partitioner.dataset.column_names:
raise ValueError(
f"The specified 'label_name': '{label_name}' is not present in the "
f"dataset. The dataset contains columns {partitioner.dataset.column_names}."
dataframe = pd.DataFrame()
if size_unit == "absolute":
dataframe = compute_counts(
partitioner=partitioner,
column_name=label_name,
verbose_names=verbose_labels,
max_num_partitions=max_num_partitions,
)

if max_num_partitions is None:
max_num_partitions = partitioner.num_partitions
else:
max_num_partitions = min(max_num_partitions, partitioner.num_partitions)
assert isinstance(max_num_partitions, int)
partitions = [partitioner.load_partition(i) for i in range(max_num_partitions)]

partition = partitions[0]
try:
# Unique labels are needed to represent the correct count of each class
# (some of the classes can have zero samples that's why this
# adjustment is needed)
unique_labels = partition.features[label_name].str2int(
partition.features[label_name].names
elif size_unit == "percent":
dataframe = compute_frequencies(
partitioner=partitioner,
column_name=label_name,
verbose_names=verbose_labels,
max_num_partitions=max_num_partitions,
)
except AttributeError: # If the label_name is not formally a Label
unique_labels = partitioner.dataset.unique(label_name)

partition_id_to_label_absolute_size = {
pid: compute_counts(partition[label_name], unique_labels)
for pid, partition in enumerate(partitions)
}

dataframe = pd.DataFrame.from_dict(
partition_id_to_label_absolute_size, orient="index"
)
dataframe.index.name = "Partition ID"

if size_unit == "percent":
dataframe = dataframe.div(dataframe.sum(axis=1), axis=0) * 100.0

if verbose_labels:
# Adjust the column name values of the dataframe
# (these values are used for as labels in bar plot and columns/rows ticks
# in heatmap)
current_labels = dataframe.columns
try:
legend_names = partition.features[label_name].int2str(
[int(v) for v in current_labels]
)
dataframe.columns = legend_names
except AttributeError:
warnings.warn(
"The verbose label names can not be established. "
"The column specified by 'label_name' needs to be of type "
"'ClassLabel'",
stacklevel=1,
)
dataframe = dataframe * 100.0

if plot_type == "bar":
axis = _plot_bar(
Expand Down

0 comments on commit 80131b0

Please sign in to comment.