diff --git a/datasets/doc/source/how-to-use-with-local-data.rst b/datasets/doc/source/how-to-use-with-local-data.rst
new file mode 100644
index 000000000000..276f6d6936ee
--- /dev/null
+++ b/datasets/doc/source/how-to-use-with-local-data.rst
@@ -0,0 +1,257 @@
+Use with Local Data
+===================
+
+You can partition your local files and Python objects in
+``Flower Datasets`` library using any available ``Partitioner``.
+
+This guide details how to create a `Hugging Face <https://huggingface.co/>`_ `Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset>`_ which is the required type of input for Partitioners.
+We will cover:
+
+* local files: CSV, JSON, image, audio,
+* in-memory data: dictionary, list, pd.DataFrame, np.ndarray.
+
+
+General Overview
+----------------
+An all-in-one dataset preparation (downloading, preprocessing, partitioning) happens
+using `FederatedDataset <https://flower.ai/docs/datasets/ref-api/flwr_datasets.FederatedDataset.html>`_. However, we
+will use only the `Partitioner` here since we use locally accessible data.
+
+The rest of this guide will explain how to create a
+`Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset>`_
+from local files and existing (in memory) Python objects.
+
+Local Files
+-----------
+CSV
+^^^
+.. code-block:: python
+
+ from datasets import load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ # Single file
+ data_files = "path-to-my-file.csv"
+
+ # Multiple Files
+ data_files = [ "path-to-my-file-1.csv", "path-to-my-file-2.csv", ...]
+ dataset = load_dataset("csv", data_files=data_files)
+
+ # Divided Dataset
+ data_files = {
+ "train": single_train_file_or_list_of_files,
+ "test": single_test_file_or_list_of_files,
+ "can-have-more-splits": ...
+ }
+ dataset = load_dataset("csv", data_files=data_files)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+JSON
+^^^^
+
+.. code-block:: python
+
+ from datasets import load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ # Single file
+ data_files = "path-to-my-file.json"
+
+    # Multiple Files
+ data_files = [ "path-to-my-file-1.json", "path-to-my-file-2.json", ...]
+ dataset = load_dataset("json", data_files=data_files)
+
+ # Divided Dataset
+ data_files = {
+ "train": single_train_file_or_list_of_files,
+ "test": single_test_file_or_list_of_files,
+ "can-have-more-splits": ...
+ }
+ dataset = load_dataset("json", data_files=data_files)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+
+Image
+^^^^^
+You can create an image dataset in two ways:
+
+1) give a path to the directory
+
+The directory needs to be structured in the following way: dataset-name/split/class/name. For example:
+
+.. code-block::
+
+    mnist/train/1/unique_name_1.png
+    mnist/train/1/unique_name_2.png
+    mnist/train/2/unique_name_3.png
+    ...
+    mnist/test/1/unique_name_4.png
+    mnist/test/1/unique_name_5.png
+    mnist/test/2/unique_name_6.png
+
+Then, the path you can give is ``./mnist``.
+
+.. code-block:: python
+
+ from datasets import load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ # Directly from a directory
+ dataset = load_dataset("imagefolder", data_dir="/path/to/folder")
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+2) create a dataset from a CSV/JSON file and cast the path column to Image.
+
+.. code-block:: python
+
+ from datasets import Image, load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ dataset = load_dataset(...)
+ dataset = dataset.cast_column("path", Image())
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+
+Audio
+^^^^^
+Analogously to the image datasets, there are two methods here:
+
+1) give a path to the directory
+
+.. code-block:: python
+
+ from datasets import load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ dataset = load_dataset("audiofolder", data_dir="/path/to/folder")
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+2) create a dataset from a CSV/JSON file and cast the path column to Audio.
+
+.. code-block:: python
+
+ from datasets import Audio, load_dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ dataset = load_dataset(...)
+ dataset = dataset.cast_column("path", Audio())
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+In-Memory
+---------
+
+From dictionary
+^^^^^^^^^^^^^^^
+.. code-block:: python
+
+ from datasets import Dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+ data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
+ dataset = Dataset.from_dict(data)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+From list
+^^^^^^^^^
+.. code-block:: python
+
+ from datasets import Dataset
+ from flwr_datasets.partitioner import ChosenPartitioner
+
+ my_list = [
+ {"features": 1, "labels": 0},
+ {"features": 2, "labels": 0},
+ {"features": 3, "labels": 1}
+ ]
+ dataset = Dataset.from_list(my_list)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+From pd.DataFrame
+^^^^^^^^^^^^^^^^^
+.. code-block:: python
+
+    import pandas as pd
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+ data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
+ df = pd.DataFrame(data)
+ dataset = Dataset.from_pandas(df)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+From np.ndarray
+^^^^^^^^^^^^^^^
+The np.ndarray will be first transformed to pd.DataFrame
+
+.. code-block:: python
+
+    import numpy as np, pandas as pd
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+ data = np.array([[1, 2, 3], [0, 0, 1]]).T
+ # You can add the column names by passing columns=["features", "labels"]
+ df = pd.DataFrame(data)
+ dataset = Dataset.from_pandas(df)
+
+ partitioner = ChosenPartitioner(...)
+ partitioner.dataset = dataset
+ partition = partitioner.load_partition(partition_id=0)
+
+Partitioner Details
+-------------------
+Partitioning is triggered automatically during the first ``load_partition`` call.
+You do not need to call any “do_partitioning” method.
+
+Partitioner abstraction is designed to allow for a single dataset assignment.
+
+.. code-block:: python
+
+ partitioner.dataset = your_dataset
+
+If you need to do the same partitioning on a different dataset, create a new Partitioner
+for that, e.g.:
+
+.. code-block:: python
+
+ from flwr_datasets.partitioner import IidPartitioner
+
+ iid_partitioner_for_mnist = IidPartitioner(num_partitions=10)
+ iid_partitioner_for_mnist.dataset = mnist_dataset
+
+ iid_partitioner_for_cifar = IidPartitioner(num_partitions=10)
+ iid_partitioner_for_cifar.dataset = cifar_dataset
+
+
+More Resources
+--------------
+If you are looking for more details or you have not found the format you are looking for, please visit the `HuggingFace Datasets docs <https://huggingface.co/docs/datasets/index>`_.
+This guide is based on the following ones:
+
+* `General Information <https://huggingface.co/docs/datasets/loading>`_
+* `Tabular Data <https://huggingface.co/docs/datasets/tabular_load>`_
+* `Image Data <https://huggingface.co/docs/datasets/image_load>`_
+* `Audio Data <https://huggingface.co/docs/datasets/audio_load>`_
diff --git a/datasets/doc/source/index.rst b/datasets/doc/source/index.rst
index fd226b308bd5..2144c527f8cd 100644
--- a/datasets/doc/source/index.rst
+++ b/datasets/doc/source/index.rst
@@ -31,6 +31,7 @@ Problem-oriented how-to guides show step-by-step how to achieve a specific goal.
how-to-use-with-pytorch
how-to-use-with-tensorflow
how-to-use-with-numpy
+ how-to-use-with-local-data
how-to-disable-enable-progress-bar
References
diff --git a/examples/embedded-devices/Dockerfile b/examples/embedded-devices/Dockerfile
index a85c05c4bb7a..48602c89970a 100644
--- a/examples/embedded-devices/Dockerfile
+++ b/examples/embedded-devices/Dockerfile
@@ -8,7 +8,7 @@ RUN pip3 install --upgrade pip
# Install flower
RUN pip3 install flwr>=1.0
-RUN pip3 install flwr-datsets>=0.2
+RUN pip3 install "flwr-datasets>=0.0.2"
RUN pip3 install tqdm==4.65.0
WORKDIR /client
diff --git a/src/py/flwr/cli/new/new.py b/src/py/flwr/cli/new/new.py
index 83b84ce52e24..0c429ce34cf2 100644
--- a/src/py/flwr/cli/new/new.py
+++ b/src/py/flwr/cli/new/new.py
@@ -22,7 +22,12 @@
import typer
from typing_extensions import Annotated
-from ..utils import prompt_options, prompt_text
+from ..utils import (
+ is_valid_project_name,
+ prompt_options,
+ prompt_text,
+ sanitize_project_name,
+)
class MlFramework(str, Enum):
@@ -81,6 +86,16 @@ def new(
] = None,
) -> None:
"""Create new Flower project."""
+ if project_name is None:
+ project_name = prompt_text("Please provide project name")
+ if not is_valid_project_name(project_name):
+ project_name = prompt_text(
+ "Please provide a name that only contains "
+ "characters in {'_', 'a-zA-Z', '0-9'}",
+ predicate=is_valid_project_name,
+ default=sanitize_project_name(project_name),
+ )
+
print(
typer.style(
f"🔨 Creating Flower project {project_name}...",
@@ -89,9 +104,6 @@ def new(
)
)
- if project_name is None:
- project_name = prompt_text("Please provide project name")
-
if framework is not None:
framework_str = str(framework.value)
else:
diff --git a/src/py/flwr/cli/utils.py b/src/py/flwr/cli/utils.py
index 4e86f0c3b8c8..7a36c3eb7b84 100644
--- a/src/py/flwr/cli/utils.py
+++ b/src/py/flwr/cli/utils.py
@@ -14,18 +14,23 @@
# ==============================================================================
"""Flower command line interface utils."""
-from typing import List, cast
+from typing import Callable, List, Optional, cast
import typer
-def prompt_text(text: str) -> str:
+def prompt_text(
+ text: str,
+ predicate: Callable[[str], bool] = lambda _: True,
+ default: Optional[str] = None,
+) -> str:
"""Ask user to enter text input."""
while True:
result = typer.prompt(
- typer.style(f"\n💬 {text}", fg=typer.colors.MAGENTA, bold=True)
+ typer.style(f"\n💬 {text}", fg=typer.colors.MAGENTA, bold=True),
+ default=default,
)
- if len(result) > 0:
+ if predicate(result) and len(result) > 0:
break
print(typer.style("❌ Invalid entry", fg=typer.colors.RED, bold=True))
@@ -65,3 +70,54 @@ def prompt_options(text: str, options: List[str]) -> str:
result = options[int(index)]
return result
+
+
+def is_valid_project_name(name: str) -> bool:
+    """Check if the given string is a valid Python module name.
+
+    A valid module name must start with a letter or an underscore, and can only contain
+    letters, digits, and underscores.
+    """
+    if not name:
+        return False
+
+    # The first character must be a letter or an underscore (never a digit).
+    if not (name[0].isalpha() or name[0] == "_"):
+        return False
+
+    # Every remaining character must be a letter, a digit, or an underscore.
+    for char in name[1:]:
+        if not (char.isalnum() or char == "_"):
+            return False
+
+    return True
+
+
+def sanitize_project_name(name: str) -> str:
+ """Sanitize the given string to make it a valid Python module name.
+
+ This version replaces hyphens with underscores, removes any characters not allowed
+ in Python module names, makes the string lowercase, and ensures it starts with a
+ valid character.
+ """
+ # Replace '-' with '_'
+ name_with_underscores = name.replace("-", "_").replace(" ", "_")
+
+ # Allowed characters in a module name: letters, digits, underscore
+ allowed_chars = set(
+ "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
+ )
+
+ # Make the string lowercase
+ sanitized_name = name_with_underscores.lower()
+
+ # Remove any characters not allowed in Python module names
+ sanitized_name = "".join(c for c in sanitized_name if c in allowed_chars)
+
+ # Ensure the first character is a letter or underscore
+ if sanitized_name and (
+ sanitized_name[0].isdigit() or sanitized_name[0] not in allowed_chars
+ ):
+ sanitized_name = "_" + sanitized_name
+
+ return sanitized_name