diff --git a/datasets/doc/source/how-to-use-with-local-data.rst b/datasets/doc/source/how-to-use-with-local-data.rst
new file mode 100644
index 000000000000..276f6d6936ee
--- /dev/null
+++ b/datasets/doc/source/how-to-use-with-local-data.rst
@@ -0,0 +1,257 @@
+Use with Local Data
+===================
+
+You can partition your local files and in-memory Python objects with the
+``Flower Datasets`` library using any available ``Partitioner``.
+
+This guide details how to create a `Hugging Face <https://huggingface.co>`_ `Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset>`_, which is the required input type for Partitioners.
+We will cover:
+
+* local files: CSV, JSON, image, audio,
+* in-memory data: dictionary, list, ``pd.DataFrame``, ``np.ndarray``.
+
+
+General Overview
+----------------
+The all-in-one dataset preparation (downloading, preprocessing, partitioning) happens
+in `FederatedDataset <https://flower.ai/docs/datasets/>`_. However, we
+will use only the ``Partitioner`` here, since we work with locally accessible data.
+
+The rest of this guide explains how to create a
+`Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset>`_
+from local files and existing (in-memory) Python objects.
+
+Local Files
+-----------
+CSV
+^^^
+.. code-block:: python
+
+    from datasets import load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    # Single file
+    data_files = "path-to-my-file.csv"
+
+    # Multiple files
+    data_files = ["path-to-my-file-1.csv", "path-to-my-file-2.csv", ...]
+    dataset = load_dataset("csv", data_files=data_files)
+
+    # Divided dataset (multiple splits)
+    data_files = {
+        "train": single_train_file_or_list_of_files,
+        "test": single_test_file_or_list_of_files,
+        "can-have-more-splits": ...
+    }
+    dataset = load_dataset("csv", data_files=data_files)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+JSON
+^^^^
+
+.. code-block:: python
+
+    from datasets import load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    # Single file
+    data_files = "path-to-my-file.json"
+
+    # Multiple files
+    data_files = ["path-to-my-file-1.json", "path-to-my-file-2.json", ...]
+    dataset = load_dataset("json", data_files=data_files)
+
+    # Divided dataset (multiple splits)
+    data_files = {
+        "train": single_train_file_or_list_of_files,
+        "test": single_test_file_or_list_of_files,
+        "can-have-more-splits": ...
+    }
+    dataset = load_dataset("json", data_files=data_files)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+
+Image
+^^^^^
+You can create an image dataset in two ways:
+
+1) give a path to the directory
+
+The directory needs to be structured as ``dataset-name/split/class/name``. For example:
+
+.. code-block::
+
+    mnist/train/1/unique_name_1.png
+    mnist/train/1/unique_name_2.png
+    mnist/train/2/unique_name_3.png
+    ...
+    mnist/test/1/unique_name_4.png
+    mnist/test/1/unique_name_5.png
+    mnist/test/2/unique_name_6.png
+
+Then, the path you can give is ``./mnist``.
+
+.. code-block:: python
+
+    from datasets import load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    # Directly from a directory
+    dataset = load_dataset("imagefolder", data_dir="/path/to/folder")
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
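+
+Depending on the partitioner, you may need to assign a single split (a ``Dataset``)
+rather than the whole ``DatasetDict`` that ``load_dataset`` returns. Below is a
+minimal sketch of selecting the ``train`` split first; it assumes the ``./mnist``
+layout shown above and uses ``ChosenPartitioner`` as a stand-in for any concrete
+partitioner:
+
+.. code-block:: python
+
+    from datasets import load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    # load_dataset("imagefolder", ...) infers the splits from the directory names
+    dataset_dict = load_dataset("imagefolder", data_dir="./mnist")
+    # Select a single split (a Dataset) to hand over to the partitioner
+    train_dataset = dataset_dict["train"]
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = train_dataset
+    partition = partitioner.load_partition(partition_id=0)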
+2) create a dataset from a CSV/JSON file and cast the path column to ``Image``.
+
+.. code-block:: python
+
+    from datasets import Image, load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    dataset = load_dataset(...)
+    dataset = dataset.cast_column("path", Image())
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+
+Audio
+^^^^^
+Analogously to the image datasets, there are two methods here:
+
+1) give a path to the directory
+
+.. code-block:: python
+
+    from datasets import load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    dataset = load_dataset("audiofolder", data_dir="/path/to/folder")
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+2) create a dataset from a CSV/JSON file and cast the path column to ``Audio``.
+
+.. code-block:: python
+
+    from datasets import Audio, load_dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    dataset = load_dataset(...)
+    dataset = dataset.cast_column("path", Audio())
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+In-Memory
+---------
+
+From dictionary
+^^^^^^^^^^^^^^^
+.. code-block:: python
+
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
+    dataset = Dataset.from_dict(data)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+From list
+^^^^^^^^^
+.. code-block:: python
+
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    my_list = [
+        {"features": 1, "labels": 0},
+        {"features": 2, "labels": 0},
+        {"features": 3, "labels": 1},
+    ]
+    dataset = Dataset.from_list(my_list)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+From pd.DataFrame
+^^^^^^^^^^^^^^^^^
+.. code-block:: python
+
+    import pandas as pd
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
+    df = pd.DataFrame(data)
+    dataset = Dataset.from_pandas(df)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
+
+From np.ndarray
+^^^^^^^^^^^^^^^
+The ``np.ndarray`` is first converted to a ``pd.DataFrame``:
+
+.. code-block:: python
+
+    import numpy as np
+    import pandas as pd
+    from datasets import Dataset
+    from flwr_datasets.partitioner import ChosenPartitioner
+
+    data = np.array([[1, 2, 3], [0, 0, 1]]).T
+    # You can add the column names by passing columns=["features", "labels"]
+    df = pd.DataFrame(data)
+    dataset = Dataset.from_pandas(df)
+
+    partitioner = ChosenPartitioner(...)
+    partitioner.dataset = dataset
+    partition = partitioner.load_partition(partition_id=0)
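+
+Regardless of the source, the partition returned by ``load_partition`` is itself a
+regular Hugging Face ``Dataset``, so the usual ``datasets`` API applies to it. A
+minimal sketch, with the column names taken from the in-memory examples above:
+
+.. code-block:: python
+
+    # Number of rows that ended up in this partition
+    print(len(partition))
+
+    # Rows are plain dictionaries, e.g. {"features": 1, "labels": 0}
+    print(partition[0])
+
+    # Convert the partition back to a pd.DataFrame for inspection
+    partition_df = partition.to_pandas()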
+Partitioner Details
+-------------------
+Partitioning is triggered automatically during the first ``load_partition`` call.
+You do not need to call any "do_partitioning" method.
+
+The ``Partitioner`` abstraction is designed to allow for a single dataset assignment:
+
+.. code-block:: python
+
+    partitioner.dataset = your_dataset
+
+If you need to do the same partitioning on a different dataset, create a new
+``Partitioner`` for it, e.g.:
+
+.. code-block:: python
+
+    from flwr_datasets.partitioner import IidPartitioner
+
+    iid_partitioner_for_mnist = IidPartitioner(num_partitions=10)
+    iid_partitioner_for_mnist.dataset = mnist_dataset
+
+    iid_partitioner_for_cifar = IidPartitioner(num_partitions=10)
+    iid_partitioner_for_cifar.dataset = cifar_dataset
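+
+For instance, to materialize every partition of a single dataset, call
+``load_partition`` once per partition id. A minimal sketch, assuming
+``mnist_dataset`` is a ``Dataset`` created as in the sections above:
+
+.. code-block:: python
+
+    from flwr_datasets.partitioner import IidPartitioner
+
+    partitioner = IidPartitioner(num_partitions=10)
+    partitioner.dataset = mnist_dataset
+
+    # The first call triggers the partitioning; later calls reuse it
+    partitions = [partitioner.load_partition(partition_id=pid) for pid in range(10)]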
+
+
+More Resources
+--------------
+If you are looking for more details, or if this guide does not cover the format you need, please visit the `HuggingFace Datasets docs <https://huggingface.co/docs/datasets/index>`_.
+This guide is based on the following ones:
+
+* `General Information <https://huggingface.co/docs/datasets/loading>`_
+* `Tabular Data <https://huggingface.co/docs/datasets/tabular_load>`_
+* `Image Data <https://huggingface.co/docs/datasets/image_load>`_
+* `Audio Data <https://huggingface.co/docs/datasets/audio_load>`_
diff --git a/datasets/doc/source/index.rst b/datasets/doc/source/index.rst
index fd226b308bd5..2144c527f8cd 100644
--- a/datasets/doc/source/index.rst
+++ b/datasets/doc/source/index.rst
@@ -31,6 +31,7 @@ Problem-oriented how-to guides show step-by-step how to achieve a specific goal.
   how-to-use-with-pytorch
   how-to-use-with-tensorflow
   how-to-use-with-numpy
+  how-to-use-with-local-data
   how-to-disable-enable-progress-bar
 
 References
diff --git a/examples/embedded-devices/Dockerfile b/examples/embedded-devices/Dockerfile
index a85c05c4bb7a..48602c89970a 100644
--- a/examples/embedded-devices/Dockerfile
+++ b/examples/embedded-devices/Dockerfile
@@ -8,7 +8,7 @@ RUN pip3 install --upgrade pip
 
 # Install flower
 RUN pip3 install flwr>=1.0
-RUN pip3 install flwr-datsets>=0.2
+RUN pip3 install flwr-datasets>=0.0.2
 RUN pip3 install tqdm==4.65.0
 
 WORKDIR /client
diff --git a/src/py/flwr/cli/new/new.py b/src/py/flwr/cli/new/new.py
index 83b84ce52e24..0c429ce34cf2 100644
--- a/src/py/flwr/cli/new/new.py
+++ b/src/py/flwr/cli/new/new.py
@@ -22,7 +22,12 @@
 import typer
 from typing_extensions import Annotated
 
-from ..utils import prompt_options, prompt_text
+from ..utils import (
+    is_valid_project_name,
+    prompt_options,
+    prompt_text,
+    sanitize_project_name,
+)
 
 
 class MlFramework(str, Enum):
@@ -81,6 +86,16 @@ def new(
     ] = None,
 ) -> None:
     """Create new Flower project."""
+    if project_name is None:
+        project_name = prompt_text("Please provide project name")
+    if not is_valid_project_name(project_name):
+        project_name = prompt_text(
+            "Please provide a name that only contains "
+            "characters in {'_', 'a-zA-Z', '0-9'}",
+            predicate=is_valid_project_name,
+            default=sanitize_project_name(project_name),
+        )
+
     print(
         typer.style(
             f"🔨 Creating Flower project {project_name}...",
@@ -89,9 +104,6 @@ def new(
         )
     )
 
-    if project_name is None:
-        project_name = prompt_text("Please provide project name")
-
     if framework is not None:
         framework_str = str(framework.value)
     else:
diff --git a/src/py/flwr/cli/utils.py b/src/py/flwr/cli/utils.py
index 4e86f0c3b8c8..7a36c3eb7b84 100644
--- a/src/py/flwr/cli/utils.py
+++ b/src/py/flwr/cli/utils.py
@@ -14,18 +14,23 @@
 # ==============================================================================
 """Flower command line interface utils."""
 
-from typing import List, cast
+from typing import Callable, List, Optional, cast
 
 import typer
 
 
-def prompt_text(text: str) -> str:
+def prompt_text(
+    text: str,
+    predicate: Callable[[str], bool] = lambda _: True,
+    default: Optional[str] = None,
+) -> str:
     """Ask user to enter text input."""
     while True:
         result = typer.prompt(
-            typer.style(f"\n💬 {text}", fg=typer.colors.MAGENTA, bold=True)
+            typer.style(f"\n💬 {text}", fg=typer.colors.MAGENTA, bold=True),
+            default=default,
         )
-        if len(result) > 0:
+        if predicate(result) and len(result) > 0:
             break
         print(typer.style("❌ Invalid entry", fg=typer.colors.RED, bold=True))
 
@@ -65,3 +70,54 @@ def prompt_options(text: str, options: List[str]) -> str:
     result = options[int(index)]
 
     return result
+
+
+def is_valid_project_name(name: str) -> bool:
+    """Check if the given string is a valid Python module name.
+
+    A valid module name must start with a letter or an underscore, and can only contain
+    letters, digits, and underscores.
+    """
+    if not name:
+        return False
+
+    # Check if the first character is a letter or underscore
+    if not (name[0].isalpha() or name[0] == "_"):
+        return False
+
+    # Check if the rest of the characters are valid (letter, digit, or underscore)
+    for char in name[1:]:
+        if not (char.isalnum() or char == "_"):
+            return False
+
+    return True
+
+
+def sanitize_project_name(name: str) -> str:
+    """Sanitize the given string to make it a valid Python module name.
+
+    This version replaces hyphens and spaces with underscores, removes any characters
+    not allowed in Python module names, makes the string lowercase, and ensures it
+    starts with a valid character.
+    """
+    # Replace '-' and ' ' with '_'
+    name_with_underscores = name.replace("-", "_").replace(" ", "_")
+
+    # Allowed characters in a module name: letters, digits, underscore
+    allowed_chars = set(
+        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"
+    )
+
+    # Make the string lowercase
+    sanitized_name = name_with_underscores.lower()
+
+    # Remove any characters not allowed in Python module names
+    sanitized_name = "".join(c for c in sanitized_name if c in allowed_chars)
+
+    # Ensure the first character is not a digit (after the filtering above, every
+    # other remaining character is already valid at any position)
+    if sanitized_name and sanitized_name[0].isdigit():
+        sanitized_name = "_" + sanitized_name
+
+    return sanitized_name
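+
+
+# Example behaviour of the helpers above (illustrative):
+#   is_valid_project_name("flower_app")    -> True
+#   is_valid_project_name("flower-app")    -> False (hyphen not allowed)
+#   sanitize_project_name("Flower App-2")  -> "flower_app_2"
+#   sanitize_project_name("2fast")         -> "_2fast"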