Merge branch 'main' into refactor-example-script

charlesbvll authored Apr 12, 2024
2 parents b5ba274 + 236990e commit e86b81a
Showing 19 changed files with 414 additions and 84 deletions.
257 changes: 257 additions & 0 deletions datasets/doc/source/how-to-use-with-local-data.rst
@@ -0,0 +1,257 @@
Use with Local Data
===================

You can partition your local files and Python objects using any available
``Partitioner`` from the ``Flower Datasets`` library.

This guide details how to create a `Hugging Face <https://huggingface.co/>`_ `Dataset <https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset>`_, which is the required input type for the partitioners.
We will cover:

* local files: CSV, JSON, image, audio,
* in-memory data: dictionary, list, pd.DataFrame, np.ndarray.


General Overview
----------------
All-in-one dataset preparation (downloading, preprocessing, partitioning) happens
in `FederatedDataset <ref-api/flwr_datasets.FederatedDataset.html>`_. However, we
will use only the ``Partitioner`` here, since the data is locally accessible.

The rest of this guide will explain how to create a
`Dataset <https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset>`_
from local files and existing (in memory) Python objects.
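
For example, a minimal end-to-end sketch might look as follows (the in-memory data and the choice of ``IidPartitioner`` are made up for illustration; any other ``Partitioner`` works the same way):

.. code-block:: python

    from datasets import Dataset
    from flwr_datasets.partitioner import IidPartitioner

    # A stand-in for your local data
    dataset = Dataset.from_dict({"features": [1, 2, 3, 4], "labels": [0, 0, 1, 1]})

    # Assign the dataset once, then load partitions by id
    partitioner = IidPartitioner(num_partitions=2)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)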

Local Files
-----------
CSV
^^^
.. code-block:: python

    from datasets import load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    # Single file
    data_files = "path-to-my-file.csv"

    # Multiple files
    data_files = ["path-to-my-file-1.csv", "path-to-my-file-2.csv", ...]
    dataset = load_dataset("csv", data_files=data_files)

    # Divided dataset
    data_files = {
        "train": single_train_file_or_list_of_files,
        "test": single_test_file_or_list_of_files,
        "can-have-more-splits": ...,
    }
    dataset = load_dataset("csv", data_files=data_files)

    # Note: ``load_dataset`` returns a ``DatasetDict``, so pass a single split
    # (here ``"train"``) to the partitioner
    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset["train"]
    partition = partitioner.load_partition(partition_id=0)

JSON
^^^^

.. code-block:: python

    from datasets import load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    # Single file
    data_files = "path-to-my-file.json"

    # Multiple files
    data_files = ["path-to-my-file-1.json", "path-to-my-file-2.json", ...]
    dataset = load_dataset("json", data_files=data_files)

    # Divided dataset
    data_files = {
        "train": single_train_file_or_list_of_files,
        "test": single_test_file_or_list_of_files,
        "can-have-more-splits": ...,
    }
    dataset = load_dataset("json", data_files=data_files)

    # Note: ``load_dataset`` returns a ``DatasetDict``, so pass a single split
    # (here ``"train"``) to the partitioner
    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset["train"]
    partition = partitioner.load_partition(partition_id=0)

Image
^^^^^
You can create an image dataset in two ways:

1) give a path to the directory

The directory needs to be structured in the following way: ``dataset-name/split/class/name``. For example:

.. code-block::

    mnist/train/1/unique_name_1.png
    mnist/train/1/unique_name_2.png
    mnist/train/2/unique_name_3.png
    ...
    mnist/test/1/unique_name_4.png
    mnist/test/1/unique_name_5.png
    mnist/test/2/unique_name_6.png

Then, the path you pass is ``./mnist``.

.. code-block:: python

    from datasets import load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    # Directly from a directory; ``load_dataset`` returns a ``DatasetDict``,
    # so pass a single split (here ``"train"``) to the partitioner
    dataset = load_dataset("imagefolder", data_dir="/path/to/folder")

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset["train"]
    partition = partitioner.load_partition(partition_id=0)

2) create a dataset from a CSV/JSON file and cast the path column to Image.

.. code-block:: python

    from datasets import Image, load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    dataset = load_dataset(...)
    dataset = dataset.cast_column("path", Image())

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

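For instance, the ``load_dataset(...)`` placeholder above could be filled in as follows (the ``metadata.csv`` file name and its ``path`` column are hypothetical, for illustration only):

.. code-block:: python

    from datasets import Image, load_dataset

    # Hypothetical CSV with columns: path,label
    dataset = load_dataset("csv", data_files="metadata.csv")["train"]
    dataset = dataset.cast_column("path", Image())
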
Audio
^^^^^
Analogously to the image datasets, there are two methods here:

1) give a path to the directory

.. code-block:: python

    from datasets import load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    # ``load_dataset`` returns a ``DatasetDict``, so pass a single split
    # (here ``"train"``) to the partitioner
    dataset = load_dataset("audiofolder", data_dir="/path/to/folder")

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset["train"]
    partition = partitioner.load_partition(partition_id=0)

2) create a dataset from a CSV/JSON file and cast the path column to Audio.

.. code-block:: python

    from datasets import Audio, load_dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    dataset = load_dataset(...)
    dataset = dataset.cast_column("path", Audio())

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

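As with images, the ``load_dataset(...)`` placeholder could be filled in like this (again assuming a hypothetical ``metadata.csv`` whose ``path`` column points to audio files):

.. code-block:: python

    from datasets import Audio, load_dataset

    # Hypothetical CSV with columns: path,label
    dataset = load_dataset("csv", data_files="metadata.csv")["train"]
    dataset = dataset.cast_column("path", Audio())
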
In-Memory
---------

From dictionary
^^^^^^^^^^^^^^^
.. code-block:: python

    from datasets import Dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
    dataset = Dataset.from_dict(data)

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

From list
^^^^^^^^^
.. code-block:: python

    from datasets import Dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    my_list = [
        {"features": 1, "labels": 0},
        {"features": 2, "labels": 0},
        {"features": 3, "labels": 1},
    ]
    dataset = Dataset.from_list(my_list)

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

From pd.DataFrame
^^^^^^^^^^^^^^^^^
.. code-block:: python

    import pandas as pd
    from datasets import Dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    data = {"features": [1, 2, 3], "labels": [0, 0, 1]}
    df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

From np.ndarray
^^^^^^^^^^^^^^^
The ``np.ndarray`` is first transformed into a ``pd.DataFrame``:

.. code-block:: python

    import numpy as np
    import pandas as pd
    from datasets import Dataset
    from flwr_datasets.partitioner import ChosenPartitioner

    data = np.array([[1, 2, 3], [0, 0, 1]]).T
    # You can add the column names by passing columns=["features", "labels"]
    df = pd.DataFrame(data)
    dataset = Dataset.from_pandas(df)

    partitioner = ChosenPartitioner(...)
    partitioner.dataset = dataset
    partition = partitioner.load_partition(partition_id=0)

Partitioner Details
-------------------
Partitioning is triggered automatically during the first ``load_partition`` call.
You do not need to call any “do_partitioning” method.

The ``Partitioner`` abstraction is designed to allow for a single dataset assignment:

.. code-block:: python

    partitioner.dataset = your_dataset

If you need to apply the same partitioning to a different dataset, create a new
``Partitioner`` for it, e.g.:

.. code-block:: python

    from flwr_datasets.partitioner import IidPartitioner

    iid_partitioner_for_mnist = IidPartitioner(num_partitions=10)
    iid_partitioner_for_mnist.dataset = mnist_dataset

    iid_partitioner_for_cifar = IidPartitioner(num_partitions=10)
    iid_partitioner_for_cifar.dataset = cifar_dataset

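Each partitioner can then serve all of its partitions independently, e.g. (a usage sketch for the ``iid_partitioner_for_mnist`` created above):

.. code-block:: python

    mnist_partitions = [
        iid_partitioner_for_mnist.load_partition(partition_id=i) for i in range(10)
    ]
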
More Resources
--------------
If you need more details, or the format you are looking for is not covered here, please visit the `HuggingFace Datasets docs <https://huggingface.co/docs/datasets/index>`_.
This guide is based on the following ones:

* `General Information <https://huggingface.co/docs/datasets/en/loading>`_
* `Tabular Data <https://huggingface.co/docs/datasets/en/tabular_load>`_
* `Image Data <https://huggingface.co/docs/datasets/en/image_load>`_
* `Audio Data <https://huggingface.co/docs/datasets/en/audio_load>`_
1 change: 1 addition & 0 deletions datasets/doc/source/index.rst
@@ -31,6 +31,7 @@ Problem-oriented how-to guides show step-by-step how to achieve a specific goal.
how-to-use-with-pytorch
how-to-use-with-tensorflow
how-to-use-with-numpy
how-to-use-with-local-data
how-to-disable-enable-progress-bar

References
3 changes: 2 additions & 1 deletion datasets/flwr_datasets/federated_dataset.py
@@ -59,7 +59,8 @@ class FederatedDataset:
argument. Defaults to True.
seed : Optional[int]
Seed used for dataset shuffling. It has no effect if `shuffle` is False. The
seed cannot be set in the later stages.
seed cannot be set in the later stages. If `None`, then fresh, unpredictable entropy
will be pulled from the OS. Defaults to 42.
Examples
--------
2 changes: 1 addition & 1 deletion examples/embedded-devices/Dockerfile
@@ -8,7 +8,7 @@ RUN pip3 install --upgrade pip

# Install flower
RUN pip3 install flwr>=1.0
RUN pip3 install flwr-datsets>=0.2
RUN pip3 install flwr-datsets>=0.0.2
RUN pip3 install tqdm==4.65.0

WORKDIR /client
4 changes: 3 additions & 1 deletion src/py/flwr/cli/example.py
@@ -39,7 +39,9 @@ def example() -> None:
with urllib.request.urlopen(examples_directory_url) as res:
data = json.load(res)
example_names = [
item["path"] for item in data["tree"] if item["path"] not in [".gitignore"]
item["path"]
for item in data["tree"]
if item["path"] not in [".gitignore", "doc"]
]

example_name = prompt_options(
21 changes: 16 additions & 5 deletions src/py/flwr/cli/new/new.py
@@ -22,7 +22,12 @@
import typer
from typing_extensions import Annotated

from ..utils import prompt_options, prompt_text
from ..utils import (
is_valid_project_name,
prompt_options,
prompt_text,
sanitize_project_name,
)


class MlFramework(str, Enum):
@@ -81,6 +86,16 @@ def new(
] = None,
) -> None:
"""Create new Flower project."""
if project_name is None:
project_name = prompt_text("Please provide project name")
if not is_valid_project_name(project_name):
project_name = prompt_text(
"Please provide a name that only contains "
"characters in {'_', 'a-zA-Z', '0-9'}",
predicate=is_valid_project_name,
default=sanitize_project_name(project_name),
)

print(
typer.style(
f"🔨 Creating Flower project {project_name}...",
@@ -89,9 +104,6 @@
)
)

if project_name is None:
project_name = prompt_text("Please provide project name")

if framework is not None:
framework_str = str(framework.value)
else:
@@ -116,7 +128,6 @@
# List of files to render
files = {
"README.md": {"template": "app/README.md.tpl"},
"requirements.txt": {"template": f"app/requirements.{framework_str}.txt.tpl"},
"flower.toml": {"template": "app/flower.toml.tpl"},
"pyproject.toml": {"template": f"app/pyproject.{framework_str}.toml.tpl"},
f"{pnl}/__init__.py": {"template": "app/code/__init__.py.tpl"},
1 change: 0 additions & 1 deletion src/py/flwr/cli/new/new_test.py
@@ -66,7 +66,6 @@ def test_new(tmp_path: str) -> None:
project_name = "FedGPT"
framework = MlFramework.PYTORCH
expected_files_top_level = {
"requirements.txt",
"fedgpt",
"README.md",
"flower.toml",
22 changes: 11 additions & 11 deletions src/py/flwr/cli/new/templates/app/pyproject.numpy.toml.tpl
@@ -1,19 +1,19 @@
[build-system]
requires = ["poetry-core>=1.4.0"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.poetry]
[project]
name = "$project_name"
version = "1.0.0"
description = ""
license = "Apache-2.0"
authors = [
"The Flower Authors <hello@flower.ai>",
{ name = "The Flower Authors", email = "hello@flower.ai" },
]
license = {text = "Apache License (2.0)"}
dependencies = [
"flwr[simulation]>=1.8.0,<2.0",
"numpy>=1.21.0",
]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.9"
# Mandatory dependencies
numpy = "^1.21.0"
flwr = { version = "^1.8.0", extras = ["simulation"] }
[tool.hatch.build.targets.wheel]
packages = ["."]