update load_dataset docstring (#7301)

* update load_dataset docstring

* style

* minor

* drop python 3.8
lhoestq authored Nov 29, 2024
1 parent 17f17b3 commit 06c3235
Showing 10 changed files with 111 additions and 99 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
@@ -44,10 +44,10 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Upgrade pip
run: python -m pip install --upgrade pip
- name: Pin setuptools-scm
2 changes: 1 addition & 1 deletion .github/workflows/release-conda.yml
@@ -25,7 +25,7 @@ jobs:
auto-update-conda: true
auto-activate-base: false
activate-environment: "build-datasets"
python-version: 3.8
python-version: 3.9
channels: huggingface

- name: Setup conda env
3 changes: 1 addition & 2 deletions setup.py
@@ -251,7 +251,7 @@
"datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"],
},
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
python_requires=">=3.8.0",
python_requires=">=3.9.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
classifiers=[
@@ -262,7 +262,6 @@
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
2 changes: 1 addition & 1 deletion src/datasets/arrow_dataset.py
@@ -372,7 +372,7 @@ def to_tf_dataset(
a small buffer of batches for training. Improves performance by allowing data to be loaded in the
background while the model is training.
num_workers (`int`, defaults to `0`):
Number of workers to use for loading the dataset. Only supported on Python versions >= 3.8.
Number of workers to use for loading the dataset.
num_test_batches (`int`, defaults to `20`):
Number of batches to use to infer the output signature of the dataset.
The higher this number, the more accurate the signature will be, but the longer it will take to
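For context, a minimal sketch of passing `num_workers` to `to_tf_dataset` now that the Python 3.8 caveat is dropped; the column names are illustrative and TensorFlow must be installed for this method:

```py
import numpy as np
from datasets import Dataset

# Toy in-memory dataset; "feature" and "label" are illustrative column names
ds = Dataset.from_dict({"feature": np.random.rand(64, 4).tolist(), "label": [0, 1] * 32})

tf_ds = ds.to_tf_dataset(
    columns="feature",
    label_cols="label",
    batch_size=8,
    shuffle=True,
    prefetch=True,   # keep a small buffer of batches ready in the background
    num_workers=2,   # background workers load data while the model trains
)
```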
121 changes: 59 additions & 62 deletions src/datasets/load.py
@@ -242,9 +242,11 @@ def __reduce__(self): # to make dynamically created class pickable, see _Initia
def get_dataset_builder_class(
dataset_module: "DatasetModule", dataset_name: Optional[str] = None
) -> Type[DatasetBuilder]:
with lock_importable_file(
dataset_module.importable_file_path
) if dataset_module.importable_file_path else nullcontext():
with (
lock_importable_file(dataset_module.importable_file_path)
if dataset_module.importable_file_path
else nullcontext()
):
builder_cls = import_main_class(dataset_module.module_path)
if dataset_module.builder_configs_parameters.builder_configs:
dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
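The refactored `with` statement above takes a file lock only when a dataset script file exists and falls back to `contextlib.nullcontext()` otherwise. A standalone sketch of that pattern, with a `FileLock`-based helper as a stand-in rather than the library's exact implementation:

```py
from contextlib import nullcontext

from filelock import FileLock


def lock_importable_file(importable_file_path: str) -> FileLock:
    # Hypothetical stand-in for the internal helper: lock the script file so
    # concurrent processes don't import a partially written module.
    return FileLock(importable_file_path + ".lock")


importable_file_path = None  # e.g. packaged builders (csv, parquet, ...) have no script

# Lock only when there is a script to protect; nullcontext() is a no-op otherwise,
# so the body of the `with` block stays identical in both cases.
with (
    lock_importable_file(importable_file_path)
    if importable_file_path
    else nullcontext()
):
    pass  # import the builder class here
```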
@@ -1751,42 +1753,36 @@ def load_dataset_builder(
_require_default_config_name=True,
**config_kwargs,
) -> DatasetBuilder:
"""Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
without downloading the dataset itself.
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
"""Load a dataset builder which can be used to:
A dataset is a directory that contains:
- Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
- Download and prepare the dataset as Arrow files in the cache
- Get a streaming dataset without downloading or caching anything
- some data files in generic formats (JSON, CSV, Parquet, text, etc.)
- and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
Args:
path (`str`):
Path or name of the dataset.
Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.
For local datasets:
- if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
-> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
- if `path` is a local directory (containing data files only)
-> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
- if `path` is a local directory
-> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
e.g. `'./path/to/directory/with/my/csv/data'`.
- if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script
e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])
- if `path` is a dataset repository on the HF hub (containing data files only)
-> load a generic dataset builder (csv, text etc.) based on the content of the repository
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
- if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script in the dataset repository
e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
- if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
(available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
-> load the dataset builder from the files in `data_files` or `data_dir`
e.g. `'parquet'`.
It can also point to a local dataset script but this is not recommended.
name (`str`, *optional*):
Defining the name of the dataset configuration.
data_dir (`str`, *optional*):
@@ -1837,7 +1833,7 @@ def load_dataset_builder(
```py
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('rotten_tomatoes')
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
>>> ds_builder.info.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
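A short usage sketch of the three capabilities the new docstring lists, inspecting info, preparing the cache, and streaming, using the public `DatasetBuilder` methods `download_and_prepare`, `as_dataset`, and `as_streaming_dataset`; treat it as illustrative rather than part of the documented example:

```py
from datasets import load_dataset_builder

ds_builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")

# 1. Inspect general information without downloading anything
print(ds_builder.info.features)
print(ds_builder.info.splits)

# 2. Download and prepare the dataset as Arrow files in the cache, then load it
ds_builder.download_and_prepare()
train_ds = ds_builder.as_dataset(split="train")

# 3. Get a streaming dataset without downloading or caching anything
streaming_ds = ds_builder.as_streaming_dataset(split="train")
```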
@@ -1931,61 +1927,55 @@ def load_dataset(
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
A dataset is a directory that contains:
- some data files in generic formats (JSON, CSV, Parquet, text, etc.).
- and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
This function does the following under the hood:
1. Download and import in the library the dataset script from `path` if it's not already cached inside the library.
1. Load a dataset builder:
If the dataset has no dataset script, then a generic dataset script is imported instead (JSON, CSV, Parquet, text, etc.)
* Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
* Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
* It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").
Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
contain the path or URL to the original data files and the code to load examples from the original data files.
2. Run the dataset builder:
You can find the complete list of datasets in the Datasets [Hub](https://huggingface.co/datasets).
In the general case:
2. Run the dataset script which will:
* Download the dataset file from the original URL (see the script) if it's not already available locally or cached.
* Download the data files from the dataset if they are not already available locally or cached.
* Process and cache the dataset in typed Arrow tables for caching.
Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
They can be directly accessed from disk, loaded in RAM or even streamed over the web.
In the streaming case:
* Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.
3. Return a dataset built from the requested splits in `split` (default: all).
It also allows to load a dataset from a local directory or a dataset repository on the Hugging Face Hub without dataset script.
In this case, it automatically loads all the data files from the directory or the dataset repository.
It can also use a custom dataset builder if the dataset contains a dataset script, but this feature is mostly for backward compatibility.
In this case the dataset script file must be named after the dataset repository or directory and end with ".py".
Args:
path (`str`):
Path or name of the dataset.
Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.
For local datasets:
- if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
-> load the dataset from supported files in the repository (csv, json, parquet, etc.)
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
- if `path` is a local directory (containing data files only)
-> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
- if `path` is a local directory
-> load the dataset from supported files in the directory (csv, json, parquet, etc.)
e.g. `'./path/to/directory/with/my/csv/data'`.
- if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script
e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])
- if `path` is a dataset repository on the HF hub (containing data files only)
-> load a generic dataset builder (csv, text etc.) based on the content of the repository
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
- if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script in the dataset repository
e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
- if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
(available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
-> load the dataset from the files in `data_files` or `data_dir`
e.g. `'parquet'`.
It can also point to a local dataset script but this is not recommended.
name (`str`, *optional*):
Defining the name of the dataset configuration.
data_dir (`str`, *optional*):
@@ -2072,11 +2062,18 @@ def load_dataset(
```py
>>> from datasets import load_dataset
>>> ds = load_dataset('rotten_tomatoes', split='train')
>>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')
# Map data files to splits
# Load a subset or dataset configuration (here 'sst2')
>>> from datasets import load_dataset
>>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')
# Manual mapping of data files to splits
>>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
>>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)
# Manual selection of a directory to load
>>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
```
Load a local dataset:
@@ -2090,7 +2087,7 @@ def load_dataset(
>>> from datasets import load_dataset
>>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')
# Load from a local loading script
# Load from a local loading script (not recommended)
>>> from datasets import load_dataset
>>> ds = load_dataset('path/to/local/loading_script/loading_script.py', split='train')
```
@@ -2099,7 +2096,7 @@ def load_dataset(
```py
>>> from datasets import load_dataset
>>> ds = load_dataset('rotten_tomatoes', split='train', streaming=True)
>>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
```
Load an image dataset with the `ImageFolder` dataset builder:
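The excerpt cuts off at the `ImageFolder` heading; a hedged sketch of what such an example typically looks like, with the directory layout assumed for illustration:

```py
from datasets import load_dataset

# Assumed layout: one sub-folder per class label, e.g.
#   path/to/images/cat/001.png
#   path/to/images/dog/002.png
ds = load_dataset("imagefolder", data_dir="path/to/images", split="train")

# Streaming from the Hub works the same way: nothing is downloaded up front
streaming_ds = load_dataset(
    "cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True
)
print(next(iter(streaming_ds)))
```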
41 changes: 24 additions & 17 deletions tests/test_arrow_dataset.py
@@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset,
):
columns = dset.column_names

self.assertIsNotNone(dset[0])
@@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset,
):
columns = dset.column_names

self.assertIsNotNone(dset[0])
@@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset,
):
self.assertDictEqual(
dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
)
@@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self):
info1 = DatasetInfo(description="Dataset1")
info2 = DatasetInfo(description="Dataset2")
with tempfile.TemporaryDirectory() as tmp_dir:
with Dataset.from_dict(data1, info=info1).map(
cache_file_name=os.path.join(tmp_dir, "d1.arrow")
) as dset1, Dataset.from_dict(data2, info=info2).map(
cache_file_name=os.path.join(tmp_dir, "d2.arrow")
) as dset2, Dataset.from_dict(data3) as dset3:
with (
Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1,
Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2,
Dataset.from_dict(data3) as dset3,
):
with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
@@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path):
)
def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):
method, args, kwargs = method_and_params
with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file(
arrow_file, in_memory=in_memory
) as reference_dataset:
with (
Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,
Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,
):
out = getattr(dataset, method)(*args, **kwargs)
dataset = out if out is not None else dataset
pickled_dataset = pickle.dumps(dataset)
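The test changes above all follow the same mechanical refactor: multiple context managers previously chained on one wrapped line are grouped with parentheses, a form accepted by CPython's PEG parser from 3.9 and documented in Python 3.10, hence available once 3.8 is dropped. A generic before/after sketch; the patch targets here are arbitrary standard-library functions chosen only so the snippet runs:

```py
from unittest.mock import patch

# Before: context managers chained on one line and wrapped awkwardly
with patch("os.getcwd") as mock_getcwd, patch(
    "os.listdir"
) as mock_listdir:
    mock_getcwd.return_value = "/tmp"

# After: parenthesized context managers, one per line with a trailing comma
with (
    patch("os.getcwd") as mock_getcwd,
    patch("os.listdir") as mock_listdir,
):
    mock_getcwd.return_value = "/tmp"
```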
7 changes: 5 additions & 2 deletions tests/test_hub.py
@@ -84,7 +84,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
- name: train
num_bytes: 55
num_examples: 5
download_size: 726
download_size: 717
dataset_size: 55
{METADATA_CONFIGS_FIELD}:
- config_name: first
@@ -105,7 +105,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
- name: train
num_bytes: 60
num_examples: 5
download_size: 732
download_size: 723
dataset_size: 60
{METADATA_CONFIGS_FIELD}:
- config_name: second
@@ -115,6 +115,9 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
---
"""),
]
if PYARROW_VERSION < version.parse("18.1.0"):
expected_readmes[0] = expected_readmes[0].replace("download_size: 717", "download_size: 726")
expected_readmes[1] = expected_readmes[1].replace("download_size: 723", "download_size: 732")
if PYARROW_VERSION < version.parse("18.0.0"):
expected_readmes[0] = expected_readmes[0].replace("download_size: 726", "download_size: 790")
expected_readmes[1] = expected_readmes[1].replace("download_size: 732", "download_size: 798")
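The expected `download_size` values depend on how the installed PyArrow writes Parquet metadata, so the test now branches on the version. A sketch of how such a version gate is typically derived; the exact definition of the constant inside `datasets.config` is an assumption here:

```py
from packaging import version

import pyarrow

# Assumed to mirror how a PYARROW_VERSION constant can be built
PYARROW_VERSION = version.parse(pyarrow.__version__)

# Older PyArrow releases produce slightly larger Parquet files, so the
# expected download_size in the README metadata differs by version.
expected_download_size = 717
if PYARROW_VERSION < version.parse("18.1.0"):
    expected_download_size = 726
if PYARROW_VERSION < version.parse("18.0.0"):
    expected_download_size = 790
```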
7 changes: 4 additions & 3 deletions tests/test_py_utils.py
@@ -116,9 +116,10 @@ class Foo:
],
)
def test_map_nested_num_proc(iterable_length, num_proc, expected_num_proc):
with patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, patch(
"datasets.parallel.parallel.Pool"
) as mock_multiprocessing_pool:
with (
patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested,
patch("datasets.parallel.parallel.Pool") as mock_multiprocessing_pool,
):
data_struct = {f"{i}": i for i in range(iterable_length)}
_ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16)
if expected_num_proc == 1:
16 changes: 10 additions & 6 deletions tests/test_search.py
@@ -88,9 +88,11 @@ def test_add_elasticsearch_index(self):
from elasticsearch import Elasticsearch

dset: Dataset = self._create_dummy_dataset()
with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
"elasticsearch.client.IndicesClient.create"
) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
with (
patch("elasticsearch.Elasticsearch.search") as mocked_search,
patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
):
mocked_index_create.return_value = {"acknowledged": True}
mocked_bulk.return_value([(True, None)] * 30)
mocked_search.return_value = {"hits": {"hits": [{"_score": 1, "_id": 29}]}}
@@ -198,9 +200,11 @@ class ElasticSearchIndexTest(TestCase):
def test_elasticsearch(self):
from elasticsearch import Elasticsearch

with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
"elasticsearch.client.IndicesClient.create"
) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
with (
patch("elasticsearch.Elasticsearch.search") as mocked_search,
patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
):
es_client = Elasticsearch()
mocked_index_create.return_value = {"acknowledged": True}
index = ElasticSearchIndex(es_client=es_client)
5 changes: 3 additions & 2 deletions tests/test_upstream_hub.py
@@ -242,8 +242,9 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
with temporary_repo() as ds_name:
self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
with patch("datasets.config.MAX_SHARD_SIZE", "16KB"), patch(
"datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1
with (
patch("datasets.config.MAX_SHARD_SIZE", "16KB"),
patch("datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1),
):
local_ds.push_to_hub(ds_name, token=self._token)
hub_ds = load_dataset(ds_name, download_mode="force_redownload")
