update load_dataset docstring (#7301)

* update load_dataset docstring

* style

* minor

* drop python 3.8
lhoestq authored Nov 29, 2024
1 parent 17f17b3 commit 06c3235
Showing 10 changed files with 111 additions and 99 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/ci.yml
@@ -21,7 +21,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
@@ -44,10 +44,10 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Set up Python 3.8
- name: Set up Python 3.9
uses: actions/setup-python@v5
with:
python-version: "3.8"
python-version: "3.9"
- name: Upgrade pip
run: python -m pip install --upgrade pip
- name: Pin setuptools-scm
2 changes: 1 addition & 1 deletion .github/workflows/release-conda.yml
@@ -25,7 +25,7 @@ jobs:
auto-update-conda: true
auto-activate-base: false
activate-environment: "build-datasets"
python-version: 3.8
python-version: 3.9
channels: huggingface

- name: Setup conda env
3 changes: 1 addition & 2 deletions setup.py
@@ -251,7 +251,7 @@
"datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"],
},
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
python_requires=">=3.8.0",
python_requires=">=3.9.0",
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
classifiers=[
@@ -262,7 +262,6 @@
"License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
2 changes: 1 addition & 1 deletion src/datasets/arrow_dataset.py
@@ -372,7 +372,7 @@ def to_tf_dataset(
a small buffer of batches for training. Improves performance by allowing data to be loaded in the
background while the model is training.
num_workers (`int`, defaults to `0`):
Number of workers to use for loading the dataset. Only supported on Python versions >= 3.8.
Number of workers to use for loading the dataset.
num_test_batches (`int`, defaults to `20`):
Number of batches to use to infer the output signature of the dataset.
The higher this number, the more accurate the signature will be, but the longer it will take to
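For context, a minimal sketch of passing `num_workers` to `to_tf_dataset` now that the Python 3.8 caveat is dropped; the column names are illustrative and TensorFlow must be installed for this method:

```py
import numpy as np
from datasets import Dataset

# Toy in-memory dataset; "feature" and "label" are illustrative column names
ds = Dataset.from_dict({"feature": np.random.rand(64, 4).tolist(), "label": [0, 1] * 32})

tf_ds = ds.to_tf_dataset(
    columns="feature",
    label_cols="label",
    batch_size=8,
    shuffle=True,
    prefetch=True,   # keep a small buffer of batches ready in the background
    num_workers=2,   # background workers load data while the model trains
)
```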
121 changes: 59 additions & 62 deletions src/datasets/load.py
@@ -242,9 +242,11 @@ def __reduce__(self): # to make dynamically created class pickable, see _Initia
def get_dataset_builder_class(
dataset_module: "DatasetModule", dataset_name: Optional[str] = None
) -> Type[DatasetBuilder]:
with lock_importable_file(
dataset_module.importable_file_path
) if dataset_module.importable_file_path else nullcontext():
with (
lock_importable_file(dataset_module.importable_file_path)
if dataset_module.importable_file_path
else nullcontext()
):
builder_cls = import_main_class(dataset_module.module_path)
if dataset_module.builder_configs_parameters.builder_configs:
dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
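The refactored `with` statement above takes a file lock only when a dataset script file exists and falls back to `contextlib.nullcontext()` otherwise. A standalone sketch of that pattern, with a `FileLock`-based helper as a stand-in rather than the library's exact implementation:

```py
from contextlib import nullcontext

from filelock import FileLock


def lock_importable_file(importable_file_path: str) -> FileLock:
    # Hypothetical stand-in for the internal helper: lock the script file so
    # concurrent processes don't import a partially written module.
    return FileLock(importable_file_path + ".lock")


importable_file_path = None  # e.g. packaged builders (csv, parquet, ...) have no script

# Lock only when there is a script to protect; nullcontext() is a no-op otherwise,
# so the body of the `with` block stays identical in both cases.
with (
    lock_importable_file(importable_file_path)
    if importable_file_path
    else nullcontext()
):
    pass  # import the builder class here
```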
@@ -1751,42 +1753,36 @@ def load_dataset_builder(
_require_default_config_name=True,
**config_kwargs,
) -> DatasetBuilder:
"""Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
without downloading the dataset itself.
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
"""Load a dataset builder which can be used to:
A dataset is a directory that contains:
- Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
- Download and prepare the dataset as Arrow files in the cache
- Get a streaming dataset without downloading or caching anything
- some data files in generic formats (JSON, CSV, Parquet, text, etc.)
- and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
Args:
path (`str`):
Path or name of the dataset.
Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.
For local datasets:
- if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
-> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
- if `path` is a local directory (containing data files only)
-> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
- if `path` is a local directory
-> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
e.g. `'./path/to/directory/with/my/csv/data'`.
- if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script
e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])
- if `path` is a dataset repository on the HF hub (containing data files only)
-> load a generic dataset builder (csv, text etc.) based on the content of the repository
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
- if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script in the dataset repository
e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
- if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
(available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
-> load the dataset builder from the files in `data_files` or `data_dir`
e.g. `'parquet'`.
It can also point to a local dataset script but this is not recommended.
name (`str`, *optional*):
Defining the name of the dataset configuration.
data_dir (`str`, *optional*):
@@ -1837,7 +1833,7 @@ def load_dataset_builder(
```py
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('rotten_tomatoes')
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
>>> ds_builder.info.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
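A short usage sketch of the three capabilities the new docstring lists, inspecting info, preparing the cache, and streaming, using the public `DatasetBuilder` methods `download_and_prepare`, `as_dataset`, and `as_streaming_dataset`; treat it as illustrative rather than part of the documented example:

```py
from datasets import load_dataset_builder

ds_builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")

# 1. Inspect general information without downloading anything
print(ds_builder.info.features)
print(ds_builder.info.splits)

# 2. Download and prepare the dataset as Arrow files in the cache, then load it
ds_builder.download_and_prepare()
train_ds = ds_builder.as_dataset(split="train")

# 3. Get a streaming dataset without downloading or caching anything
streaming_ds = ds_builder.as_streaming_dataset(split="train")
```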
@@ -1931,61 +1927,55 @@ def load_dataset(
You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
A dataset is a directory that contains:
- some data files in generic formats (JSON, CSV, Parquet, text, etc.).
- and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)
This function does the following under the hood:
1. Download and import in the library the dataset script from `path` if it's not already cached inside the library.
1. Load a dataset builder:
If the dataset has no dataset script, then a generic dataset script is imported instead (JSON, CSV, Parquet, text, etc.)
* Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
* Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
* It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").
Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
contain the path or URL to the original data files and the code to load examples from the original data files.
2. Run the dataset builder:
You can find the complete list of datasets in the Datasets [Hub](https://huggingface.co/datasets).
In the general case:
2. Run the dataset script which will:
* Download the dataset file from the original URL (see the script) if it's not already available locally or cached.
* Download the data files from the dataset if they are not already available locally or cached.
* Process and cache the dataset in typed Arrow tables for caching.
Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
They can be directly accessed from disk, loaded in RAM or even streamed over the web.
In the streaming case:
* Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.
3. Return a dataset built from the requested splits in `split` (default: all).
It also allows to load a dataset from a local directory or a dataset repository on the Hugging Face Hub without dataset script.
In this case, it automatically loads all the data files from the directory or the dataset repository.
It can also use a custom dataset builder if the dataset contains a dataset script, but this feature is mostly for backward compatibility.
In this case the dataset script file must be named after the dataset repository or directory and end with ".py".
Args:
path (`str`):
Path or name of the dataset.
Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.
For local datasets:
- if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
-> load the dataset from supported files in the repository (csv, json, parquet, etc.)
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.
- if `path` is a local directory (containing data files only)
-> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
- if `path` is a local directory
-> load the dataset from supported files in the directory (csv, json, parquet, etc.)
e.g. `'./path/to/directory/with/my/csv/data'`.
- if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script
e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])
- if `path` is a dataset repository on the HF hub (containing data files only)
-> load a generic dataset builder (csv, text etc.) based on the content of the repository
e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
- if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-> load the dataset builder from the dataset script in the dataset repository
e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
- if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
(available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
-> load the dataset from the files in `data_files` or `data_dir`
e.g. `'parquet'`.
It can also point to a local dataset script but this is not recommended.
name (`str`, *optional*):
Defining the name of the dataset configuration.
data_dir (`str`, *optional*):
@@ -2072,11 +2062,18 @@ def load_dataset(
```py
>>> from datasets import load_dataset
>>> ds = load_dataset('rotten_tomatoes', split='train')
>>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')
# Map data files to splits
# Load a subset or dataset configuration (here 'sst2')
>>> from datasets import load_dataset
>>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')
# Manual mapping of data files to splits
>>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
>>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)
# Manual selection of a directory to load
>>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
```
Load a local dataset:
@@ -2090,7 +2087,7 @@ def load_dataset(
>>> from datasets import load_dataset
>>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')
# Load from a local loading script
# Load from a local loading script (not recommended)
>>> from datasets import load_dataset
>>> ds = load_dataset('path/to/local/loading_script/loading_script.py', split='train')
```
@@ -2099,7 +2096,7 @@ def load_dataset(
```py
>>> from datasets import load_dataset
>>> ds = load_dataset('rotten_tomatoes', split='train', streaming=True)
>>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
```
Load an image dataset with the `ImageFolder` dataset builder:
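The excerpt cuts off at the `ImageFolder` heading; a hedged sketch of what such an example typically looks like, with the directory layout assumed for illustration:

```py
from datasets import load_dataset

# Assumed layout: one sub-folder per class label, e.g.
#   path/to/images/cat/001.png
#   path/to/images/dog/002.png
ds = load_dataset("imagefolder", data_dir="path/to/images", split="train")

# Streaming from the Hub works the same way: nothing is downloaded up front
streaming_ds = load_dataset(
    "cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True
)
print(next(iter(streaming_ds)))
```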
41 changes: 24 additions & 17 deletions tests/test_arrow_dataset.py
@@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset,
):
columns = dset.column_names

self.assertIsNotNone(dset[0])
@@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset,
):
columns = dset.column_names

self.assertIsNotNone(dset[0])
@@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory):
import tensorflow as tf
import torch

with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
in_memory, tmp_dir
) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset:
with (
tempfile.TemporaryDirectory() as tmp_dir,
self._create_dummy_dataset(in_memory, tmp_dir) as dset,
dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset,
):
self.assertDictEqual(
dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
)
@@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self):
info1 = DatasetInfo(description="Dataset1")
info2 = DatasetInfo(description="Dataset2")
with tempfile.TemporaryDirectory() as tmp_dir:
with Dataset.from_dict(data1, info=info1).map(
cache_file_name=os.path.join(tmp_dir, "d1.arrow")
) as dset1, Dataset.from_dict(data2, info=info2).map(
cache_file_name=os.path.join(tmp_dir, "d2.arrow")
) as dset2, Dataset.from_dict(data3) as dset3:
with (
Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1,
Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2,
Dataset.from_dict(data3) as dset3,
):
with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
@@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path):
)
def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):
method, args, kwargs = method_and_params
with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file(
arrow_file, in_memory=in_memory
) as reference_dataset:
with (
Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,
Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,
):
out = getattr(dataset, method)(*args, **kwargs)
dataset = out if out is not None else dataset
pickled_dataset = pickle.dumps(dataset)
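The test changes above all follow the same mechanical refactor: multiple context managers previously chained on one wrapped line are grouped with parentheses, a form accepted by CPython's PEG parser from 3.9 and documented in Python 3.10, hence available once 3.8 is dropped. A generic before/after sketch; the patch targets here are arbitrary standard-library functions chosen only so the snippet runs:

```py
from unittest.mock import patch

# Before: context managers chained on one line and wrapped awkwardly
with patch("os.getcwd") as mock_getcwd, patch(
    "os.listdir"
) as mock_listdir:
    mock_getcwd.return_value = "/tmp"

# After: parenthesized context managers, one per line with a trailing comma
with (
    patch("os.getcwd") as mock_getcwd,
    patch("os.listdir") as mock_listdir,
):
    mock_getcwd.return_value = "/tmp"
```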
7 changes: 5 additions & 2 deletions tests/test_hub.py
@@ -84,7 +84,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
- name: train
num_bytes: 55
num_examples: 5
download_size: 726
download_size: 717
dataset_size: 55
{METADATA_CONFIGS_FIELD}:
- config_name: first
@@ -105,7 +105,7 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
- name: train
num_bytes: 60
num_examples: 5
download_size: 732
download_size: 723
dataset_size: 60
{METADATA_CONFIGS_FIELD}:
- config_name: second
@@ -115,6 +115,9 @@ def test_convert_to_parquet(temporary_repo, hf_api, hf_token, ci_hub_config, ci_
---
"""),
]
if PYARROW_VERSION < version.parse("18.1.0"):
expected_readmes[0] = expected_readmes[0].replace("download_size: 717", "download_size: 726")
expected_readmes[1] = expected_readmes[1].replace("download_size: 723", "download_size: 732")
if PYARROW_VERSION < version.parse("18.0.0"):
expected_readmes[0] = expected_readmes[0].replace("download_size: 726", "download_size: 790")
expected_readmes[1] = expected_readmes[1].replace("download_size: 732", "download_size: 798")
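The expected `download_size` values depend on how the installed PyArrow writes Parquet metadata, so the test now branches on the version. A sketch of how such a version gate is typically derived; the exact definition of the constant inside `datasets.config` is an assumption here:

```py
from packaging import version

import pyarrow

# Assumed to mirror how a PYARROW_VERSION constant can be built
PYARROW_VERSION = version.parse(pyarrow.__version__)

# Older PyArrow releases produce slightly larger Parquet files, so the
# expected download_size in the README metadata differs by version.
expected_download_size = 717
if PYARROW_VERSION < version.parse("18.1.0"):
    expected_download_size = 726
if PYARROW_VERSION < version.parse("18.0.0"):
    expected_download_size = 790
```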
7 changes: 4 additions & 3 deletions tests/test_py_utils.py
@@ -116,9 +116,10 @@ class Foo:
],
)
def test_map_nested_num_proc(iterable_length, num_proc, expected_num_proc):
with patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested, patch(
"datasets.parallel.parallel.Pool"
) as mock_multiprocessing_pool:
with (
patch("datasets.utils.py_utils._single_map_nested") as mock_single_map_nested,
patch("datasets.parallel.parallel.Pool") as mock_multiprocessing_pool,
):
data_struct = {f"{i}": i for i in range(iterable_length)}
_ = map_nested(lambda x: x + 10, data_struct, num_proc=num_proc, parallel_min_length=16)
if expected_num_proc == 1:
16 changes: 10 additions & 6 deletions tests/test_search.py
@@ -88,9 +88,11 @@ def test_add_elasticsearch_index(self):
from elasticsearch import Elasticsearch

dset: Dataset = self._create_dummy_dataset()
with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
"elasticsearch.client.IndicesClient.create"
) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
with (
patch("elasticsearch.Elasticsearch.search") as mocked_search,
patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
):
mocked_index_create.return_value = {"acknowledged": True}
mocked_bulk.return_value([(True, None)] * 30)
mocked_search.return_value = {"hits": {"hits": [{"_score": 1, "_id": 29}]}}
@@ -198,9 +200,11 @@ class ElasticSearchIndexTest(TestCase):
def test_elasticsearch(self):
from elasticsearch import Elasticsearch

with patch("elasticsearch.Elasticsearch.search") as mocked_search, patch(
"elasticsearch.client.IndicesClient.create"
) as mocked_index_create, patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk:
with (
patch("elasticsearch.Elasticsearch.search") as mocked_search,
patch("elasticsearch.client.IndicesClient.create") as mocked_index_create,
patch("elasticsearch.helpers.streaming_bulk") as mocked_bulk,
):
es_client = Elasticsearch()
mocked_index_create.return_value = {"acknowledged": True}
index = ElasticSearchIndex(es_client=es_client)
5 changes: 3 additions & 2 deletions tests/test_upstream_hub.py
@@ -242,8 +242,9 @@ def test_push_dataset_dict_to_hub_with_multiple_commits(self, temporary_repo):
with temporary_repo() as ds_name:
self._api.create_repo(ds_name, token=self._token, repo_type="dataset")
num_commits_before_push = len(self._api.list_repo_commits(ds_name, repo_type="dataset", token=self._token))
with patch("datasets.config.MAX_SHARD_SIZE", "16KB"), patch(
"datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1
with (
patch("datasets.config.MAX_SHARD_SIZE", "16KB"),
patch("datasets.config.UPLOADS_MAX_NUMBER_PER_COMMIT", 1),
):
local_ds.push_to_hub(ds_name, token=self._token)
hub_ds = load_dataset(ds_name, download_mode="force_redownload")
