From b60ebb83cd668decaa21df66148beb44bce57739 Mon Sep 17 00:00:00 2001 From: Sergio Paniego Blanco Date: Fri, 6 Dec 2024 12:32:01 +0100 Subject: [PATCH] Updated inconsistent output in documentation examples for `ClassLabel` (#7293) * Updated ClassLabel output in features.py * Updated ClassLabel output in load.py * Updated ClassLabel output in iterable_dataset.py * Updated ClassLabel output in dataset_dict.py * Updated ClassLabel output in builder.py * Updated ClassLabel output in arrow_dataset.py * Updated docs * Added missing comma * Updated python code * Update src/datasets/builder.py Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/about_dataset_features.mdx | 2 +- docs/source/load_hub.mdx | 2 +- docs/source/loading.mdx | 2 +- docs/source/process.mdx | 4 ++-- docs/source/stream.mdx | 4 ++-- src/datasets/arrow_dataset.py | 10 +++++----- src/datasets/builder.py | 6 +++--- src/datasets/dataset_dict.py | 22 +++++++++++----------- src/datasets/features/features.py | 8 ++++---- src/datasets/iterable_dataset.py | 6 +++--- src/datasets/load.py | 2 +- 11 files changed, 34 insertions(+), 34 deletions(-) diff --git a/docs/source/about_dataset_features.mdx b/docs/source/about_dataset_features.mdx index 12a85477645..f9b93fa9cb8 100644 --- a/docs/source/about_dataset_features.mdx +++ b/docs/source/about_dataset_features.mdx @@ -11,7 +11,7 @@ Let's have a look at the features of the MRPC dataset from the GLUE benchmark: >>> dataset = load_dataset('glue', 'mrpc', split='train') >>> dataset.features {'idx': Value(dtype='int32', id=None), - 'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), + 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), } diff --git a/docs/source/load_hub.mdx 
b/docs/source/load_hub.mdx index 93a27611fd7..d2c71754bc6 100644 --- a/docs/source/load_hub.mdx +++ b/docs/source/load_hub.mdx @@ -20,7 +20,7 @@ Movie Review Dataset. This is a dataset of containing 5,331 positive and 5,331 n # Inspect dataset features >>> ds_builder.info.features -{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), +{'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} ``` diff --git a/docs/source/loading.mdx b/docs/source/loading.mdx index 49feeba419d..e30037e3cc7 100644 --- a/docs/source/loading.mdx +++ b/docs/source/loading.mdx @@ -435,7 +435,7 @@ Now when you look at your dataset features, you can see it uses the custom label ```py >>> dataset['train'].features {'text': Value(dtype='string', id=None), -'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None)} +'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)} ``` ## (Legacy) Local loading script diff --git a/docs/source/process.mdx b/docs/source/process.mdx index 38989613ef3..198b7509456 100644 --- a/docs/source/process.mdx +++ b/docs/source/process.mdx @@ -225,7 +225,7 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column >>> dataset.features {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), -'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), +'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)} >>> from datasets import ClassLabel, Value @@ -236,7 +236,7 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column >>> dataset.features {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), -'label': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None), +'label': 
ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int64', id=None)} ``` diff --git a/docs/source/stream.mdx b/docs/source/stream.mdx index 0be393ce4a8..f17899aa438 100644 --- a/docs/source/stream.mdx +++ b/docs/source/stream.mdx @@ -229,7 +229,7 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum >>> dataset.features {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), -'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None), +'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)} >>> from datasets import ClassLabel, Value @@ -240,7 +240,7 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum >>> dataset.features {'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), -'label': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None), +'label': ClassLabel(names=['negative', 'positive'], id=None), 'idx': Value(dtype='int64', id=None)} ``` diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c6ac25a30b5..aa7a919c566 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -2023,14 +2023,14 @@ def cast( >>> from datasets import load_dataset, ClassLabel, Value >>> ds = load_dataset("rotten_tomatoes", split="validation") >>> ds.features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> new_features = ds.features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds.features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': 
Value(dtype='large_string', id=None)} ``` """ @@ -2078,14 +2078,14 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option Example: ```py - >>> from datasets import load_dataset + >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("rotten_tomatoes", split="validation") >>> ds.features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds.features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='string', id=None)} ``` """ diff --git a/src/datasets/builder.py b/src/datasets/builder.py index c3eee41c6e0..1200749538e 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -510,9 +510,9 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict: ```py >>> from datasets import load_dataset_builder - >>> ds_builder = load_dataset_builder('rotten_tomatoes') + >>> ds_builder = load_dataset_builder('vivos') >>> ds_builder.get_all_exported_dataset_infos() - {'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. 
This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.\n", citation='@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}\n', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231)} + {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, 
shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)} ``` """ return DatasetInfosDict.from_directory(cls.get_imported_module_dir()) @@ -526,7 +526,7 @@ def get_exported_dataset_info(self) -> DatasetInfo: >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('rotten_tomatoes') >>> ds_builder.get_exported_dataset_info() - DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.\n", citation='@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}\n', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, 
download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231) + DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None) ``` """ return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo()) diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py index e2981fddf82..b06f7ffb97c 100644 --- a/src/datasets/dataset_dict.py +++ b/src/datasets/dataset_dict.py @@ -275,17 +275,17 @@ def cast(self, features: Features) -> "DatasetDict": Example: ```py - >>> from datasets import load_dataset + >>> from datasets import load_dataset, ClassLabel, Value >>> ds = load_dataset("rotten_tomatoes") >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features - {'label': 
ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='large_string', id=None)} ``` """ @@ -307,14 +307,14 @@ def cast_column(self, column: str, feature) -> "DatasetDict": Example: ```py - >>> from datasets import load_dataset + >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("rotten_tomatoes") >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='string', id=None)} ``` """ @@ -2201,14 +2201,14 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict Example: ```py - >>> from datasets import load_dataset + >>> from datasets import load_dataset, ClassLabel >>> ds = load_dataset("rotten_tomatoes", streaming=True) >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good'])) >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='string', id=None)} ``` """ @@ -2240,14 +2240,14 @@ def cast( >>> from datasets import load_dataset >>> ds = load_dataset("rotten_tomatoes", streaming=True) >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> new_features = ds["train"].features.copy() >>> new_features['label'] = 
ClassLabel(names=['bad', 'good']) >>> new_features['text'] = Value('large_string') >>> ds = ds.cast(new_features) >>> ds["train"].features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='large_string', id=None)} ``` """ diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py index aac6bff343c..ec7dc2a548c 100644 --- a/src/datasets/features/features.py +++ b/src/datasets/features/features.py @@ -966,10 +966,10 @@ class ClassLabel: Example: ```py - >>> from datasets import Features + >>> from datasets import Features, ClassLabel >>> features = Features({'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])}) >>> features - {'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'], id=None)} + {'label': ClassLabel(names=['bad', 'ok', 'good'], id=None)} ``` """ @@ -1156,7 +1156,7 @@ class Sequence: >>> from datasets import Features, Sequence, Value, ClassLabel >>> features = Features({'post': Sequence(feature={'text': Value(dtype='string'), 'upvotes': Value(dtype='int32'), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'])})}) >>> features - {'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'], id=None)}, length=-1, id=None)} + {'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(names=['hot', 'cold'], id=None)}, length=-1, id=None)} ``` """ @@ -2110,7 +2110,7 @@ def copy(self) -> "Features": >>> ds = load_dataset("rotten_tomatoes", split="train") >>> copy_of_features = ds.features.copy() >>> copy_of_features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} ``` """ diff --git a/src/datasets/iterable_dataset.py 
b/src/datasets/iterable_dataset.py index 8d1cefa3e72..382b20915bf 100644 --- a/src/datasets/iterable_dataset.py +++ b/src/datasets/iterable_dataset.py @@ -2951,17 +2951,17 @@ def cast( Example: ```py - >>> from datasets import load_dataset + >>> from datasets import load_dataset, ClassLabel, Value >>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True) >>> ds.features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} >>> new_features = ds.features.copy() >>> new_features["label"] = ClassLabel(names=["bad", "good"]) >>> new_features["text"] = Value("large_string") >>> ds = ds.cast(new_features) >>> ds.features - {'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None), + {'label': ClassLabel(names=['bad', 'good'], id=None), 'text': Value(dtype='large_string', id=None)} ``` """ diff --git a/src/datasets/load.py b/src/datasets/load.py index 2f516253db7..7c94b33046d 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1835,7 +1835,7 @@ def load_dataset_builder( >>> from datasets import load_dataset_builder >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes') >>> ds_builder.info.features - {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None), + {'label': ClassLabel(names=['neg', 'pos'], id=None), 'text': Value(dtype='string', id=None)} ``` """