Skip to content

Commit

Permalink
Updated inconsistent output in documentation examples for ClassLabel (
Browse files Browse the repository at this point in the history
#7293)

* Updated ClassLabel output in features.py

* Updated ClassLabel output in load.py

* Updated ClassLabel output in iterable_daaset.py

* Updated ClassLabel output in dataset_dict.py

* Updated ClassLabel output in builder.py

* Updated ClassLabel output in arrow_dataset.py

* Updated docs

* Added missing comma

* Updated python code

* Update src/datasets/builder.py

Co-authored-by: Steven Liu <[email protected]>

---------

Co-authored-by: Steven Liu <[email protected]>
  • Loading branch information
sergiopaniego and stevhliu authored Dec 6, 2024
1 parent c9d3450 commit b60ebb8
Show file tree
Hide file tree
Showing 11 changed files with 34 additions and 34 deletions.
2 changes: 1 addition & 1 deletion docs/source/about_dataset_features.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Let's have a look at the features of the MRPC dataset from the GLUE benchmark:
>>> dataset = load_dataset('glue', 'mrpc', split='train')
>>> dataset.features
{'idx': Value(dtype='int32', id=None),
'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
}
Expand Down
2 changes: 1 addition & 1 deletion docs/source/load_hub.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Movie Review Dataset. This is a dataset of containing 5,331 positive and 5,331 n

# Inspect dataset features
>>> ds_builder.info.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
```

Expand Down
2 changes: 1 addition & 1 deletion docs/source/loading.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ Now when you look at your dataset features, you can see it uses the custom label
```py
>>> dataset['train'].features
{'text': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], names_file=None, id=None)}
'label': ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)}
```

## (Legacy) Local loading script
Expand Down
4 changes: 2 additions & 2 deletions docs/source/process.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column
>>> dataset.features
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
'idx': Value(dtype='int32', id=None)}

>>> from datasets import ClassLabel, Value
Expand All @@ -236,7 +236,7 @@ The [`~Dataset.cast`] function transforms the feature type of one or more column
>>> dataset.features
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None),
'label': ClassLabel(names=['negative', 'positive'], id=None),
'idx': Value(dtype='int64', id=None)}
```

Expand Down
4 changes: 2 additions & 2 deletions docs/source/stream.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum
>>> dataset.features
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=2, names=['not_equivalent', 'equivalent'], names_file=None, id=None),
'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
'idx': Value(dtype='int32', id=None)}

>>> from datasets import ClassLabel, Value
Expand All @@ -240,7 +240,7 @@ When you need to remove one or more columns, give [`IterableDataset.remove_colum
>>> dataset.features
{'sentence1': Value(dtype='string', id=None),
'sentence2': Value(dtype='string', id=None),
'label': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None),
'label': ClassLabel(names=['negative', 'positive'], id=None),
'idx': Value(dtype='int64', id=None)}
```

Expand Down
10 changes: 5 additions & 5 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2023,14 +2023,14 @@ def cast(
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> new_features = ds.features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
```
"""
Expand Down Expand Up @@ -2078,14 +2078,14 @@ def cast_column(self, column: str, feature: FeatureType, new_fingerprint: Option
Example:
```py
>>> from datasets import load_dataset
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("rotten_tomatoes", split="validation")
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,9 +510,9 @@ def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
```py
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('rotten_tomatoes')
>>> ds_builder = load_dataset_builder('vivos')
>>> ds_builder.get_all_exported_dataset_infos()
{'default': DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.\n", citation='@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}\n', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231)}
{'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
```
"""
return DatasetInfosDict.from_directory(cls.get_imported_module_dir())
Expand All @@ -526,7 +526,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('rotten_tomatoes')
>>> ds_builder.get_exported_dataset_info()
DatasetInfo(description="Movie Review Dataset.\nThis is a dataset of containing 5,331 positive and 5,331 negative processed\nsentences from Rotten Tomatoes movie reviews. This data was first used in Bo\nPang and Lillian Lee, ``Seeing stars: Exploiting class relationships for\nsentiment categorization with respect to rating scales.'', Proceedings of the\nACL, 2005.\n", citation='@InProceedings{Pang+Lee:05a,\n author = {Bo Pang and Lillian Lee},\n title = {Seeing stars: Exploiting class relationships for sentiment\n categorization with respect to rating scales},\n booktitle = {Proceedings of the ACL},\n year = 2005\n}\n', homepage='http://www.cs.cornell.edu/people/pabo/movie-review-data/', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=SupervisedKeysData(input='', output=''), builder_name='rotten_tomatoes_movie_review', config_name='default', version=1.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, dataset_name='rotten_tomatoes_movie_review'), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, dataset_name='rotten_tomatoes_movie_review'), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, dataset_name='rotten_tomatoes_movie_review')}, download_checksums={'https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz': {'num_bytes': 487770, 'checksum': 'a05befe52aafda71d458d188a1c54506a998b1308613ba76bbda2e5029409ce9'}}, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=1833231)
DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
```
"""
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
Expand Down
22 changes: 11 additions & 11 deletions src/datasets/dataset_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,17 +275,17 @@ def cast(self, features: Features) -> "DatasetDict":
Example:
```py
>>> from datasets import load_dataset
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
```
"""
Expand All @@ -307,14 +307,14 @@ def cast_column(self, column: str, feature) -> "DatasetDict":
Example:
```py
>>> from datasets import load_dataset
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("rotten_tomatoes")
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
Expand Down Expand Up @@ -2201,14 +2201,14 @@ def cast_column(self, column: str, feature: FeatureType) -> "IterableDatasetDict
Example:
```py
>>> from datasets import load_dataset
>>> from datasets import load_dataset, ClassLabel
>>> ds = load_dataset("rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> ds = ds.cast_column('label', ClassLabel(names=['bad', 'good']))
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
Expand Down Expand Up @@ -2240,14 +2240,14 @@ def cast(
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", streaming=True)
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> new_features = ds["train"].features.copy()
>>> new_features['label'] = ClassLabel(names=['bad', 'good'])
>>> new_features['text'] = Value('large_string')
>>> ds = ds.cast(new_features)
>>> ds["train"].features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
```
"""
Expand Down
8 changes: 4 additions & 4 deletions src/datasets/features/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -966,10 +966,10 @@ class ClassLabel:
Example:
```py
>>> from datasets import Features
>>> from datasets import Features, ClassLabel
>>> features = Features({'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'])})
>>> features
{'label': ClassLabel(num_classes=3, names=['bad', 'ok', 'good'], id=None)}
{'label': ClassLabel(names=['bad', 'ok', 'good'], id=None)}
```
"""

Expand Down Expand Up @@ -1156,7 +1156,7 @@ class Sequence:
>>> from datasets import Features, Sequence, Value, ClassLabel
>>> features = Features({'post': Sequence(feature={'text': Value(dtype='string'), 'upvotes': Value(dtype='int32'), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'])})})
>>> features
{'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(num_classes=2, names=['hot', 'cold'], id=None)}, length=-1, id=None)}
{'post': Sequence(feature={'text': Value(dtype='string', id=None), 'upvotes': Value(dtype='int32', id=None), 'label': ClassLabel(names=['hot', 'cold'], id=None)}, length=-1, id=None)}
```
"""

Expand Down Expand Up @@ -2110,7 +2110,7 @@ def copy(self) -> "Features":
>>> ds = load_dataset("rotten_tomatoes", split="train")
>>> copy_of_features = ds.features.copy()
>>> copy_of_features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
Expand Down
6 changes: 3 additions & 3 deletions src/datasets/iterable_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2951,17 +2951,17 @@ def cast(
Example:
```py
>>> from datasets import load_dataset
>>> from datasets import load_dataset, ClassLabel, Value
>>> ds = load_dataset("rotten_tomatoes", split="train", streaming=True)
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
>>> new_features = ds.features.copy()
>>> new_features["label"] = ClassLabel(names=["bad", "good"])
>>> new_features["text"] = Value("large_string")
>>> ds = ds.cast(new_features)
>>> ds.features
{'label': ClassLabel(num_classes=2, names=['bad', 'good'], id=None),
{'label': ClassLabel(names=['bad', 'good'], id=None),
'text': Value(dtype='large_string', id=None)}
```
"""
Expand Down
2 changes: 1 addition & 1 deletion src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -1835,7 +1835,7 @@ def load_dataset_builder(
>>> from datasets import load_dataset_builder
>>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
>>> ds_builder.info.features
{'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
{'label': ClassLabel(names=['neg', 'pos'], id=None),
'text': Value(dtype='string', id=None)}
```
"""
Expand Down

0 comments on commit b60ebb8

Please sign in to comment.