From 07df063eea05cec6ab811ba6e69f2254d4bc26b9 Mon Sep 17 00:00:00 2001 From: Mikhail Moskovchenko Date: Wed, 23 Apr 2025 23:21:43 +0400 Subject: [PATCH 1/2] Add custom suffix support to from_generator --- src/datasets/arrow_dataset.py | 7 ++ src/datasets/builder.py | 75 ++++++++++--------- src/datasets/io/generator.py | 2 + .../packaged_modules/generator/generator.py | 1 + 4 files changed, 49 insertions(+), 36 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index 23d379c645c..c6e4bf29dae 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1047,6 +1047,7 @@ def from_generator( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + dataset_id_suffix: Optional[str] = None, **kwargs, ): """Create a Dataset from a generator. @@ -1073,6 +1074,11 @@ def from_generator( Split name to be assigned to the dataset. + dataset_id_suffix (`str`, *optional*): + Suffix that will be used to generate dataset ID. + By default `dataset_id_suffix` is generated by hashing all the args which can be slow in case of a large dataset. + + **kwargs (additional keyword arguments): Keyword arguments to be passed to :[`GeneratorConfig`]. @@ -1110,6 +1116,7 @@ def from_generator( gen_kwargs=gen_kwargs, num_proc=num_proc, split=split, + dataset_id_suffix=dataset_id_suffix, **kwargs, ).read() diff --git a/src/datasets/builder.py b/src/datasets/builder.py index d6992b9e19d..c28117f934e 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -155,43 +155,46 @@ def create_config_id( """ # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs suffix: Optional[str] = None - config_kwargs_to_add_to_suffix = config_kwargs.copy() - # name and version are already used to build the cache directory - config_kwargs_to_add_to_suffix.pop("name", None) - config_kwargs_to_add_to_suffix.pop("version", None) - # data dir handling (when specified it points to the manually downloaded data): - # it was previously ignored before the introduction of config id because we didn't want - # to change the config name. Now it's fine to take it into account for the config id. - # config_kwargs_to_add_to_suffix.pop("data_dir", None) - if "data_dir" in config_kwargs_to_add_to_suffix: - if config_kwargs_to_add_to_suffix["data_dir"] is None: - config_kwargs_to_add_to_suffix.pop("data_dir", None) - else: - # canonicalize the data dir to avoid two paths to the same location having different - # hashes - data_dir = config_kwargs_to_add_to_suffix["data_dir"] - data_dir = os.path.normpath(data_dir) - config_kwargs_to_add_to_suffix["data_dir"] = data_dir - if config_kwargs_to_add_to_suffix: - # we don't care about the order of the kwargs - config_kwargs_to_add_to_suffix = { - k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix) - } - if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()): - suffix = ",".join( - str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items() - ) - if len(suffix) > 32: # hash if too long + if "dataset_id_suffix" in config_kwargs and config_kwargs["dataset_id_suffix"] is not None: + suffix = config_kwargs["dataset_id_suffix"] + else: + config_kwargs_to_add_to_suffix = config_kwargs.copy() + # name and version are already used to build the cache directory + config_kwargs_to_add_to_suffix.pop("name", None) + config_kwargs_to_add_to_suffix.pop("version", None) + # data dir handling (when specified it points to the manually downloaded data): + # it was previously ignored before the introduction of config id because we didn't want + # to change the config name. Now it's fine to take it into account for the config id. + # config_kwargs_to_add_to_suffix.pop("data_dir", None) + if "data_dir" in config_kwargs_to_add_to_suffix: + if config_kwargs_to_add_to_suffix["data_dir"] is None: + config_kwargs_to_add_to_suffix.pop("data_dir", None) + else: + # canonicalize the data dir to avoid two paths to the same location having different + # hashes + data_dir = config_kwargs_to_add_to_suffix["data_dir"] + data_dir = os.path.normpath(data_dir) + config_kwargs_to_add_to_suffix["data_dir"] = data_dir + if config_kwargs_to_add_to_suffix: + # we don't care about the order of the kwargs + config_kwargs_to_add_to_suffix = { + k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix) + } + if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()): + suffix = ",".join( + str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items() + ) + if len(suffix) > 32: # hash if too long + suffix = Hasher.hash(config_kwargs_to_add_to_suffix) + else: suffix = Hasher.hash(config_kwargs_to_add_to_suffix) - else: - suffix = Hasher.hash(config_kwargs_to_add_to_suffix) - - if custom_features is not None: - m = Hasher() - if suffix: - m.update(suffix) - m.update(custom_features) - suffix = m.hexdigest() + + if custom_features is not None: + m = Hasher() + if suffix: + m.update(suffix) + m.update(custom_features) + suffix = m.hexdigest() if suffix: config_id = self.name + "-" + suffix diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py index b10609cac23..48dd1df2a02 100644 --- a/src/datasets/io/generator.py +++ b/src/datasets/io/generator.py @@ -16,6 +16,7 @@ def __init__( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, + dataset_id_suffix: Optional[str] = None, **kwargs, ): super().__init__( @@ -32,6 +33,7 @@ def __init__( generator=generator, gen_kwargs=gen_kwargs, split=split, + dataset_id_suffix=dataset_id_suffix, **kwargs, ) diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py index dac3f0d92df..3fc431b5f50 100644 --- a/src/datasets/packaged_modules/generator/generator.py +++ b/src/datasets/packaged_modules/generator/generator.py @@ -10,6 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig): gen_kwargs: Optional[dict] = None features: Optional[datasets.Features] = None split: datasets.NamedSplit = datasets.Split.TRAIN + dataset_id_suffix: Optional[str] = None def __post_init__(self): super().__post_init__() From cafae0933fb7c87af63d91e2aacd504e388b9f8b Mon Sep 17 00:00:00 2001 From: Mikhail Moskovchenko Date: Sun, 4 May 2025 22:55:09 +0400 Subject: [PATCH 2/2] Renamed a new arg to fingerprint --- src/datasets/arrow_dataset.py | 10 +++++----- src/datasets/builder.py | 11 ++++++++--- src/datasets/io/generator.py | 4 ++-- src/datasets/packaged_modules/generator/generator.py | 2 +- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py index c6e4bf29dae..ca3f1f12573 100644 --- a/src/datasets/arrow_dataset.py +++ b/src/datasets/arrow_dataset.py @@ -1047,7 +1047,7 @@ def from_generator( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, - dataset_id_suffix: Optional[str] = None, + fingerprint: Optional[str] = None, **kwargs, ): """Create a Dataset from a generator. @@ -1074,9 +1074,9 @@ def from_generator( Split name to be assigned to the dataset. - dataset_id_suffix (`str`, *optional*): - Suffix that will be used to generate dataset ID. - By default `dataset_id_suffix` is generated by hashing all the args which can be slow in case of a large dataset. + fingerprint (`str`, *optional*): + Fingerprint that will be used to generate dataset ID. + By default `fingerprint` is generated by hashing all the args which can be slow in case of a large dataset. **kwargs (additional keyword arguments): @@ -1116,7 +1116,7 @@ def from_generator( gen_kwargs=gen_kwargs, num_proc=num_proc, split=split, - dataset_id_suffix=dataset_id_suffix, + fingerprint=fingerprint, **kwargs, ).read() diff --git a/src/datasets/builder.py b/src/datasets/builder.py index c28117f934e..d312118963d 100644 --- a/src/datasets/builder.py +++ b/src/datasets/builder.py @@ -141,6 +141,7 @@ def create_config_id( self, config_kwargs: dict, custom_features: Optional[Features] = None, + fingerprint: Optional[str] = None, ) -> str: """ The config id is used to build the cache directory. @@ -155,8 +156,9 @@ def create_config_id( """ # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs suffix: Optional[str] = None - if "dataset_id_suffix" in config_kwargs and config_kwargs["dataset_id_suffix"] is not None: - suffix = config_kwargs["dataset_id_suffix"] + + if fingerprint is not None: + suffix = fingerprint else: config_kwargs_to_add_to_suffix = config_kwargs.copy() # name and version are already used to build the cache directory @@ -316,6 +318,7 @@ def __init__( data_dir: Optional[str] = None, storage_options: Optional[dict] = None, writer_batch_size: Optional[int] = None, + fingerprint: Optional[str] = None, **config_kwargs, ): # DatasetBuilder name @@ -346,6 +349,7 @@ def __init__( self.config, self.config_id = self._create_builder_config( config_name=config_name, custom_features=features, + fingerprint=fingerprint, **config_kwargs, ) @@ -536,7 +540,7 @@ def get_exported_dataset_info(self) -> DatasetInfo: return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo()) def _create_builder_config( - self, config_name=None, custom_features=None, **config_kwargs + self, config_name=None, custom_features=None, fingerprint=None, **config_kwargs ) -> tuple[BuilderConfig, str]: """Create and validate BuilderConfig object as well as a unique config id for this config. Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None. @@ -607,6 +611,7 @@ def _create_builder_config( config_id = builder_config.create_config_id( config_kwargs, custom_features=custom_features, + fingerprint=fingerprint, ) is_custom = (config_id not in self.builder_configs) and config_id != "default" if is_custom: diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py index 48dd1df2a02..a27552a64cc 100644 --- a/src/datasets/io/generator.py +++ b/src/datasets/io/generator.py @@ -16,7 +16,7 @@ def __init__( gen_kwargs: Optional[dict] = None, num_proc: Optional[int] = None, split: NamedSplit = Split.TRAIN, - dataset_id_suffix: Optional[str] = None, + fingerprint: Optional[str] = None, **kwargs, ): super().__init__( @@ -33,7 +33,7 @@ def __init__( generator=generator, gen_kwargs=gen_kwargs, split=split, - dataset_id_suffix=dataset_id_suffix, + fingerprint=fingerprint, **kwargs, ) diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py index 3fc431b5f50..e7fa385f01f 100644 --- a/src/datasets/packaged_modules/generator/generator.py +++ b/src/datasets/packaged_modules/generator/generator.py @@ -10,7 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig): gen_kwargs: Optional[dict] = None features: Optional[datasets.Features] = None split: datasets.NamedSplit = datasets.Split.TRAIN - dataset_id_suffix: Optional[str] = None + fingerprint: Optional[str] = None def __post_init__(self): super().__post_init__()