diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 23d379c645c..ca3f1f12573 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1047,6 +1047,7 @@ def from_generator(
         gen_kwargs: Optional[dict] = None,
         num_proc: Optional[int] = None,
         split: NamedSplit = Split.TRAIN,
+        fingerprint: Optional[str] = None,
         **kwargs,
     ):
         """Create a Dataset from a generator.
@@ -1073,6 +1074,11 @@ def from_generator(

                 Split name to be assigned to the dataset.

+            fingerprint (`str`, *optional*):
+                Fingerprint that will be used to generate the dataset ID.
+                By default, `fingerprint` is generated by hashing all the arguments, which can be slow for large datasets.
+
+
             **kwargs (additional keyword arguments):
                 Keyword arguments to be passed to :[`GeneratorConfig`].

@@ -1110,6 +1116,7 @@ def from_generator(
             gen_kwargs=gen_kwargs,
             num_proc=num_proc,
             split=split,
+            fingerprint=fingerprint,
             **kwargs,
         ).read()

diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index d6992b9e19d..d312118963d 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -141,6 +141,7 @@ def create_config_id(
         self,
         config_kwargs: dict,
         custom_features: Optional[Features] = None,
+        fingerprint: Optional[str] = None,
     ) -> str:
         """
         The config id is used to build the cache directory.
@@ -155,43 +156,47 @@ def create_config_id(
         """
         # Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
         suffix: Optional[str] = None
-        config_kwargs_to_add_to_suffix = config_kwargs.copy()
-        # name and version are already used to build the cache directory
-        config_kwargs_to_add_to_suffix.pop("name", None)
-        config_kwargs_to_add_to_suffix.pop("version", None)
-        # data dir handling (when specified it points to the manually downloaded data):
-        # it was previously ignored before the introduction of config id because we didn't want
-        # to change the config name. Now it's fine to take it into account for the config id.
-        # config_kwargs_to_add_to_suffix.pop("data_dir", None)
-        if "data_dir" in config_kwargs_to_add_to_suffix:
-            if config_kwargs_to_add_to_suffix["data_dir"] is None:
-                config_kwargs_to_add_to_suffix.pop("data_dir", None)
-            else:
-                # canonicalize the data dir to avoid two paths to the same location having different
-                # hashes
-                data_dir = config_kwargs_to_add_to_suffix["data_dir"]
-                data_dir = os.path.normpath(data_dir)
-                config_kwargs_to_add_to_suffix["data_dir"] = data_dir
-        if config_kwargs_to_add_to_suffix:
-            # we don't care about the order of the kwargs
-            config_kwargs_to_add_to_suffix = {
-                k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
-            }
-            if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
-                suffix = ",".join(
-                    str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
-                )
-                if len(suffix) > 32:  # hash if too long
+
+        if fingerprint is not None:
+            suffix = fingerprint
+        else:
+            config_kwargs_to_add_to_suffix = config_kwargs.copy()
+            # name and version are already used to build the cache directory
+            config_kwargs_to_add_to_suffix.pop("name", None)
+            config_kwargs_to_add_to_suffix.pop("version", None)
+            # data dir handling (when specified it points to the manually downloaded data):
+            # it was previously ignored before the introduction of config id because we didn't want
+            # to change the config name. Now it's fine to take it into account for the config id.
+            # config_kwargs_to_add_to_suffix.pop("data_dir", None)
+            if "data_dir" in config_kwargs_to_add_to_suffix:
+                if config_kwargs_to_add_to_suffix["data_dir"] is None:
+                    config_kwargs_to_add_to_suffix.pop("data_dir", None)
+                else:
+                    # canonicalize the data dir to avoid two paths to the same location having different
+                    # hashes
+                    data_dir = config_kwargs_to_add_to_suffix["data_dir"]
+                    data_dir = os.path.normpath(data_dir)
+                    config_kwargs_to_add_to_suffix["data_dir"] = data_dir
+            if config_kwargs_to_add_to_suffix:
+                # we don't care about the order of the kwargs
+                config_kwargs_to_add_to_suffix = {
+                    k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
+                }
+                if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
+                    suffix = ",".join(
+                        str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
+                    )
+                    if len(suffix) > 32:  # hash if too long
+                        suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
+                else:
                     suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
-            else:
-                suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
-        if custom_features is not None:
-            m = Hasher()
-            if suffix:
-                m.update(suffix)
-            m.update(custom_features)
-            suffix = m.hexdigest()
+            if custom_features is not None:
+                m = Hasher()
+                if suffix:
+                    m.update(suffix)
+                m.update(custom_features)
+                suffix = m.hexdigest()

         if suffix:
             config_id = self.name + "-" + suffix
@@ -313,6 +318,7 @@ def __init__(
         data_dir: Optional[str] = None,
         storage_options: Optional[dict] = None,
         writer_batch_size: Optional[int] = None,
+        fingerprint: Optional[str] = None,
         **config_kwargs,
     ):
         # DatasetBuilder name
@@ -343,6 +349,7 @@ def __init__(
         self.config, self.config_id = self._create_builder_config(
             config_name=config_name,
             custom_features=features,
+            fingerprint=fingerprint,
             **config_kwargs,
         )

@@ -533,7 +540,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
         return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())

     def _create_builder_config(
-        self, config_name=None, custom_features=None, **config_kwargs
+        self, config_name=None, custom_features=None, fingerprint=None, **config_kwargs
     ) -> tuple[BuilderConfig, str]:
         """Create and validate BuilderConfig object as well as a unique config id for this config.
         Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -604,6 +611,7 @@ def _create_builder_config(
         config_id = builder_config.create_config_id(
             config_kwargs,
             custom_features=custom_features,
+            fingerprint=fingerprint,
         )
         is_custom = (config_id not in self.builder_configs) and config_id != "default"
         if is_custom:
diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py
index b10609cac23..a27552a64cc 100644
--- a/src/datasets/io/generator.py
+++ b/src/datasets/io/generator.py
@@ -16,6 +16,7 @@ def __init__(
         gen_kwargs: Optional[dict] = None,
         num_proc: Optional[int] = None,
         split: NamedSplit = Split.TRAIN,
+        fingerprint: Optional[str] = None,
         **kwargs,
     ):
         super().__init__(
@@ -32,6 +33,7 @@ def __init__(
             generator=generator,
             gen_kwargs=gen_kwargs,
             split=split,
+            fingerprint=fingerprint,
             **kwargs,
         )

diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py
index dac3f0d92df..e7fa385f01f 100644
--- a/src/datasets/packaged_modules/generator/generator.py
+++ b/src/datasets/packaged_modules/generator/generator.py
@@ -10,6 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig):
     gen_kwargs: Optional[dict] = None
     features: Optional[datasets.Features] = None
     split: datasets.NamedSplit = datasets.Split.TRAIN
+    fingerprint: Optional[str] = None

     def __post_init__(self):
         super().__post_init__()
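A minimal usage sketch of the new parameter (the generator and the `shards` list are illustrative, not part of the patch): passing an explicit `fingerprint` lets the caller avoid hashing `gen_kwargs`, since the supplied value is used directly as the config-id suffix that determines the cache directory.

```python
from datasets import Dataset


# Illustrative generator whose kwargs would be expensive to hash
# (e.g. a large list of in-memory shards).
def gen(shards):
    for shard in shards:
        yield {"text": shard}


shards = [f"row-{i}" for i in range(1000)]

# Without `fingerprint`, the builder hashes all kwargs (including `gen_kwargs`)
# to derive the config id. With this patch, the given fingerprint is used as the
# suffix instead, so the large `shards` list does not need to be hashed.
ds = Dataset.from_generator(
    gen,
    gen_kwargs={"shards": shards},
    fingerprint="my-dataset-v1",
)
```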
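And a simplified sketch of the new control flow in `BuilderConfig.create_config_id` (not the full implementation; the data-dir canonicalization, short-suffix, and `custom_features` branches are omitted), showing that a caller-supplied fingerprint short-circuits the kwargs hashing:

```python
from typing import Optional

from datasets.fingerprint import Hasher


def sketch_create_config_id(name: str, config_kwargs: dict, fingerprint: Optional[str] = None) -> str:
    """Simplified view of BuilderConfig.create_config_id after this patch."""
    if fingerprint is not None:
        suffix = fingerprint  # caller-supplied: the kwargs are not hashed
    else:
        suffix = Hasher.hash(config_kwargs)  # fallback: hash the kwargs (slow for large values)
    return f"{name}-{suffix}" if suffix else name


print(sketch_create_config_id("generator", {}, fingerprint="my-dataset-v1"))
# generator-my-dataset-v1
```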