diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 23d379c645c..ca3f1f12573 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -1047,6 +1047,7 @@ def from_generator(
gen_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
split: NamedSplit = Split.TRAIN,
+ fingerprint: Optional[str] = None,
**kwargs,
):
"""Create a Dataset from a generator.
@@ -1073,6 +1074,11 @@ def from_generator(
Split name to be assigned to the dataset.
+ fingerprint (`str`, *optional*):
+                Fingerprint that will be used to generate the dataset ID.
+                By default, `fingerprint` is generated by hashing all the arguments, which can be slow when they reference large objects (e.g. large `gen_kwargs`).
+
+
**kwargs (additional keyword arguments):
Keyword arguments to be passed to :[`GeneratorConfig`].
@@ -1110,6 +1116,7 @@ def from_generator(
gen_kwargs=gen_kwargs,
num_proc=num_proc,
split=split,
+ fingerprint=fingerprint,
**kwargs,
).read()
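For reference, a minimal usage sketch of the new parameter (the generator and the fingerprint value below are illustrative, not part of the patch):

    from datasets import Dataset

    def gen():
        # toy generator; in practice gen_kwargs may reference large objects
        for i in range(10):
            yield {"text": f"example {i}"}

    # Passing a precomputed fingerprint skips hashing the generator and its
    # kwargs when deriving the dataset ID / cache entry.
    ds = Dataset.from_generator(gen, fingerprint="my-dataset-v1")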
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index d6992b9e19d..d312118963d 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -141,6 +141,7 @@ def create_config_id(
self,
config_kwargs: dict,
custom_features: Optional[Features] = None,
+ fingerprint: Optional[str] = None,
) -> str:
"""
The config id is used to build the cache directory.
@@ -155,43 +156,47 @@ def create_config_id(
"""
# Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
suffix: Optional[str] = None
- config_kwargs_to_add_to_suffix = config_kwargs.copy()
- # name and version are already used to build the cache directory
- config_kwargs_to_add_to_suffix.pop("name", None)
- config_kwargs_to_add_to_suffix.pop("version", None)
- # data dir handling (when specified it points to the manually downloaded data):
- # it was previously ignored before the introduction of config id because we didn't want
- # to change the config name. Now it's fine to take it into account for the config id.
- # config_kwargs_to_add_to_suffix.pop("data_dir", None)
- if "data_dir" in config_kwargs_to_add_to_suffix:
- if config_kwargs_to_add_to_suffix["data_dir"] is None:
- config_kwargs_to_add_to_suffix.pop("data_dir", None)
- else:
- # canonicalize the data dir to avoid two paths to the same location having different
- # hashes
- data_dir = config_kwargs_to_add_to_suffix["data_dir"]
- data_dir = os.path.normpath(data_dir)
- config_kwargs_to_add_to_suffix["data_dir"] = data_dir
- if config_kwargs_to_add_to_suffix:
- # we don't care about the order of the kwargs
- config_kwargs_to_add_to_suffix = {
- k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
- }
- if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
- suffix = ",".join(
- str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
- )
- if len(suffix) > 32: # hash if too long
+
+ if fingerprint is not None:
+ suffix = fingerprint
+ else:
+ config_kwargs_to_add_to_suffix = config_kwargs.copy()
+ # name and version are already used to build the cache directory
+ config_kwargs_to_add_to_suffix.pop("name", None)
+ config_kwargs_to_add_to_suffix.pop("version", None)
+ # data dir handling (when specified it points to the manually downloaded data):
+ # it was previously ignored before the introduction of config id because we didn't want
+ # to change the config name. Now it's fine to take it into account for the config id.
+ # config_kwargs_to_add_to_suffix.pop("data_dir", None)
+ if "data_dir" in config_kwargs_to_add_to_suffix:
+ if config_kwargs_to_add_to_suffix["data_dir"] is None:
+ config_kwargs_to_add_to_suffix.pop("data_dir", None)
+ else:
+ # canonicalize the data dir to avoid two paths to the same location having different
+ # hashes
+ data_dir = config_kwargs_to_add_to_suffix["data_dir"]
+ data_dir = os.path.normpath(data_dir)
+ config_kwargs_to_add_to_suffix["data_dir"] = data_dir
+ if config_kwargs_to_add_to_suffix:
+ # we don't care about the order of the kwargs
+ config_kwargs_to_add_to_suffix = {
+ k: config_kwargs_to_add_to_suffix[k] for k in sorted(config_kwargs_to_add_to_suffix)
+ }
+ if all(isinstance(v, (str, bool, int, float)) for v in config_kwargs_to_add_to_suffix.values()):
+ suffix = ",".join(
+ str(k) + "=" + urllib.parse.quote_plus(str(v)) for k, v in config_kwargs_to_add_to_suffix.items()
+ )
+ if len(suffix) > 32: # hash if too long
+ suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
+ else:
suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
- else:
- suffix = Hasher.hash(config_kwargs_to_add_to_suffix)
- if custom_features is not None:
- m = Hasher()
- if suffix:
- m.update(suffix)
- m.update(custom_features)
- suffix = m.hexdigest()
+ if custom_features is not None:
+ m = Hasher()
+ if suffix:
+ m.update(suffix)
+ m.update(custom_features)
+ suffix = m.hexdigest()
if suffix:
config_id = self.name + "-" + suffix
@@ -313,6 +318,7 @@ def __init__(
data_dir: Optional[str] = None,
storage_options: Optional[dict] = None,
writer_batch_size: Optional[int] = None,
+ fingerprint: Optional[str] = None,
**config_kwargs,
):
# DatasetBuilder name
@@ -343,6 +349,7 @@ def __init__(
self.config, self.config_id = self._create_builder_config(
config_name=config_name,
custom_features=features,
+ fingerprint=fingerprint,
**config_kwargs,
)
@@ -533,7 +540,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
def _create_builder_config(
- self, config_name=None, custom_features=None, **config_kwargs
+ self, config_name=None, custom_features=None, fingerprint=None, **config_kwargs
) -> tuple[BuilderConfig, str]:
"""Create and validate BuilderConfig object as well as a unique config id for this config.
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -604,6 +611,7 @@ def _create_builder_config(
config_id = builder_config.create_config_id(
config_kwargs,
custom_features=custom_features,
+ fingerprint=fingerprint,
)
is_custom = (config_id not in self.builder_configs) and config_id != "default"
if is_custom:
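The suffix selection above reduces to the following simplified sketch (hashlib stands in for datasets' Hasher here, and the data_dir/custom_features handling is omitted):

    import hashlib
    import json

    def config_suffix_sketch(config_kwargs: dict, fingerprint=None) -> str:
        # A supplied fingerprint is used verbatim as the config-id suffix;
        # otherwise the kwargs are serialized and hashed, as before.
        if fingerprint is not None:
            return fingerprint
        payload = json.dumps(config_kwargs, sort_keys=True, default=str)
        return hashlib.sha256(payload.encode()).hexdigest()[:16]

    print(config_suffix_sketch({"gen_kwargs": {"shards": 4}}, fingerprint="abc123"))  # abc123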
diff --git a/src/datasets/io/generator.py b/src/datasets/io/generator.py
index b10609cac23..a27552a64cc 100644
--- a/src/datasets/io/generator.py
+++ b/src/datasets/io/generator.py
@@ -16,6 +16,7 @@ def __init__(
gen_kwargs: Optional[dict] = None,
num_proc: Optional[int] = None,
split: NamedSplit = Split.TRAIN,
+ fingerprint: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -32,6 +33,7 @@ def __init__(
generator=generator,
gen_kwargs=gen_kwargs,
split=split,
+ fingerprint=fingerprint,
**kwargs,
)
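The reader change is a pure pass-through; roughly, and only to trace the parameter's path (GeneratorDatasetInputStream is internal API, values are illustrative):

    from datasets.io.generator import GeneratorDatasetInputStream

    def gen():
        yield {"id": 0}

    # `fingerprint` is forwarded to the Generator builder, lands in
    # GeneratorConfig, and is picked up by create_config_id as the suffix.
    ds = GeneratorDatasetInputStream(generator=gen, fingerprint="abc123").read()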
diff --git a/src/datasets/packaged_modules/generator/generator.py b/src/datasets/packaged_modules/generator/generator.py
index dac3f0d92df..e7fa385f01f 100644
--- a/src/datasets/packaged_modules/generator/generator.py
+++ b/src/datasets/packaged_modules/generator/generator.py
@@ -10,6 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig):
gen_kwargs: Optional[dict] = None
features: Optional[datasets.Features] = None
split: datasets.NamedSplit = datasets.Split.TRAIN
+ fingerprint: Optional[str] = None
def __post_init__(self):
super().__post_init__()
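End to end, the intended effect is that two calls with the same explicit fingerprint resolve to the same config id, and therefore the same cache directory, without hashing potentially large gen_kwargs. A sketch with illustrative values:

    from datasets import Dataset

    def gen(shards):
        for shard in shards:
            yield {"shard": shard}

    big_kwargs = {"shards": list(range(100_000))}

    # Both calls share the fingerprint, so the second one should reuse the
    # cache entry written by the first instead of re-hashing big_kwargs.
    ds_a = Dataset.from_generator(gen, gen_kwargs=big_kwargs, fingerprint="shards-v1")
    ds_b = Dataset.from_generator(gen, gen_kwargs=big_kwargs, fingerprint="shards-v1")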