diff --git a/src/datasets/features/audio.py b/src/datasets/features/audio.py
index f7df47b7a06..172097ceea8 100644
--- a/src/datasets/features/audio.py
+++ b/src/datasets/features/audio.py
@@ -243,6 +243,70 @@ def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray]) -> pa.Str
                 path_array = pa.array([None] * len(storage), type=pa.string())
             storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
         return array_cast(storage, self.pa_type)
+
+    def embed_storage_offset(self, start, dur, storage) -> pa.StructArray:
+        """Embed audio files into the Arrow array, cropping rows that carry a segment.
+
+        Args:
+            start (`pa.Array`):
+                Segment start of each row, in seconds. A null or negative value
+                embeds the row untouched, exactly like `embed_storage`.
+            dur (`pa.Array`):
+                Segment duration of each row, in seconds. A null or negative value
+                reads until the end of the file.
+            storage (`pa.StructArray`):
+                PyArrow array to embed.
+
+        Returns:
+            `pa.StructArray`: Array in the Audio arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        from io import BytesIO
+
+        import soundfile as sf
+
+        def crop_to_wav(source, start_s, dur_s):
+            # Offsets are scaled by the file's own sample rate (not a fixed 16 kHz)
+            # and the segment is re-encoded as WAV: a raw sample buffer has no
+            # header, so it could not be decoded back into audio.
+            with sf.SoundFile(source) as snd:
+                rate = snd.samplerate
+                snd.seek(int(start_s * rate))
+                frames = int(dur_s * rate) if dur_s is not None and dur_s >= 0 else -1
+                samples = snd.read(frames=frames)
+            buffer = BytesIO()
+            sf.write(buffer, samples, rate, format="wav")
+            return buffer.getvalue()
+
+        def embed(row, start_s, dur_s):
+            # Null rows stay null instead of failing on row["bytes"].
+            if row is None:
+                return None
+            data, path = row["bytes"], row["path"]
+            if start_s is not None and start_s >= 0:
+                if path is not None:
+                    with xopen(path, "rb") as f:
+                        return crop_to_wav(f, start_s, dur_s)
+                return crop_to_wav(BytesIO(data), start_s, dur_s) if data is not None else None
+            if data is not None or path is None:
+                return data
+            with xopen(path, "rb") as f:
+                return f.read()
+
+        rows = storage.to_pylist()
+        bytes_array = pa.array(
+            [embed(row, s, d) for row, s, d in zip(rows, start.to_pylist(), dur.to_pylist())],
+            type=pa.binary(),
+        )
+        path_array = pa.array(
+            [
+                os.path.basename(row["path"]) if row is not None and row["path"] is not None else None
+                for row in rows
+            ],
+            type=pa.string(),
+        )
+        storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=bytes_array.is_null())
+        return array_cast(storage, self.pa_type)
 
     def embed_storage(self, storage: pa.StructArray) -> pa.StructArray:
         """Embed audio files into the Arrow array.
diff --git a/src/datasets/table.py b/src/datasets/table.py
index 5bd8d9b998f..d7effdd63f3 100644
--- a/src/datasets/table.py
+++ b/src/datasets/table.py
@@ -2107,6 +2107,33 @@ def cast_array_to_feature(
         )
     raise TypeError(f"Couldn't cast array of type\n{_short_str(array.type)}\nto\n{_short_str(feature)}")
 
+
+def embed_array_storage_audio(array_start: pa.Array, array_dur: pa.Array, array_wav: pa.Array, feature: "FeatureType"):
+    """Embed audio data into an array's storage, cropping each row to its segment.
+
+    Unlike `embed_array_storage`, this only supports features that define an
+    "embed_storage_offset" method (e.g. Audio).
+
+    Args:
+        array_start (`pa.Array`):
+            Segment start of each row, in seconds.
+        array_dur (`pa.Array`):
+            Segment duration of each row, in seconds.
+        array_wav (`pa.Array`):
+            The PyArrow array in which to embed data.
+        feature (`datasets.features.FeatureType`):
+            Array features.
+
+    Raises:
+        `TypeError`: if the feature does not define "embed_storage_offset".
+
+    Returns:
+        array (`pyarrow.Array`): the array with embedded data
+    """
+    if hasattr(feature, "embed_storage_offset"):
+        return feature.embed_storage_offset(array_start, array_dur, array_wav)
+    raise TypeError(f"Couldn't embed array of type\n{_short_str(array_wav.type)}\nwith\n{_short_str(feature)}")
+
 
 @_wrap_for_chunked_arrays
 def embed_array_storage(array: pa.Array, feature: "FeatureType"):
@@ -2265,10 +2292,18 @@ def embed_table_storage(table: pa.Table):
         table (`pyarrow.Table`): the table with embedded data
     """
+    from .features import Audio
     from .features.features import Features, require_storage_embed
 
     features = Features.from_arrow_schema(table.schema)
-    arrays = [
-        embed_array_storage(table[name], feature) if require_storage_embed(feature) else table[name]
-        for name, feature in features.items()
-    ]
+    # Segment cropping is opt-in: it needs both offset columns, otherwise Audio
+    # columns are embedded whole like any other feature.
+    has_segments = "start" in table.column_names and "duration" in table.column_names
+    arrays = []
+    for name, feature in features.items():
+        if not require_storage_embed(feature):
+            arrays.append(table[name])
+        elif has_segments and isinstance(feature, Audio):
+            arrays.append(embed_array_storage_audio(table["start"], table["duration"], table[name], feature))
+        else:
+            arrays.append(embed_array_storage(table[name], feature))
     return pa.Table.from_arrays(arrays, schema=features.arrow_schema)