Skip to content

Commit

Permalink
Feat/write empty chunks (zarr-developers#2429)
Browse files Browse the repository at this point in the history
* add write_empty_chunks to config.array namespace

* use write_empty_chunks from config in write_batch

* implement config-sensitive write_empty_chunks in write_batch, and add a test

* add literacy to test

* add warnings when write_empty_chunks is used as a kwarg

* init

* add ArrayConfig

* docstring

* ignore warning

* fix v2 test

* add test to ensure that write_empty_chunks can be set via the global config

* fix tests

* remove write_empty_chunks from Array.create; separate metadata order from config order

* remove missing overload

* Update src/zarr/core/array.py

Co-authored-by: Norman Rzepka <[email protected]>

* Update src/zarr/core/array.py

Co-authored-by: Norman Rzepka <[email protected]>

---------

Co-authored-by: Norman Rzepka <[email protected]>
  • Loading branch information
d-v-b and normanrz authored Dec 20, 2024
1 parent 6930fe8 commit 6dc6d07
Show file tree
Hide file tree
Showing 15 changed files with 399 additions and 109 deletions.
82 changes: 55 additions & 27 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,16 @@
from typing_extensions import deprecated

from zarr.core.array import Array, AsyncArray, get_array_metadata
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
from zarr.core.buffer import NDArrayLike
from zarr.core.common import (
JSON,
AccessModeLiteral,
ChunkCoords,
MemoryOrder,
ZarrFormat,
_warn_order_kwarg,
_warn_write_empty_chunks_kwarg,
parse_dtype,
)
from zarr.core.config import config
Expand Down Expand Up @@ -794,7 +797,7 @@ async def create(
read_only: bool | None = None,
object_codec: Codec | None = None, # TODO: type has changed
dimension_separator: Literal[".", "/"] | None = None,
write_empty_chunks: bool = False, # TODO: default has changed
write_empty_chunks: bool | None = None,
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
meta_array: Any | None = None, # TODO: need type
Expand All @@ -810,6 +813,7 @@ async def create(
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
dimension_names: Iterable[str] | None = None,
storage_options: dict[str, Any] | None = None,
config: ArrayConfig | ArrayConfigParams | None = None,
**kwargs: Any,
) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
"""Create an array.
Expand Down Expand Up @@ -856,8 +860,10 @@ async def create(
These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
Memory layout to be used within each chunk.
If not specified, default is taken from the Zarr config ```array.order```.
If not specified, the ``array.order`` parameter in the global config will be used.
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Expand Down Expand Up @@ -891,30 +897,26 @@ async def create(
Separator placed between the dimensions of a chunk.
V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
Default is ".".
.. versionadded:: 2.8
write_empty_chunks : bool, optional
If True (default), all chunks will be stored regardless of their
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
If True, all chunks will be stored regardless of their
contents. If False, each chunk is compared to the array's fill value
prior to storing. If a chunk is uniformly equal to the fill value, then
that chunk is not stored, and the store entry for that chunk's key
is deleted. This setting enables sparser storage, as only chunks with
non-fill-value data are stored, at the expense of overhead associated
with checking the data of each chunk.
.. versionadded:: 2.11
is deleted.
zarr_format : {2, 3, None}, optional
The zarr format to use when saving.
Default is 3.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
to users. Use `numpy.empty(())` by default.
.. versionadded:: 2.13
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
the backend implementation. Ignored otherwise.
config : ArrayConfig or ArrayConfigParams, optional
Runtime configuration of the array. If provided, will override the
default values from `zarr.config.array`.
Returns
-------
Expand Down Expand Up @@ -951,26 +953,47 @@ async def create(
warnings.warn("object_codec is not yet implemented", RuntimeWarning, stacklevel=2)
if read_only is not None:
warnings.warn("read_only is not yet implemented", RuntimeWarning, stacklevel=2)
if dimension_separator is not None:
if zarr_format == 3:
raise ValueError(
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
)
else:
warnings.warn(
"dimension_separator is not yet implemented",
RuntimeWarning,
stacklevel=2,
)
if write_empty_chunks:
warnings.warn("write_empty_chunks is not yet implemented", RuntimeWarning, stacklevel=2)
if dimension_separator is not None and zarr_format == 3:
raise ValueError(
"dimension_separator is not supported for zarr format 3, use chunk_key_encoding instead"
)

if order is not None:
_warn_order_kwarg()
if write_empty_chunks is not None:
_warn_write_empty_chunks_kwarg()

if meta_array is not None:
warnings.warn("meta_array is not yet implemented", RuntimeWarning, stacklevel=2)

mode = kwargs.pop("mode", None)
if mode is None:
mode = "a"
store_path = await make_store_path(store, path=path, mode=mode, storage_options=storage_options)

config_dict: ArrayConfigParams = {}

if write_empty_chunks is not None:
if config is not None:
msg = (
"Both write_empty_chunks and config keyword arguments are set. "
"This is redundant. When both are set, write_empty_chunks will be ignored and "
"config will be used."
)
warnings.warn(UserWarning(msg), stacklevel=1)
config_dict["write_empty_chunks"] = write_empty_chunks
if order is not None:
if config is not None:
msg = (
"Both order and config keyword arguments are set. "
"This is redundant. When both are set, order will be ignored and "
"config will be used."
)
warnings.warn(UserWarning(msg), stacklevel=1)
config_dict["order"] = order

config_parsed = ArrayConfig.from_dict(config_dict)

return await AsyncArray.create(
store_path,
shape=shape,
Expand All @@ -987,7 +1010,7 @@ async def create(
codecs=codecs,
dimension_names=dimension_names,
attributes=attributes,
order=order,
config=config_parsed,
**kwargs,
)

Expand Down Expand Up @@ -1163,6 +1186,11 @@ async def open_array(

zarr_format = _handle_zarr_version_or_format(zarr_version=zarr_version, zarr_format=zarr_format)

if "order" in kwargs:
_warn_order_kwarg()
if "write_empty_chunks" in kwargs:
_warn_write_empty_chunks_kwarg()

try:
return await AsyncArray.open(store_path, zarr_format=zarr_format)
except FileNotFoundError:
Expand Down
28 changes: 14 additions & 14 deletions src/zarr/api/synchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

from zarr.abc.codec import Codec
from zarr.api.asynchronous import ArrayLike, PathLike
from zarr.core.array_spec import ArrayConfig, ArrayConfigParams
from zarr.core.buffer import NDArrayLike
from zarr.core.chunk_key_encodings import ChunkKeyEncoding
from zarr.core.common import JSON, AccessModeLiteral, ChunkCoords, MemoryOrder, ZarrFormat
Expand Down Expand Up @@ -542,7 +543,7 @@ def create(
read_only: bool | None = None,
object_codec: Codec | None = None, # TODO: type has changed
dimension_separator: Literal[".", "/"] | None = None,
write_empty_chunks: bool = False, # TODO: default has changed
write_empty_chunks: bool | None = None, # TODO: default has changed
zarr_version: ZarrFormat | None = None, # deprecated
zarr_format: ZarrFormat | None = None,
meta_array: Any | None = None, # TODO: need type
Expand All @@ -558,6 +559,7 @@ def create(
codecs: Iterable[Codec | dict[str, JSON]] | None = None,
dimension_names: Iterable[str] | None = None,
storage_options: dict[str, Any] | None = None,
config: ArrayConfig | ArrayConfigParams | None = None,
**kwargs: Any,
) -> Array:
"""Create an array.
Expand All @@ -578,8 +580,10 @@ def create(
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'order': <value>}`` to ``create`` instead of using this parameter.
Memory layout to be used within each chunk.
Default is set in Zarr's config (`array.order`).
If not specified, the ``array.order`` parameter in the global config will be used.
store : Store or str
Store or path to directory in file system or name of zip file.
synchronizer : object, optional
Expand Down Expand Up @@ -609,30 +613,25 @@ def create(
A codec to encode object arrays, only needed if dtype=object.
dimension_separator : {'.', '/'}, optional
Separator placed between the dimensions of a chunk.
.. versionadded:: 2.8
write_empty_chunks : bool, optional
If True (default), all chunks will be stored regardless of their
Deprecated in favor of the ``config`` keyword argument.
Pass ``{'write_empty_chunks': <value>}`` to ``create`` instead of using this parameter.
If True, all chunks will be stored regardless of their
contents. If False, each chunk is compared to the array's fill value
prior to storing. If a chunk is uniformly equal to the fill value, then
that chunk is not stored, and the store entry for that chunk's key
is deleted. This setting enables sparser storage, as only chunks with
non-fill-value data are stored, at the expense of overhead associated
with checking the data of each chunk.
.. versionadded:: 2.11
is deleted.
zarr_format : {2, 3, None}, optional
The zarr format to use when saving.
meta_array : array-like, optional
An array instance to use for determining arrays to create and return
to users. Use `numpy.empty(())` by default.
.. versionadded:: 2.13
storage_options : dict
If using an fsspec URL to create the store, these will be passed to
the backend implementation. Ignored otherwise.
config : ArrayConfig or ArrayConfigParams, optional
Runtime configuration of the array. If provided, will override the
default values from `zarr.config.array`.
Returns
-------
Expand Down Expand Up @@ -669,6 +668,7 @@ def create(
codecs=codecs,
dimension_names=dimension_names,
storage_options=storage_options,
config=config,
**kwargs,
)
)
Expand Down
8 changes: 5 additions & 3 deletions src/zarr/codecs/sharding.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from zarr.abc.store import ByteGetter, ByteRangeRequest, ByteSetter
from zarr.codecs.bytes import BytesCodec
from zarr.codecs.crc32c_ import Crc32cCodec
from zarr.core.array_spec import ArraySpec
from zarr.core.array_spec import ArrayConfig, ArraySpec
from zarr.core.buffer import (
Buffer,
BufferPrototype,
Expand Down Expand Up @@ -665,7 +665,9 @@ def _get_index_chunk_spec(self, chunks_per_shard: ChunkCoords) -> ArraySpec:
shape=chunks_per_shard + (2,),
dtype=np.dtype("<u8"),
fill_value=MAX_UINT_64,
order="C", # Note: this is hard-coded for simplicity -- it is not surfaced into user code
config=ArrayConfig(
order="C", write_empty_chunks=False
), # Note: this is hard-coded for simplicity -- it is not surfaced into user code,
prototype=numpy_buffer_prototype(),
)

Expand All @@ -674,7 +676,7 @@ def _get_chunk_spec(self, shard_spec: ArraySpec) -> ArraySpec:
shape=self.chunk_shape,
dtype=shard_spec.dtype,
fill_value=shard_spec.fill_value,
order=shard_spec.order,
config=shard_spec.config,
prototype=shard_spec.prototype,
)

Expand Down
2 changes: 1 addition & 1 deletion src/zarr/codecs/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def resolve_metadata(self, chunk_spec: ArraySpec) -> ArraySpec:
shape=tuple(chunk_spec.shape[self.order[i]] for i in range(chunk_spec.ndim)),
dtype=chunk_spec.dtype,
fill_value=chunk_spec.fill_value,
order=chunk_spec.order,
config=chunk_spec.config,
prototype=chunk_spec.prototype,
)

Expand Down
Loading

0 comments on commit 6dc6d07

Please sign in to comment.