Add default compressors to config (zarr-developers#2470)

* add default compressor to config * modify _default_compressor to _default_filters_and_compressor * fix test_metadata_to_dict * wip debugging * format * fix v2 decode string dtype * fix config default tests * format * Update src/zarr/codecs/_v2.py * rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_compressors * recover test_v2.py * incorporate feedback * incorporate feedback * fix mypy * allow only one default compressor * put `v2_default_compressor` under `array` * deprecate zarr.storage.default_compressor * test v3_default_codecs * use v3_default_codecs * fix tests that expected codecs==["bytes"] * fix test_default_codecs * fail-fast: false * fix string codecs for np1.25 * format * add docstrings to create in asynchronous.py and array.py * add docstrings to creation in group.py * Apply suggestions from code review Co-authored-by: David Stansby <[email protected]> * apply suggestions from review * correct code double backticks * correct attribute links in docstring * link zarr.core.config in docstrings * improve docstring readability * correct config docstring * correct config docstring * improve config docstring --------- Co-authored-by: Norman Rzepka <[email protected]> Co-authored-by: David Stansby <[email protected]>
DimitriPapadopoulos · Dec 19, 2024 · 4cb8ddd · 4cb8ddd
1 parent 1cc3917
commit 4cb8ddd
Show file tree

Hide file tree

Showing 14 changed files with 529 additions and 150 deletions.
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
@@ -17,10 +17,12 @@
     ChunkCoords,
     MemoryOrder,
     ZarrFormat,
+    parse_dtype,
 )
 from zarr.core.config import config
 from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
 from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
+from zarr.core.metadata.v2 import _default_filters_and_compressor
 from zarr.errors import NodeTypeValidationError
 from zarr.storage import (
     StoreLike,
@@ -401,7 +403,7 @@ async def save_array(
     arr : ndarray
         NumPy array with data to save.
     zarr_format : {2, 3, None}, optional
-        The zarr format to use when saving.
+        The zarr format to use when saving (default is 3 if not specified).
     path : str or None, optional
         The path within the store where the array will be saved.
     storage_options : dict
@@ -817,19 +819,45 @@ async def create(
     shape : int or tuple of ints
         Array shape.
     chunks : int or tuple of ints, optional
-        Chunk shape. If True, will be guessed from `shape` and `dtype`. If
-        False, will be set to `shape`, i.e., single chunk for the whole array.
-        If an int, the chunk size in each dimension will be given by the value
-        of `chunks`. Default is True.
+        The shape of the array's chunks.
+        V2 only. V3 arrays should use `chunk_shape` instead.
+        If not specified, default values are guessed based on the shape and dtype.
     dtype : str or dtype, optional
         NumPy dtype.
+    chunk_shape : int or tuple of ints, optional
+        The shape of the Array's chunks (default is None).
+        V3 only. V2 arrays should use `chunks` instead.
+    chunk_key_encoding : ChunkKeyEncoding, optional
+        A specification of how the chunk keys are represented in storage.
+        V3 only. V2 arrays should use `dimension_separator` instead.
+        Default is ``("default", "/")``.
+    codecs : Sequence of Codecs or dicts, optional
+        An iterable of Codec or dict serializations of Codecs. The elements of
+        this collection specify the transformation from array values to stored bytes.
+        V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+        If no codecs are provided, default codecs will be used:
+
+        - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+        - For Unicode strings, the default is ``VLenUTF8Codec``.
+        - For bytes or objects, the default is ``VLenBytesCodec``.
+
+        These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
     compressor : Codec, optional
-        Primary compressor.
-    fill_value : object
+        Primary compressor to compress chunk data.
+        V2 only. V3 arrays should use ``codecs`` instead.
+
+        If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+        - For numeric arrays, the default is ``ZstdCodec``.
+        - For Unicode strings, the default is ``VLenUTF8Codec``.
+        - For bytes or objects, the default is ``VLenBytesCodec``.
+
+        These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
+    fill_value : object
         Default value to use for uninitialized portions of the array.
     order : {'C', 'F'}, optional
         Memory layout to be used within each chunk.
-        Default is set in Zarr's config (`array.order`).
+        If not specified, default is taken from the Zarr config ``array.order``.
     store : Store or str
         Store or path to directory in file system or name of zip file.
     synchronizer : object, optional
@@ -844,6 +872,8 @@ async def create(
         for storage of both chunks and metadata.
     filters : sequence of Codecs, optional
         Sequence of filters to use to encode chunk data prior to compression.
+        V2 only. If neither ``compressor`` nor ``filters`` are provided, a default
+        compressor will be used. (see ``compressor`` for details).
     cache_metadata : bool, optional
         If True, array configuration metadata will be cached for the
         lifetime of the object. If False, array metadata will be reloaded
@@ -859,7 +889,8 @@ async def create(
         A codec to encode object arrays, only needed if dtype=object.
     dimension_separator : {'.', '/'}, optional
         Separator placed between the dimensions of a chunk.
-
+        V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
+        Default is ".".
+
         .. versionadded:: 2.8
 
     write_empty_chunks : bool, optional
@@ -875,6 +906,7 @@ async def create(
 
     zarr_format : {2, 3, None}, optional
         The zarr format to use when saving.
+        Default is 3.
     meta_array : array-like, optional
         An array instance to use for determining arrays to create and return
         to users. Use `numpy.empty(())` by default.
@@ -894,9 +926,13 @@ async def create(
         or _default_zarr_version()
     )
 
-    if zarr_format == 2 and chunks is None:
-        chunks = shape
-    elif zarr_format == 3 and chunk_shape is None:
+    if zarr_format == 2:
+        if chunks is None:
+            chunks = shape
+        dtype = parse_dtype(dtype, zarr_format)
+        if not filters and not compressor:
+            filters, compressor = _default_filters_and_compressor(dtype)
+    elif zarr_format == 3 and chunk_shape is None:  # type: ignore[redundant-expr]
         if chunks is not None:
             chunk_shape = chunks
             chunks = None

diff --git a/src/zarr/codecs/__init__.py b/src/zarr/codecs/__init__.py
@@ -1,10 +1,5 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
-
-if TYPE_CHECKING:
-    import numpy as np
-
 from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
 from zarr.codecs.bytes import BytesCodec, Endian
 from zarr.codecs.crc32c_ import Crc32cCodec
@@ -13,7 +8,6 @@
 from zarr.codecs.transpose import TransposeCodec
 from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
 from zarr.codecs.zstd import ZstdCodec
-from zarr.core.metadata.v3 import DataType
 
 __all__ = [
     "BloscCname",
@@ -30,15 +24,3 @@
     "VLenUTF8Codec",
     "ZstdCodec",
 ]
-
-
-def _get_default_array_bytes_codec(
-    np_dtype: np.dtype[Any],
-) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
-    dtype = DataType.from_numpy(np_dtype)
-    if dtype == DataType.string:
-        return VLenUTF8Codec()
-    elif dtype == DataType.bytes:
-        return VLenBytesCodec()
-    else:
-        return BytesCodec()
diff --git a/src/zarr/codecs/_v2.py b/src/zarr/codecs/_v2.py
@@ -5,6 +5,7 @@
 from typing import TYPE_CHECKING
 
 import numcodecs
+import numpy as np
 from numcodecs.compat import ensure_bytes, ensure_ndarray_like
 
 from zarr.abc.codec import ArrayBytesCodec
@@ -46,7 +47,17 @@ async def _decode_single(
         # special case object dtype, because incorrect handling can lead to
         # segfaults and other bad things happening
         if chunk_spec.dtype != object:
-            chunk = chunk.view(chunk_spec.dtype)
+            try:
+                chunk = chunk.view(chunk_spec.dtype)
+            except TypeError:
+                # this will happen if the dtype of the chunk
+                # does not match the dtype of the array spec, e.g. if
+                # the dtype of the chunk_spec is a string dtype, but the chunk
+                # is an object array. In this case, we need to convert the object
+                # array to the correct dtype.
+
+                chunk = np.array(chunk).astype(chunk_spec.dtype)
+
         elif chunk.dtype != object:
             # If we end up here, someone must have hacked around with the filters.
             # We cannot deal with object arrays unless there is an object

diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -13,7 +13,6 @@
 
 from zarr._compat import _deprecate_positional_args
 from zarr.abc.store import Store, set_or_delete
-from zarr.codecs import _get_default_array_bytes_codec
 from zarr.codecs._v2 import V2Codec
 from zarr.core._info import ArrayInfo
 from zarr.core.attributes import Attributes
@@ -78,7 +77,8 @@
     ArrayV3MetadataDict,
     T_ArrayMetadata,
 )
-from zarr.core.metadata.v3 import parse_node_type_array
+from zarr.core.metadata.v2 import _default_filters_and_compressor
+from zarr.core.metadata.v3 import DataType, parse_node_type_array
 from zarr.core.sync import sync
 from zarr.errors import MetadataValidationError
 from zarr.registry import get_pipeline_class
@@ -409,27 +409,53 @@ async def create(
         attributes : dict[str, JSON], optional
             The attributes of the array (default is None).
         chunk_shape : ChunkCoords, optional
-            The shape of the array's chunks (default is None).
+            The shape of the array's chunks.
+            V3 only. V2 arrays should use `chunks` instead.
+            If not specified, default values are guessed based on the shape and dtype.
         chunk_key_encoding : ChunkKeyEncoding, optional
-            The chunk key encoding (default is None).
-        codecs : Iterable[Codec | dict[str, JSON]], optional
-            The codecs used to encode the data (default is None).
+            A specification of how the chunk keys are represented in storage.
+            V3 only. V2 arrays should use `dimension_separator` instead.
+            Default is ``("default", "/")``.
+        codecs : Sequence of Codecs or dicts, optional
+            An iterable of Codec or dict serializations of Codecs. The elements of
+            this collection specify the transformation from array values to stored bytes.
+            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+            If no codecs are provided, default codecs will be used:
+
+            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
         dimension_names : Iterable[str], optional
             The names of the dimensions (default is None).
+            V3 only. V2 arrays should not use this parameter.
         chunks : ShapeLike, optional
-            The shape of the array's chunks (default is None).
-            V2 only. V3 arrays should not have 'chunks' parameter.
+            The shape of the array's chunks.
+            V2 only. V3 arrays should use ``chunk_shape`` instead.
+            If not specified, default values are guessed based on the shape and dtype.
         dimension_separator : Literal[".", "/"], optional
-            The dimension separator (default is None).
-            V2 only. V3 arrays cannot have a dimension separator.
+            The dimension separator (default is ".").
+            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         order : Literal["C", "F"], optional
-            The order of the array (default is None).
+            The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
         filters : list[dict[str, JSON]], optional
-            The filters used to compress the data (default is None).
-            V2 only. V3 arrays should not have 'filters' parameter.
+            Sequence of filters to use to encode chunk data prior to compression.
+            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used. (see
+            ``compressor`` for details)
         compressor : dict[str, JSON], optional
             The compressor used to compress the data (default is None).
-            V2 only. V3 arrays should not have 'compressor' parameter.
+            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
         overwrite : bool, optional
             Whether to raise an error if the store already exists (default is False).
         data : npt.ArrayLike, optional
@@ -494,14 +520,6 @@ async def create(
                 order=order,
             )
         elif zarr_format == 2:
-            if dtype is str or dtype == "str":
-                # another special case: zarr v2 added the vlen-utf8 codec
-                vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
-                if filters and not any(x["id"] == "vlen-utf8" for x in filters):
-                    filters = list(filters) + [vlen_codec]
-                else:
-                    filters = [vlen_codec]
-
             if codecs is not None:
                 raise ValueError(
                     "codecs cannot be used for arrays with version 2. Use filters and compressor instead."
@@ -564,11 +582,7 @@ async def _create_v3(
             await ensure_no_existing_node(store_path, zarr_format=3)
 
         shape = parse_shapelike(shape)
-        codecs = (
-            list(codecs)
-            if codecs is not None
-            else [_get_default_array_bytes_codec(np.dtype(dtype))]
-        )
+        codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype))
 
         if chunk_key_encoding is None:
             chunk_key_encoding = ("default", "/")
@@ -634,6 +648,14 @@ async def _create_v2(
         if dimension_separator is None:
             dimension_separator = "."
 
+        dtype = parse_dtype(dtype, zarr_format=2)
+        if not filters and not compressor:
+            filters, compressor = _default_filters_and_compressor(dtype)
+        if np.issubdtype(dtype, np.str_):
+            filters = filters or []
+            if not any(x["id"] == "vlen-utf8" for x in filters):
+                filters = list(filters) + [{"id": "vlen-utf8"}]
+
         metadata = ArrayV2Metadata(
             shape=shape,
             dtype=np.dtype(dtype),
@@ -1493,23 +1515,53 @@ def create(
         dtype : npt.DTypeLike
             The data type of the array.
         chunk_shape : ChunkCoords, optional
-            The shape of the Array's chunks (default is None).
+            The shape of the Array's chunks.
+            V3 only. V2 arrays should use `chunks` instead.
+            If not specified, default values are guessed based on the shape and dtype.
         chunk_key_encoding : ChunkKeyEncoding, optional
-            The chunk key encoding (default is None).
-        codecs : Iterable[Codec | dict[str, JSON]], optional
-            The codecs used to encode the data (default is None).
+            A specification of how the chunk keys are represented in storage.
+            V3 only. V2 arrays should use `dimension_separator` instead.
+            Default is ``("default", "/")``.
+        codecs : Sequence of Codecs or dicts, optional
+            An iterable of Codec or dict serializations of Codecs. The elements of
+            this collection specify the transformation from array values to stored bytes.
+            V3 only. V2 arrays should use ``filters`` and ``compressor`` instead.
+
+            If no codecs are provided, default codecs will be used:
+
+            - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`.
         dimension_names : Iterable[str], optional
             The names of the dimensions (default is None).
+            V3 only. V2 arrays should not use this parameter.
         chunks : ChunkCoords, optional
-            The shape of the Array's chunks (default is None).
+            The shape of the array's chunks.
+            V2 only. V3 arrays should use ``chunk_shape`` instead.
+            If not specified, default values are guessed based on the shape and dtype.
         dimension_separator : Literal[".", "/"], optional
-            The dimension separator (default is None).
+            The dimension separator (default is ".").
+            V2 only. V3 arrays should use ``chunk_key_encoding`` instead.
         order : Literal["C", "F"], optional
-            The order of the array (default is None).
+            The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`).
         filters : list[dict[str, JSON]], optional
-            The filters used to compress the data (default is None).
+            Sequence of filters to use to encode chunk data prior to compression.
+            V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor``
+            nor ``filters`` are provided, a default compressor will be used. (see
+            ``compressor`` for details)
         compressor : dict[str, JSON], optional
-            The compressor used to compress the data (default is None).
+            Primary compressor to compress chunk data.
+            V2 only. V3 arrays should use ``codecs`` instead.
+
+            If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used:
+
+            - For numeric arrays, the default is ``ZstdCodec``.
+            - For Unicode strings, the default is ``VLenUTF8Codec``.
+            - For bytes or objects, the default is ``VLenBytesCodec``.
+
+            These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`.
         overwrite : bool, optional
             Whether to raise an error if the store already exists (default is False).
 
@@ -3342,3 +3394,18 @@ def _build_parents(
         )
 
     return parents
+
+
+def _get_default_codecs(
+    np_dtype: np.dtype[Any],
+) -> list[dict[str, JSON]]:
+    default_codecs = config.get("array.v3_default_codecs")
+    dtype = DataType.from_numpy(np_dtype)
+    if dtype == DataType.string:
+        dtype_key = "string"
+    elif dtype == DataType.bytes:
+        dtype_key = "bytes"
+    else:
+        dtype_key = "numeric"
+
+    return [{"name": codec_id, "configuration": {}} for codec_id in default_codecs[dtype_key]]