"""
Interface to zarr arrays
"""
import contextlib
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Literal, Optional, Sequence, Union
import numpy as np
from pydantic import SerializationInfo
from numpydantic.interface.interface import Interface, JsonDict
from numpydantic.types import DtypeType
# zarr is an optional dependency: when it (or numcodecs) cannot be imported,
# every name this module takes from it is bound to ``None`` so the module can
# still be imported and ``ZarrInterface.enabled()`` can report ``False``.
try:
    import zarr
    from numcodecs import VLenUTF8
    from zarr.core import Array as ZarrArray
    from zarr.storage import StoreLike
except ImportError:  # pragma: no cover
    # ``zarr`` itself is referenced by ``ZarrArrayPath.open``, so give it a
    # fallback binding alongside the other names.
    zarr = None
    ZarrArray = None
    StoreLike = None
    # ``storage`` is never imported above; kept so existing references to
    # this module attribute keep working.
    storage = None
    VLenUTF8 = None
[docs]
@dataclass
class ZarrArrayPath:
    """
    Map to an array within a zarr store.

    See :func:`zarr.open`
    """

    file: Union[Path, str]
    """Location of Zarr store file or directory"""
    path: Optional[str] = None
    """Path to array within hierarchical zarr store"""

    def open(self, **kwargs: Any) -> "ZarrArray":
        """
        Open the zarr array at the provided path.

        Args:
            **kwargs: Forwarded unchanged to :func:`zarr.open`
                (for example ``mode``).

        Returns:
            The object returned by :func:`zarr.open` for ``file`` / ``path``.
        """
        # ``file`` may be a Path; it is passed to zarr as a plain string.
        return zarr.open(str(self.file), path=self.path, **kwargs)

    @classmethod
    def from_iterable(cls, spec: Sequence) -> "ZarrArrayPath":
        """
        Construct a :class:`.ZarrArrayPath` specifier from an iterable,
        rather than kwargs

        Args:
            spec: ``(file,)`` or ``(file, path)``.

        Returns:
            A new instance of ``cls`` (so subclasses construct themselves).

        Raises:
            ValueError: If ``spec`` does not have exactly 1 or 2 items.
        """
        n_items = len(spec)
        if n_items == 1:
            return cls(file=spec[0])
        if n_items == 2:
            return cls(file=spec[0], path=spec[1])
        raise ValueError("Only len 1-2 iterables can be used for a ZarrArrayPath")
[docs]
class ZarrJsonDict(JsonDict):
    """
    JSON-serializable description of a Zarr array, suitable for round-tripping.

    ``info`` and ``type`` are required; ``file``, ``path`` and ``value`` are
    optional and default to ``None``.
    """

    # String-to-string metadata describing the array
    info: dict[str, str]
    # Tag marking this payload as a zarr array; only "zarr" is accepted
    type: Literal["zarr"]
    # Location of the zarr store, when the array is backed by one
    file: Optional[str] = None
    # Path to the array within the store
    path: Optional[str] = None
    # The array contents as (nested) lists, when they were dumped
    value: Optional[list] = None
[docs]
class ZarrInterface(Interface):
    """
    Interface to in-memory or on-disk zarr arrays
    """

    name = "zarr"
    input_types = (Path, ZarrArray, ZarrArrayPath)
    return_type = ZarrArray
    json_model = ZarrJsonDict

    @classmethod
    def enabled(cls) -> bool:
        """True if zarr is installed"""
        return ZarrArray is not None

    @staticmethod
    def _get_array(
        array: Union[ZarrArray, str, dict, ZarrJsonDict, Path, ZarrArrayPath, Sequence]
    ) -> ZarrArray:
        """
        Resolve an array specifier to an opened zarr array.

        An existing zarr array is returned unchanged. A ``str``/``Path`` is
        treated as a store location, a ``tuple``/``list`` as a
        ``(file,)`` / ``(file, path)`` spec; anything else is expected to
        provide ``open`` itself (e.g. a :class:`.ZarrArrayPath`).
        The store is opened with ``mode="a"``.
        """
        if isinstance(array, ZarrArray):
            return array

        if isinstance(array, (str, Path)):
            array = ZarrArrayPath(file=array)
        elif isinstance(array, (tuple, list)):
            array = ZarrArrayPath.from_iterable(array)

        return array.open(mode="a")

    @classmethod
    def check(cls, array: Any) -> bool:
        """
        Check if array is in-memory zarr array,
        a path to a zarr array, or a :class:`.ZarrArrayPath`

        Dicts are accepted when their ``"type"`` equals :attr:`name`;
        otherwise their ``"file"`` entry (if any) is checked like a path.
        """
        if isinstance(array, ZarrArray):
            return True

        if isinstance(array, dict):
            if array.get("type", False) == cls.name:
                return True
            # continue checking if dict contains a zarr file
            array = array.get("file", "")

        # See if can be coerced to ZarrArrayPath
        if isinstance(array, (Path, str)):
            array = ZarrArrayPath(file=array)

        if isinstance(array, (tuple, list)):
            # something that can be coerced to ZarrArrayPath;
            # wrong-length sequences are simply left as they are
            with contextlib.suppress(ValueError):
                array = ZarrArrayPath.from_iterable(array)

        if isinstance(array, ZarrArrayPath):
            # Best-effort probe: any failure to open just means "not ours"
            with contextlib.suppress(Exception):
                arr = array.open(mode="r")
                if isinstance(arr, ZarrArray):
                    return True

        return False

    def before_validation(
        self, array: Union[ZarrArray, str, Path, ZarrArrayPath, Sequence]
    ) -> ZarrArray:
        """
        Ensure that the zarr array is opened
        """
        return self._get_array(array)

    def get_dtype(self, array: ZarrArray) -> DtypeType:
        """
        Override base dtype getter to handle zarr's string-as-object encoding.

        An object-dtype array that has a ``VLenUTF8`` filter is reported as
        ``np.str_``; every other array reports its own ``dtype``.
        """
        if (
            getattr(array.dtype, "type", None) is np.object_
            and array.filters
            and any(isinstance(f, VLenUTF8) for f in array.filters)
        ):
            return np.str_
        return array.dtype

    @classmethod
    def to_json(
        cls,
        array: Union[ZarrArray, str, Path, ZarrArrayPath, Sequence],
        info: Optional[SerializationInfo] = None,
    ) -> Union[list, ZarrJsonDict]:
        """
        Dump a Zarr Array to JSON

        If ``info`` is not given or ``info.round_trip == False``,
        dump the array as a list of lists.
        This may be a memory-intensive operation.

        Otherwise, dump the metadata for an array from
        :meth:`zarr.core.Array.info_items`
        plus the :meth:`zarr.core.Array.hexdigest` as a :class:`.ZarrJsonDict`

        If either the ``dump_array`` value in the context dictionary is ``True``
        or the zarr array is an in-memory array, dump the array as well
        (since without a persistent array it would be impossible to roundtrip and
        dumping to JSON would be meaningless)

        Passing ``'dump_array': True`` to the serialization ``context``
        looks like this (it is only consulted for round-trip dumps)::

            model.model_dump_json(round_trip=True, context={'dump_array': True})
        """
        array = cls._get_array(array)

        # ``info`` is optional; without it no round trip was requested,
        # so fall back to the plain nested-list dump.
        if info is None or not info.round_trip:
            return array[:].tolist()

        dump_array = False
        if info.context is not None:
            dump_array = info.context.get("dump_array", False)

        as_json = {"type": cls.name}

        # Stores exposing ``dir_path`` are treated as file-backed; only those
        # can be re-opened later from the ``file`` / ``path`` pair.
        is_file = hasattr(array.store, "dir_path")
        if is_file:
            as_json["file"] = array.store.dir_path()
            as_json["path"] = array.name

        as_json["info"] = {i[0]: i[1] for i in array.info_items()}
        as_json["info"]["hexdigest"] = array.hexdigest()

        # Without a file to point at, the values are the only way to round-trip
        if dump_array or not is_file:
            as_json["value"] = array[:].tolist()

        return ZarrJsonDict(**as_json)