Source code for numpydantic.interface.hdf5

"""
Interfaces for HDF5 Datasets

.. note::

    HDF5 arrays are accessed through a proxy class :class:`.H5Proxy` .
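    Getting/setting values should work as normal, **except** that setting
    values on nested views is impossible, because every indexing operation
    returns a copy read from the file rather than a view onto the dataset.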
    
    Specifically this doesn't work:
    
    .. code-block:: python
    
        my_model.array[0][0] = 1
    
    But this does work:
    
    .. code-block:: python
    
        my_model.array[0,0] = 1
        
    To have direct access to the hdf5 dataset, use the
    :meth:`.H5Proxy.open` method.
    
Datetimes 
---------

Datetimes are supported as a dtype annotation, but currently they must be stored
as ``S32`` isoformatted byte strings (timezones optional) like:    

.. code-block:: python

    import h5py
    from datetime import datetime
    import numpy as np
    data = np.array([datetime.now().isoformat().encode('utf-8')], dtype="S32")
    h5f = h5py.File('test.hdf5', 'w')
    h5f.create_dataset('data', data=data)
    
"""

import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Iterable, List, NamedTuple, Optional, Tuple, TypeVar, Union

import numpy as np
from pydantic import SerializationInfo

from numpydantic.interface.interface import Interface
from numpydantic.types import DtypeType, NDArrayType

try:
    import h5py
except ImportError:  # pragma: no cover
    h5py = None

# ``TypeAlias`` only exists in the stdlib ``typing`` module from Python 3.10 on.
# Compare the whole version tuple (not just ``.minor``) so the check stays
# correct across major versions and is understood by static type checkers.
if sys.version_info >= (3, 10):
    from typing import TypeAlias
else:
    from typing_extensions import TypeAlias

# A ``(file, path)`` pair: location of an HDF5 file and a path within it.
H5Arraylike: TypeAlias = Tuple[Union[Path, str], str]

# Generic placeholder for values that are passed through unchanged.
T = TypeVar("T")


class H5ArrayPath(NamedTuple):
    """
    Pointer to one array stored inside an HDF5 file.

    Behaves like a plain ``(file, path, field)`` tuple; ``field`` is optional
    and defaults to ``None``.
    """

    file: Union[Path, str]
    """Filesystem location of the HDF5 file"""
    path: str
    """Path of the array within the HDF5 file"""
    field: Optional[Union[str, List[str]]] = None
    """Name (or names) of the field(s) to select within a compound dtype"""
class H5Proxy:
    """
    Proxy class to mimic numpy-like array behavior with an HDF5 array

    The attribute and item access methods only open the file for the duration of the
    method, making it less perilous to share this object between threads and
    processes.

    This class attempts to be a passthrough class to a :class:`h5py.Dataset` object,
    including its attributes and item getters/setters.

    When using read-only methods, no locking is attempted (beyond the HDF5 defaults),
    but when using the write methods (setting an array value), try and use the
    ``locking`` methods of :class:`h5py.File` .

    Args:
        file (pathlib.Path | str): Location of hdf5 file on filesystem
        path (str): Path to array within hdf5 file
        field (str, list[str]): Optional - refer to a specific field within
            a compound dtype
        annotation_dtype (dtype): Optional - the dtype of our type annotation
    """

    # Names assigned on the instance by ``__init__``. ``__getattr__`` must never
    # try to resolve these through the dataset: it needs them itself to open the
    # file, so doing so on a not-yet-initialised instance would recurse forever.
    _INSTANCE_ATTRS = ("_h5f", "file", "path", "field", "_annotation_dtype")

    def __init__(
        self,
        file: Union[Path, str],
        path: str,
        field: Optional[Union[str, List[str]]] = None,
        annotation_dtype: Optional[DtypeType] = None,
    ):
        # handle kept open only between :meth:`.open` and :meth:`.close`
        self._h5f = None
        self.file = Path(file)
        self.path = path
        self.field = field
        self._annotation_dtype = annotation_dtype

    def array_exists(self) -> bool:
        """Check that there is in fact an array at :attr:`.path` within :attr:`.file`"""
        with h5py.File(self.file, "r") as h5f:
            obj = h5f.get(self.path)
            return obj is not None

    @classmethod
    def from_h5array(cls, h5array: H5ArrayPath) -> "H5Proxy":
        """
        Instantiate using :class:`.H5ArrayPath`

        Uses ``cls`` rather than a hard-coded class so that subclasses
        get an instance of their own type back.
        """
        return cls(file=h5array.file, path=h5array.path, field=h5array.field)

    @property
    def dtype(self) -> np.dtype:
        """
        Get dtype of array, using :attr:`.field` if present
        """
        with h5py.File(self.file, "r") as h5f:
            obj = h5f.get(self.path)
            if self.field is None:
                return obj.dtype
            else:
                return obj.dtype[self.field]

    def __getattr__(self, item: str):
        """
        Forward attributes that are not found on the proxy to the dataset.

        The file is opened read-only for the duration of the lookup.

        Raises:
            AttributeError: if ``item`` is one of the proxy's own instance
                attributes (it has not been assigned yet), or if the lookup on
                the object found at :attr:`.path` fails.
        """
        # ``__getattr__`` only runs after normal lookup failed, so seeing one of
        # our own names here means the instance has no state yet (e.g. ``copy``
        # and ``pickle`` probe ``__setstate__`` on a bare instance). Fail fast
        # instead of recursing through ``self.file`` below.
        if item in self._INSTANCE_ATTRS:
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {item!r}"
            )
        with h5py.File(self.file, "r") as h5f:
            obj = h5f.get(self.path)
            return getattr(obj, item)

    def __getitem__(
        self, item: Union[int, slice, Tuple[Union[int, slice], ...]]
    ) -> Union[np.ndarray, DtypeType]:
        """
        Read ``item`` from the dataset, opening the file only for this call.

        String data is decoded, and when the annotation dtype is
        :class:`numpy.datetime64` the result is converted to ``datetime64``.
        """
        with h5py.File(self.file, "r") as h5f:
            obj = h5f.get(self.path)
            # handle compound dtypes
            if self.field is not None:
                # handle compound string dtype
                if encoding := h5py.h5t.check_string_dtype(obj.dtype[self.field]):
                    # the field name is passed as a trailing index element
                    if isinstance(item, tuple):
                        item = (*item, self.field)
                    else:
                        item = (item, self.field)

                    try:
                        # single string
                        val = obj[item].decode(encoding.encoding)
                        if self._annotation_dtype is np.datetime64:
                            return np.datetime64(val)
                        else:
                            return val
                    except AttributeError:
                        # numpy array of bytes
                        val = np.char.decode(obj[item], encoding=encoding.encoding)
                        if self._annotation_dtype is np.datetime64:
                            return val.astype(np.datetime64)
                        else:
                            return val
                # normal compound type
                else:
                    obj = obj.fields(self.field)
            else:
                if h5py.h5t.check_string_dtype(obj.dtype):
                    obj = obj.asstr()

            val = obj[item]
            if self._annotation_dtype is np.datetime64:
                if isinstance(val, str):
                    return np.datetime64(val)
                else:
                    return val.astype(np.datetime64)
            else:
                return val

    def __setitem__(
        self,
        key: Union[int, slice, Tuple[Union[int, slice], ...]],
        value: Union[int, float, datetime, np.ndarray],
    ):
        """
        Write ``value`` at ``key``, opening the file in ``r+`` mode with
        ``locking=True`` only for this call.
        """
        # TODO: Make a generalized value serdes system instead of ad-hoc type conversion
        value = self._serialize_datetime(value)
        with h5py.File(self.file, "r+", locking=True) as h5f:
            obj = h5f.get(self.path)
            if self.field is None:
                obj[key] = value
            else:
                # the field name is passed as a trailing index element
                if isinstance(key, tuple):
                    key = (*key, self.field)
                    obj[key] = value
                else:
                    obj[key, self.field] = value

    def __len__(self) -> int:
        """self.shape[0]"""
        # ``shape`` is not defined on the proxy, so it is resolved on the
        # dataset through ``__getattr__``
        return self.shape[0]

    def open(self, mode: str = "r") -> "h5py.Dataset":
        """
        Return the opened :class:`h5py.Dataset` object

        You must remember to close the associated file with :meth:`.close`

        If a file handle is already held from an earlier call, it is reused
        and ``mode`` is ignored.
        """
        if self._h5f is None:
            self._h5f = h5py.File(self.file, mode)
        return self._h5f.get(self.path)

    def close(self) -> None:
        """
        Close the :class:`h5py.File` object left open when returning the dataset with
        :meth:`.open`

        Does nothing if no file handle is currently held.
        """
        if self._h5f is not None:
            self._h5f.close()
        self._h5f = None

    def _serialize_datetime(self, v: Union[T, datetime]) -> Union[T, np.ndarray]:
        """
        Prepare a value for writing to a datetime-annotated dataset.

        When the annotation dtype is :class:`numpy.datetime64`, the value
        (wrapped in a list if it is not iterable) is converted to a numpy array
        and cast to ``S32`` byte strings. Otherwise it is returned unchanged.
        """
        if self._annotation_dtype is np.datetime64:
            if not isinstance(v, Iterable):
                v = [v]
            v = np.array(v).astype("S32")
        return v
class H5Interface(Interface):
    """
    Interface for Arrays stored as datasets within an HDF5 file.

    Takes a :class:`.H5ArrayPath` specifier to select a :class:`h5py.Dataset` from a
    :class:`h5py.File` and returns a :class:`.H5Proxy` class that acts like a
    passthrough numpy-like interface to the dataset.
    """

    input_types = (H5ArrayPath, H5Arraylike, H5Proxy)
    return_type = H5Proxy

    @classmethod
    def enabled(cls) -> bool:
        """Check whether h5py can be imported"""
        return h5py is not None

    @classmethod
    def check(cls, array: Union[H5ArrayPath, Tuple[Union[Path, str], str]]) -> bool:
        """
        Check that the given array is a :class:`.H5ArrayPath` or something that
        resembles one.

        A 2- or 3-element tuple/list applies when its first element is a path to
        an existing file that h5py can open. This is only a probe for whether
        the interface applies, so it returns ``False`` rather than raising.
        """
        if isinstance(array, (H5ArrayPath, H5Proxy)):
            return True

        if isinstance(array, (tuple, list)) and len(array) in (2, 3):
            # check that the first arg is an hdf5 file
            try:
                file = Path(array[0])
            except TypeError:
                # not a path, we don't apply.
                return False

            try:
                if not file.exists():
                    return False
            except OSError:
                # ``Path.exists`` can re-raise some OS errors (e.g. an overlong
                # name or an unreadable parent directory). Arbitrary input must
                # not crash the probe, so treat it as "not ours".
                return False

            # hdf5 files are commonly given odd suffixes,
            # so we just try and open it and see what happens
            try:
                with h5py.File(file, "r"):
                    # don't check that the array exists and raise here,
                    # this check is just for whether the validator applies or not.
                    pass
            except OSError:
                # also covers FileNotFoundError, which is an OSError subclass
                return False
            return True

        return False

    def before_validation(self, array: Any) -> NDArrayType:
        """
        Create an :class:`.H5Proxy` to use throughout validation

        Raises:
            ValueError: if the input cannot be turned into a proxy, or if no
                array is found at the given path within the file.
        """
        if isinstance(array, H5ArrayPath):
            array = H5Proxy.from_h5array(h5array=array)
        elif isinstance(array, H5Proxy):
            # nothing to do, already proxied
            pass
        elif isinstance(array, (tuple, list)) and len(array) == 2:  # pragma: no cover
            array = H5Proxy(file=array[0], path=array[1])
        elif isinstance(array, (tuple, list)) and len(array) == 3:
            array = H5Proxy(file=array[0], path=array[1], field=array[2])
        else:  # pragma: no cover
            # this should never happen really since `check` confirms this before
            # we'd reach here, but just to complete the if else...
            raise ValueError(
                "Need to specify a file and a path within an HDF5 file to use the HDF5 "
                "Interface"
            )

        # let the proxy know which dtype the annotation asks for, so it can
        # convert values when reading and writing
        array._annotation_dtype = self.dtype

        if not array.array_exists():
            raise ValueError(
                f"HDF5 file located at {array.file}, "
                f"but no array found at {array.path}"
            )

        return array

    def get_dtype(self, array: NDArrayType) -> DtypeType:
        """
        Get the dtype from the input array.

        String datasets are reported as ``str``, or as :class:`numpy.datetime64`
        when the first element reads back as a datetime. All other datasets
        report their own dtype.
        """
        if h5py.h5t.check_string_dtype(array.dtype):
            # check for datetimes
            try:
                if array[0].dtype.type is np.datetime64:
                    return np.datetime64
                else:
                    return str
            except (AttributeError, TypeError):  # pragma: no cover
                # it's not a datetime, but it is some kind of string
                return str
            except (IndexError, ValueError):
                # if the dataset is empty, we can't tell if something is a datetime
                # or not, so we just tell the validation method what it wants to hear
                if self.dtype in (np.datetime64, str):
                    return self.dtype
                else:
                    return str
        else:
            return array.dtype

    @classmethod
    def to_json(cls, array: H5Proxy, info: Optional[SerializationInfo] = None) -> dict:
        """
        Dump to a dictionary containing

        * ``file``: :attr:`.file`
        * ``path``: :attr:`.path`
        * ``attrs``: Any HDF5 attributes on the dataset
        * ``array``: The array as a list of lists

        A file handle that the caller already opened with :meth:`.H5Proxy.open`
        is reused and left open; a handle opened here is closed again.
        """
        # remember whether we are borrowing a handle the caller opened, so that
        # serializing does not invalidate a dataset they are still using
        opened_here = array._h5f is None
        try:
            dset = array.open()
            meta = {
                "file": array.file,
                "path": array.path,
                "attrs": dict(dset.attrs),
                "array": dset[:].tolist(),
            }
            return meta
        finally:
            if opened_here:
                array.close()