open_virtual_dataset with dmr++ #113

Merged (28 commits, Aug 26, 2024)

Changes shown below are from 2 commits.

Commits (28):
18b53bd  basic dmr parsing functionality (ayushnag, May 13, 2024)
47d8901  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 14, 2024)
f3bfa82  Merge branch 'TomNicholas:main' into dmr-adapter (ayushnag, May 14, 2024)
aaf6af2  Speedup DMR chunk key parsing (agoodm, May 14, 2024)
fc8b0d8  Merge pull request #1 from agoodm/dmr-adapter (ayushnag, May 14, 2024)
7b81eeb  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 14, 2024)
8334d0a  added groups, docs, and bug fixes (ayushnag, May 16, 2024)
64d59b1  Merge branch 'TomNicholas:main' into dmr-adapter (ayushnag, Jun 3, 2024)
1a3b787  Merge branch 'zarr-developers:main' into dmr-adapter (ayushnag, Jun 21, 2024)
7580fdc  rework hdf5 parser and group logic (ayushnag, Jun 27, 2024)
52ceba0  Merge remote-tracking branch 'upstream/main' into dmr-adapter (ayushnag, Jul 3, 2024)
b1f9aee  update attrs cast to python dtype (ayushnag, Jul 10, 2024)
ae29176  parser passing tests (ayushnag, Jul 14, 2024)
6e763f9  match main manifest dtypes (ayushnag, Jul 14, 2024)
0824ed2  Merge branch 'zarr-developers:main' into dmr-adapter (ayushnag, Jul 15, 2024)
659ab65  Merge branch 'zarr-developers:main' into dmr-adapter (ayushnag, Jul 15, 2024)
b8531c8  Merge branch 'zarr-developers:main' into dmr-adapter (ayushnag, Jul 19, 2024)
0125d71  Merge branch 'zarr-developers:main' into dmr-adapter (ayushnag, Aug 2, 2024)
ef8aa9c  modularize dmrpp.py (ayushnag, Aug 3, 2024)
7638092  add dmrpp api docs (ayushnag, Aug 4, 2024)
83cb586  resolve conflict (ayushnag, Aug 4, 2024)
cb6feff  resolve releases conflict (ayushnag, Aug 4, 2024)
888ce32  indexes and docs fix (ayushnag, Aug 25, 2024)
3e15e8e  Merge branch 'main' into dmr-adapter (TomNicholas, Aug 26, 2024)
ee23ec0  Fix type hint for shape (TomNicholas, Aug 26, 2024)
d9337ff  change how FileType is used (TomNicholas, Aug 26, 2024)
6bb9218  Change FileType check again (TomNicholas, Aug 26, 2024)
d1948d4  fix storage_options bug (TomNicholas, Aug 26, 2024)
134 changes: 134 additions & 0 deletions virtualizarr/dmrpp.py
@@ -0,0 +1,134 @@
import ast
from xml.etree import ElementTree as ET

import numpy as np
import xarray as xr

from virtualizarr.manifests import ManifestArray
from virtualizarr.zarr import ZArray


class DMRParser:
    dap_namespace = "{http://xml.opendap.org/ns/DAP/4.0#}"
    dmr_namespace = "{http://xml.opendap.org/dap/dmrpp/1.0.0#}"
    dap_npdtype = {
        "Byte": "uint8",
        "UByte": "uint8",
        "Int8": "int8",
        "UInt8": "uint8",
        "Int16": "int16",
        "UInt16": "uint16",
        "Int32": "int32",
        "UInt32": "uint32",
        "Int64": "int64",
        "UInt64": "uint64",
        "Url": "str",
        "Float32": "float32",
        "Float64": "float64",
        "String": "str",
    }

    def __init__(self, dmr: str):
        self.root = ET.fromstring(dmr)
        self.data_filepath = self.root.attrib["name"]
        self.global_dims = {}

    def parse_dataset(self):
        # find all dimension names and sizes
        for d in self.root.iterfind(self.dap_namespace + "Dimension"):
            self.global_dims[d.attrib["name"]] = int(d.attrib["size"])
        vars_tags = []
        for dap_dtype in self.dap_npdtype:
            vars_tags += self.root.findall(self.dap_namespace + dap_dtype)
        # find all coordinate names (using Map tags)
        coord_names = set()
        for var_tag in vars_tags:
            for map_tag in var_tag.iterfind(self.dap_namespace + "Map"):
                coord_names.add(map_tag.attrib["name"].removeprefix("/"))
        coords = {}
        data_vars = {}
        for var_tag in vars_tags:
            if var_tag.attrib["name"] in coord_names:
                coords[var_tag.attrib["name"]] = self.parse_variable(var_tag)
                # if len(coords[v.attrib['name']].dims) == 1:
                #     dim1d, *_ = coords[v.attrib['name']].dims
                #     indexes[v.attrib['name']] = PandasIndex(coords[v.attrib['name']], dim1d)
            else:
                data_vars[var_tag.attrib["name"]] = self.parse_variable(var_tag)
        # find all dataset attributes
        attrs = {}
        for attr_tag in self.root.iterfind(self.dap_namespace + "Attribute"):
            if attr_tag.attrib["type"] != "Container":
                attrs.update(self.parse_attribute(attr_tag))
        return xr.Dataset(
            data_vars=data_vars,
            coords=xr.Coordinates(coords=coords, indexes={}),
            attrs=attrs,
        )

    def parse_variable(self, root) -> xr.Variable:
        # parse dimensions
        dims = []
        for d in root.iterfind(self.dap_namespace + "Dim"):
            dims.append(d.attrib["name"].removeprefix("/"))
        shape = tuple([self.global_dims[d] for d in dims])
        # parse chunks
        chunks = shape
        chunks_tag = root.find(self.dmr_namespace + "chunks")
        if chunks_tag.find(self.dmr_namespace + "chunkDimensionSizes") is not None:
            dim_str = chunks_tag.find(self.dmr_namespace + "chunkDimensionSizes").text
            chunks = tuple(map(int, dim_str.split()))
        chunkmanifest = self.parse_chunks(chunks_tag, chunks)
        # parse attributes
        attrs = {}
        for a in root.iterfind(self.dap_namespace + "Attribute"):
            attrs.update(self.parse_attribute(a))
        # create ManifestArray and ZArray
        dtype = np.dtype(self.dap_npdtype[root.tag.removeprefix(self.dap_namespace)])
        fill_value = (
            attrs["_FillValue"]
            if "_FillValue" in attrs and attrs["_FillValue"] != "*"
            else None
        )
        zarray = ZArray(
            chunks=chunks,
            dtype=dtype,
            fill_value=fill_value,
            order="C",
            shape=shape,
            zarr_format=3,
        )
        marr = ManifestArray(zarray=zarray, chunkmanifest=chunkmanifest)
        # create encoding dict (and remove those keys from attrs)
        encoding_keys = {"_FillValue", "missing_value", "scale_factor", "add_offset"}
        encoding = {key: value for key, value in attrs.items() if key in encoding_keys}
        attrs = {key: value for key, value in attrs.items() if key not in encoding_keys}
        return xr.Variable(dims=dims, data=marr, attrs=attrs, encoding=encoding)

    def parse_attribute(self, root) -> dict:
        attr = {}
        values = []
        # if multiple Value tags are present, store as "key": "[v1, v2, ...]"
        for r in root:
            values.append(r.text)
        attr[root.attrib["name"]] = values[0] if len(values) == 1 else str(values)
        return attr

    def parse_chunks(self, root, chunks: tuple) -> dict:
        chunkmanifest = {}
        for r in root.iterfind(self.dmr_namespace + "chunk"):
            chunk_pos = (
                np.zeros(len(chunks), dtype=int)
                if "chunkPositionInArray" not in r.attrib
                else np.asarray(ast.literal_eval(r.attrib["chunkPositionInArray"]))
            )
            chunk_num = (
                chunk_pos // chunks
            )  # [0,1023,10235] // [1, 1023, 2047] -> [0,1,5]
            chunk_key = ".".join(map(str, chunk_num))  # [0,0,1] -> "0.0.1"
            chunkmanifest[chunk_key] = {
                "path": self.data_filepath,
                "offset": int(r.attrib["offset"]),
                "length": int(r.attrib["nBytes"]),
            }
        return chunkmanifest
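
For context, a minimal sketch of how this parser could be exercised on its own, assuming a local dmr++ metadata file (the file name example.dmrpp is hypothetical and not part of this PR):

# Hypothetical usage sketch of the DMRParser added above; the file name is illustrative.
from virtualizarr.dmrpp import DMRParser

with open("example.dmrpp", "rb") as f:  # dmr++ sidecar describing a data granule
    parser = DMRParser(f.read())

vds = parser.parse_dataset()  # xarray.Dataset whose variables wrap ManifestArrays
print(vds)  # chunk offsets/lengths reference the original data file named in the dmr++, not the .dmrpp itself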
5 changes: 5 additions & 0 deletions virtualizarr/xarray.py
@@ -18,6 +18,7 @@
from xarray.core.variable import IndexVariable

import virtualizarr.kerchunk as kerchunk
from virtualizarr.dmrpp import DMRParser
from virtualizarr.kerchunk import FileType, KerchunkStoreRefs
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.zarr import (
@@ -99,6 +100,10 @@ def open_virtual_dataset(
        return open_virtual_dataset_from_v3_store(
            storepath=filepath, drop_variables=drop_variables, indexes=indexes
        )
    if filetype == "dmr++":
        with open(filepath, "rb") as f:
            parser = DMRParser(f.read())
        return parser.parse_dataset()
    else:
        # this is the only place we actually always need to use kerchunk directly
        # TODO avoid even reading byte ranges for variables that will be dropped later anyway?
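
Taken together, after this change a dmr++ file could be opened end to end roughly as follows (a minimal sketch; the path is hypothetical, and later commits in this PR change how the FileType check is performed):

# Hypothetical end-to-end use of the new code path.
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset("example.dmrpp", filetype="dmr++", indexes={})  # indexes={} skips building in-memory indexes
print(vds)  # virtual dataset built from the dmr++ metadata alone, no data bytes read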