diff --git a/Project.toml b/Project.toml index 8cd7ac8ab..0fd893a19 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ Compat = "34da2185-b29b-5c13-b0c7-acf172513d20" HDF5_jll = "0234f1f7-429e-5d53-9886-15a909be8d59" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" Mmap = "a63ad114-7e13-5084-954f-fe012c677804" +Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Requires = "ae029012-a4dd-5104-9daa-d747884805df" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index bb1e676f5..c7810b196 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -128,7 +128,7 @@ uuid = "f6f2d980-1ec6-471c-a70d-0270e22f1103" version = "0.1.0" [[deps.HDF5]] -deps = ["Compat", "HDF5_jll", "Libdl", "Mmap", "Random", "Requires", "UUIDs"] +deps = ["Compat", "HDF5_jll", "Libdl", "Mmap", "Printf", "Random", "Requires", "UUIDs"] path = ".." uuid = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" version = "0.16.13" diff --git a/docs/src/api_bindings.md b/docs/src/api_bindings.md index 2a6a5f000..ad58d0f15 100644 --- a/docs/src/api_bindings.md +++ b/docs/src/api_bindings.md @@ -89,6 +89,7 @@ h5a_write --- ## [[`H5D`](https://portal.hdfgroup.org/display/HDF5/Datasets) — Dataset Interface](@id H5D) +- [`h5d_chunk_iter`](@ref h5d_chunk_iter) - [`h5d_close`](@ref h5d_close) - [`h5d_create`](@ref h5d_create) - [`h5d_create_anon`](@ref h5d_create_anon) @@ -119,6 +120,7 @@ h5a_write - [`h5d_write`](@ref h5d_write) - [`h5d_write_chunk`](@ref h5d_write_chunk) ```@docs +h5d_chunk_iter h5d_close h5d_create h5d_create_anon diff --git a/docs/src/interface/dataset.md b/docs/src/interface/dataset.md index c488dcbf7..5d1cc807a 100644 --- a/docs/src/interface/dataset.md +++ b/docs/src/interface/dataset.md @@ -7,11 +7,15 @@ CurrentModule = HDF5 Many dataset operations are available through the indexing interface, which is aliased to the functional interface. Below describes the functional interface. ```@docs +Dataset create_dataset Base.copyto! Base.similar create_external_dataset get_datasets +open_dataset +write_dataset +read_dataset ``` ## Chunks @@ -20,10 +24,21 @@ get_datasets do_read_chunk do_write_chunk get_chunk_index +get_chunk_info_all get_chunk_length get_chunk_offset get_num_chunks get_num_chunks_per_dim read_chunk write_chunk -``` \ No newline at end of file +``` + +### Private Implementation + +These functions select private implementations of the public high-level API. +They should be used for diagnostic purposes only. 
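+
+The public entry point [`get_chunk_info_all`](@ref) chooses between these implementations automatically; a minimal sketch (assuming `dset` is an already-created chunked dataset):
+
+```julia
+info = HDF5.get_chunk_info_all(dset)
+offsets = [ci.offset for ci in info]   # chunk offsets in element coordinates
+```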
+ +```@docs +_get_chunk_info_all_by_index +_get_chunk_info_all_by_iter +``` diff --git a/docs/src/interface/datatype.md b/docs/src/interface/datatype.md new file mode 100644 index 000000000..7ac88bf7a --- /dev/null +++ b/docs/src/interface/datatype.md @@ -0,0 +1,9 @@ +# Datatypes + +```@meta +CurrentModule = HDF5 +``` + +```@docs +Datatype +``` diff --git a/docs/src/interface/files.md b/docs/src/interface/files.md index 74918d6df..2cadc042a 100644 --- a/docs/src/interface/files.md +++ b/docs/src/interface/files.md @@ -8,5 +8,6 @@ CurrentModule = HDF5 h5open ishdf5 Base.isopen +Base.read start_swmr_write ``` diff --git a/gen/api_defs.jl b/gen/api_defs.jl index e44c20f39..ede6d6d24 100644 --- a/gen/api_defs.jl +++ b/gen/api_defs.jl @@ -59,6 +59,7 @@ ### Dataset Interface ### +@bind h5d_chunk_iter(dset_id::hid_t, dxpl_id::hid_t, cb::Ptr{Nothing}, op_data::Any)::herr_t "Error iterating over chunks" (v"1.12.3", nothing) @bind h5d_close(dataset_id::hid_t)::herr_t "Error closing dataset" @bind h5d_create2(loc_id::hid_t, pathname::Cstring, dtype_id::hid_t, space_id::hid_t, lcpl_id::hid_t, dcpl_id::hid_t, dapl_id::hid_t)::hid_t string("Error creating dataset ", h5i_get_name(loc_id), "/", pathname) @bind h5d_create_anon(loc_id::hid_t, type_id::hid_t, space_id::hid_t, dcpl_id::hid_t, dapl_id::hid_t)::hid_t "Error in creating anonymous dataset" diff --git a/src/HDF5.jl b/src/HDF5.jl index fb95abfa1..783dd25b5 100644 --- a/src/HDF5.jl +++ b/src/HDF5.jl @@ -6,6 +6,7 @@ using Mmap: Mmap # needed for filter(f, tuple) in julia 1.3 using Compat using UUIDs: uuid4 +using Printf: @sprintf ### PUBLIC API ### diff --git a/src/api/functions.jl b/src/api/functions.jl index e8086d7a0..4e295df1e 100644 --- a/src/api/functions.jl +++ b/src/api/functions.jl @@ -443,6 +443,24 @@ function h5a_write(attr_hid, mem_type_id, buf) return nothing end +@static if v"1.12.3" ≤ _libhdf5_build_ver + @doc """ + h5d_chunk_iter(dset_id::hid_t, dxpl_id::hid_t, cb::Ptr{Nothing}, op_data::Any) + + See `libhdf5` documentation for [`H5Dchunk_iter`](https://portal.hdfgroup.org/display/HDF5/H5D_CHUNK_ITER). + """ + function h5d_chunk_iter(dset_id, dxpl_id, cb, op_data) + lock(liblock) + var"#status#" = try + ccall((:H5Dchunk_iter, libhdf5), herr_t, (hid_t, hid_t, Ptr{Nothing}, Any), dset_id, dxpl_id, cb, op_data) + finally + unlock(liblock) + end + var"#status#" < 0 && @h5error("Error iterating over chunks") + return nothing + end +end + """ h5d_close(dataset_id::hid_t) diff --git a/src/api/helpers.jl b/src/api/helpers.jl index f20b444c8..441c1aa4e 100644 --- a/src/api/helpers.jl +++ b/src/api/helpers.jl @@ -174,6 +174,52 @@ end end end +""" + h5d_chunk_iter(f, dataset, [dxpl_id=H5P_DEFAULT]) + +Call `f(offset::Ptr{hsize_t}, filter_mask::Cuint, addr::haddr_t, size::hsize_t)` for each chunk. +`dataset` may be an `HDF5.Dataset` or a dataset id. +`dxpl_id` is the dataset transfer property list and is optional. + +Available only for HDF5 1.10.10 and greater within the 1.10.x series, or for HDF5 1.12.3 and greater.
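+
+A minimal usage sketch (assuming `dset` is a chunked `HDF5.Dataset` in an open file and that the loaded libhdf5 provides `H5Dchunk_iter`):
+
+```julia
+addrs = HDF5.API.haddr_t[]
+HDF5.API.h5d_chunk_iter(dset) do offset, filter_mask, addr, size
+    push!(addrs, addr)             # record each chunk's byte offset in the file
+    return HDF5.API.H5_ITER_CONT   # continue; return H5_ITER_STOP to end early
+end
+```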
+""" +h5d_chunk_iter() = nothing + +@static if v"1.12.3" ≤ _libhdf5_build_ver || + (_libhdf5_build_ver.minor == 10 && _libhdf5_build_ver.patch >= 10) + # H5Dchunk_iter is first available in 1.10.10, 1.12.3, and 1.14.0 in the 1.10, 1.12, and 1.14 minor version series, respectively + function h5d_chunk_iter_helper( + offset::Ptr{hsize_t}, + filter_mask::Cuint, + addr::haddr_t, + size::hsize_t, + @nospecialize(data::Any) + )::H5_iter_t + func, err_ref = data + try + return convert(H5_iter_t, func(offset, filter_mask, addr, size)) + catch err + err_ref[] = err + return H5_ITER_ERROR + end + end + function h5d_chunk_iter(@nospecialize(f), dset_id, dxpl_id=H5P_DEFAULT) + err_ref = Ref{Any}(nothing) + fptr = @cfunction( + h5d_chunk_iter_helper, H5_iter_t, (Ptr{hsize_t}, Cuint, haddr_t, hsize_t, Any) + ) + try + return h5d_chunk_iter(dset_id, dxpl_id, fptr, (f, err_ref)) + catch h5err + jlerr = err_ref[] + if !isnothing(jlerr) + rethrow(jlerr) + end + rethrow(h5err) + end + end +end + """ h5d_get_space_status(dataset_id) diff --git a/src/api/types.jl b/src/api/types.jl index 70704d2cc..254970d8f 100644 --- a/src/api/types.jl +++ b/src/api/types.jl @@ -72,6 +72,12 @@ end H5_ITER_NATIVE = 2 H5_ITER_N = 3 end +@enum H5_iter_t::Cint begin + H5_ITER_CONT = 0 + H5_ITER_ERROR = -1 + H5_ITER_STOP = 1 +end +Base.convert(::Type{H5_iter_t}, x::Integer) = H5_iter_t(x) const H5O_iterate1_t = Ptr{Cvoid} const H5O_iterate2_t = Ptr{Cvoid} @@ -249,7 +255,7 @@ _read_const(sym::Symbol) = unsafe_load(cglobal(Libdl.dlsym(libhdf5handle[], sym) _has_symbol(sym::Symbol) = Libdl.dlsym(libhdf5handle[], sym; throw_error=false) !== nothing # iteration order constants -# Moved to H5_iter_t enum +# Moved to H5_iter_order_t enum #const H5_ITER_UNKNOWN = -1 #const H5_ITER_INC = 0 #const H5_ITER_DEC = 1 diff --git a/src/api_midlevel.jl b/src/api_midlevel.jl index 4efe7968f..4bbcac19f 100644 --- a/src/api_midlevel.jl +++ b/src/api_midlevel.jl @@ -148,7 +148,7 @@ end Helper method to read chunks via 0-based integer `index`. -Argument `buf` is optional and defaults to a `Vector{UInt8}` of length determined by `HDF5.h5d_get_chunk_info`. +Argument `buf` is optional and defaults to a `Vector{UInt8}` of length determined by `HDF5.API.h5d_get_chunk_info`. Argument `dxpl_id` can be supplied a keyword and defaults to `HDF5.API.H5P_DEFAULT`. Argument `filters` can be retrieved by supplying a `Ref{UInt32}` value via a keyword argument. diff --git a/src/datasets.jl b/src/datasets.jl index 71f8b6964..6d6c88ea5 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -3,8 +3,12 @@ # Get the dataspace of a dataset dataspace(dset::Dataset) = Dataspace(API.h5d_get_space(checkvalid(dset))) -# Open Dataset +""" + open_dataset(parent::Union{File, Group}, name::AbstractString, [dapl, dxpl]) +Open a dataset and return a [`HDF5.Dataset`](@ref) handle. Alternatively, just index +a file or group with `name`. +""" open_dataset( parent::Union{File,Group}, name::AbstractString, @@ -112,6 +116,14 @@ create_dataset( # Get the datatype of a dataset datatype(dset::Dataset) = Datatype(API.h5d_get_type(checkvalid(dset)), file(dset)) +""" + read_dataset(parent::Union{File,Group}, name::AbstractString) + +Read the dataset named `name` from `parent`. This will typically return an array. +The dataset will be opened, read, and closed.
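+
+A short sketch (assuming "data.h5" is an existing HDF5 file containing a dataset "A"):
+
+```julia
+using HDF5
+A = h5open("data.h5", "r") do file
+    read_dataset(file, "A")   # opens, reads, and closes the dataset "A"
+end
+```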
+ +See also [`HDF5.open_dataset`](@ref), [`Base.read`](@ref) +""" function read_dataset(parent::Union{File,Group}, name::AbstractString) local ret obj = open_dataset(parent, name) @@ -275,6 +287,14 @@ function create_dataset( end # Create and write, closing the objects upon exit +""" + write_dataset(parent::Union{File,Group}, name::Union{AbstractString,Nothing}, data; pv...) + +Create and write a dataset with `data`. Keywords are forwarded to [`create_dataset`](@ref). +Providing `nothing` as the name will create an anonymous dataset. + +See also [`create_dataset`](@ref) +""" function write_dataset( parent::Union{File,Group}, name::Union{AbstractString,Nothing}, data; pv... ) @@ -735,6 +755,96 @@ function get_chunk(dset::Dataset) ret end +struct ChunkInfo{N} + offset::NTuple{N,Int} + filter_mask::Cuint + addr::API.haddr_t + size::API.hsize_t +end +function Base.show(io::IO, ::MIME"text/plain", info::Vector{<:ChunkInfo}) + print(io, typeof(info)) + println(io, " with $(length(info)) elements:") + println(io, "Offset \tFilter Mask \tAddress\tSize") + println(io, "----------\t--------------------------------\t-------\t----") + for ci in info + println( + io, + @sprintf("%10s", ci.offset), + "\t", + bitstring(ci.filter_mask), + "\t", + ci.addr, + "\t", + ci.size + ) + end +end + +""" + HDF5.get_chunk_info_all(dataset, [dxpl]) + +Obtain information on all the chunks in a dataset. Returns a +`Vector{ChunkInfo{N}}`. The fields of `ChunkInfo{N}` are +* offset - `NTuple{N, Int}` indicating the offset of the chunk in terms of elements, reversed to F-order +* filter_mask - Cuint, 32-bit flags indicating whether filters have been applied to the chunk +* addr - haddr_t, byte-offset of the chunk in the file +* size - hsize_t, size of the chunk in bytes +""" +function get_chunk_info_all(dataset, dxpl=API.H5P_DEFAULT) + @static if hasmethod(API.h5d_chunk_iter, Tuple{API.hid_t}) + return _get_chunk_info_all_by_iter(dataset, dxpl) + else + return _get_chunk_info_all_by_index(dataset, dxpl) + end +end + +""" + _get_chunk_info_all_by_iter(dataset, [dxpl]) + +Implementation of [`get_chunk_info_all`](@ref) via [`HDF5.API.h5d_chunk_iter`](@ref). + +We expect this will be faster, O(N), than using `h5d_get_chunk_info` since this allows us to iterate +through the chunks once. +""" +@inline function _get_chunk_info_all_by_iter(dataset, dxpl=API.H5P_DEFAULT) + ds = dataspace(dataset) + N = ndims(ds) + info = ChunkInfo{N}[] + num_chunks = get_num_chunks(dataset) + sizehint!(info, num_chunks) + API.h5d_chunk_iter(dataset, dxpl) do offset, filter_mask, addr, size + _offset = reverse(unsafe_load(Ptr{NTuple{N,API.hsize_t}}(offset))) + push!(info, ChunkInfo{N}(_offset, filter_mask, addr, size)) + return HDF5.API.H5_ITER_CONT + end + return info +end + +""" + _get_chunk_info_all_by_index(dataset, [dxpl]) + +Implementation of [`get_chunk_info_all`](@ref) via [`HDF5.API.h5d_get_chunk_info`](@ref). + +We expect this will be slower, O(N^2), than using `h5d_chunk_iter` since each call to `h5d_get_chunk_info` +iterates through the B-tree structure.
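+
+For diagnostics, the two private implementations can be compared directly; a sketch (assuming `dset` is a chunked dataset and the loaded libhdf5 provides `H5Dchunk_iter`):
+
+```julia
+infos_index = HDF5._get_chunk_info_all_by_index(dset)
+infos_iter  = HDF5._get_chunk_info_all_by_iter(dset)
+@assert infos_iter == infos_index   # both paths should describe the same chunks
+```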
+""" +@inline function _get_chunk_info_all_by_index(dataset, dxpl=API.H5P_DEFAULT) + ds = dataspace(dataset) + N = ndims(ds) + info = ChunkInfo{N}[] + num_chunks = get_num_chunks(dataset) + sizehint!(info, num_chunks) + for chunk_index in 0:(num_chunks - 1) + _info_nt = HDF5.API.h5d_get_chunk_info(dataset, chunk_index) + _offset = (reverse(_info_nt[:offset])...,) + filter_mask = _info_nt[:filter_mask] + addr = _info_nt[:addr] + size = _info_nt[:size] + push!(info, ChunkInfo{N}(_offset, filter_mask, addr, size)) + end + return info +end + # properties that require chunks in order to work (e.g. any filter) # values do not matter -- just needed to form a NamedTuple with the desired keys const chunked_props = (; compress=nothing, deflate=nothing, blosc=nothing, shuffle=nothing) diff --git a/src/readwrite.jl b/src/readwrite.jl index 361bf295e..0769746e3 100644 --- a/src/readwrite.jl +++ b/src/readwrite.jl @@ -12,6 +12,13 @@ end # Generic read functions +""" + read(parent::Union{HDF5.File, HDF5.Group}, name::AbstractString; pv...) + read(parent::Union{HDF5.File, HDF5.Group}, name::AbstractString => dt::HDF5.Datatype; pv...) + +Read a dataset or attribute from an HDF5 file or group identified by `name`. +Optionally, specify the [`HDF5.Datatype`](@ref) to be read. +""" function Base.read(parent::Union{File,Group}, name::AbstractString; pv...) obj = getindex(parent, name; pv...) val = read(obj) @@ -33,6 +40,11 @@ end # This infers the Julia type from the HDF5.Datatype. Specific file formats should provide their own read(dset). const DatasetOrAttribute = Union{Dataset,Attribute} +""" + read(obj::HDF5.DatasetOrAttribute) + +Read the data within a [`HDF5.Dataset`](@ref) or [`HDF5.Attribute`](@ref). +""" function Base.read(obj::DatasetOrAttribute) dtype = datatype(obj) T = get_jl_type(dtype) diff --git a/src/types.jl b/src/types.jl index 84a0204b5..c25f61b1b 100644 --- a/src/types.jl +++ b/src/types.jl @@ -5,7 +5,13 @@ # Supertype of HDF5.File, HDF5.Group, JldFile, JldGroup, Matlabv5File, and MatlabHDF5File. abstract type H5DataStore end -# Read a list of variables, read(parent, "A", "B", "x", ...) +""" + read(parent::H5DataStore) + read(parent::H5DataStore, names...) + +Read a list of variables, read(parent, "A", "B", "x", ...). +If no variables are specified, read every variable in the file. +""" function Base.read(parent::H5DataStore, name::AbstractString...) tuple((read(parent, x) for x in name)...) end @@ -62,6 +68,11 @@ end Base.cconvert(::Type{API.hid_t}, g::Group) = g Base.unsafe_convert(::Type{API.hid_t}, g::Group) = g.id +""" + HDF5.Dataset + +A mutable wrapper for an HDF5 Dataset `HDF5.API.hid_t`. +""" mutable struct Dataset id::API.hid_t file::File @@ -76,6 +87,12 @@ end Base.cconvert(::Type{API.hid_t}, dset::Dataset) = dset Base.unsafe_convert(::Type{API.hid_t}, dset::Dataset) = dset.id +""" + HDF5.Datatype(id, toclose = true) + +Wrapper for an HDF5 datatype id. If `toclose` is true, the finalizer will close +the datatype.
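+
+A `Datatype` is usually obtained from an existing object rather than constructed from a raw id; a sketch (assuming `dset` is an open dataset):
+
+```julia
+dt = datatype(dset)         # HDF5.Datatype describing the dataset's on-disk type
+T  = HDF5.get_jl_type(dt)   # corresponding Julia element type
+```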
+""" mutable struct Datatype id::API.hid_t toclose::Bool diff --git a/test/api.jl b/test/api.jl index d3045b3ad..2e086ef57 100644 --- a/test/api.jl +++ b/test/api.jl @@ -103,3 +103,9 @@ end @assert false end end + +@testset "h5dchunk_iter" begin + @test convert(HDF5.API.H5_iter_t, 0) == HDF5.API.H5_ITER_CONT + @test convert(HDF5.API.H5_iter_t, 1) == HDF5.API.H5_ITER_STOP + @test convert(HDF5.API.H5_iter_t, -1) == HDF5.API.H5_ITER_ERROR +end diff --git a/test/chunkstorage.jl b/test/chunkstorage.jl index b8a9ef2db..82bff6ba8 100644 --- a/test/chunkstorage.jl +++ b/test/chunkstorage.jl @@ -66,6 +66,20 @@ using Test @test HDF5.get_chunk_index(d, (2, 5)) == 5 @test HDF5.get_chunk_index(d, (3, 4)) == 5 @test HDF5.get_chunk_index(d, (3, 5)) == 5 + # Test chunk iter + if v"1.12.3" ≤ HDF5.API._libhdf5_build_ver + infos = HDF5.get_chunk_info_all(d) + offsets = [info.offset for info in infos] + addrs = [info.addr for info in infos] + filter_masks = [info.filter_mask for info in infos] + sizes = [info.size for info in infos] + @test isempty( + setdiff(offsets, [(0, 0), (2, 0), (0, 2), (2, 2), (0, 4), (2, 4)]) + ) + @test length(unique(addrs)) == 6 + @test only(unique(filter_masks)) === UInt32(0) + @test only(unique(sizes)) == 4 * sizeof(Int) + end end # Test direct write chunk writing via linear indexing @@ -198,5 +212,31 @@ using Test f["dataset"][:, :] end == reshape(1:20, 4, 5) + # Test chunk info retrieval method performance + h5open(fn, "w") do f + d = create_dataset( + f, + "dataset", + datatype(UInt8), + dataspace(256, 256); + chunk=(16, 16), + alloc_time=:early + ) + if v"1.10.5" ≤ HDF5.API._libhdf5_build_ver + HDF5._get_chunk_info_all_by_index(d) + index_time = @elapsed infos_by_index = HDF5._get_chunk_info_all_by_index(d) + @test length(infos_by_index) == 256 + iob = IOBuffer() + show(iob, MIME"text/plain"(), infos_by_index) + seekstart(iob) + @test length(readlines(iob)) == 259 + if v"1.12.3" ≤ HDF5.API._libhdf5_build_ver + HDF5._get_chunk_info_all_by_iter(d) + iter_time = @elapsed infos_by_iter = HDF5._get_chunk_info_all_by_iter(d) + @test infos_by_iter == infos_by_index + @test iter_time < index_time + end + end + end rm(fn) end # testset "Raw Chunk I/O" diff --git a/test/filter.jl b/test/filter.jl index 9e0e3c973..1aeafbc6a 100644 --- a/test/filter.jl +++ b/test/filter.jl @@ -137,25 +137,34 @@ using HDF5.Filters: ExternalFilter, isavailable, isencoderenabled, isdecoderenab close(f) f = h5open(fn) - # Read datasets and test for equality - for name in keys(f) - ds = f[name] - @testset "$name" begin - @debug "Filter Dataset" HDF5.name(ds) - @test ds[] == data - filters = HDF5.get_create_properties(ds).filters - if startswith(name, "shuffle+") - @test filters[1] isa Shuffle - @test filters[2] isa compressionFilters[name[9:end]] - elseif haskey(compressionFilters, name) || name == "blosc_bitshuffle" - name = replace(name, r"_.*" => "") - @test filters[1] isa compressionFilters[name] + try + + # Read datasets and test for equality + for name in keys(f) + ds = f[name] + @testset "$name" begin + @debug "Filter Dataset" HDF5.name(ds) + @test ds[] == data + filters = HDF5.get_create_properties(ds).filters + if startswith(name, "shuffle+") + @test filters[1] isa Shuffle + @test filters[2] isa compressionFilters[name[9:end]] + elseif haskey(compressionFilters, name) || name == "blosc_bitshuffle" + name = replace(name, r"_.*" => "") + @test filters[1] isa compressionFilters[name] + end + + if v"1.12.3" ≤ HDF5.API._libhdf5_build_ver + infos = HDF5.get_chunk_info_all(ds) + filter_masks = 
[info.filter_mask for info in infos] + @test only(unique(filter_masks)) === UInt32(0) + end end end + finally + close(f) end - close(f) - # Test that reading a dataset with a missing filter has an informative error message. h5open(fn, "w") do f data = zeros(100, 100)