diff --git a/Project.toml b/Project.toml
index b93f048..6ca0fa2 100644
--- a/Project.toml
+++ b/Project.toml
@@ -17,6 +17,7 @@ OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
 OpenSSL = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
+ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"
 
 [compat]
 AWSS3 = "0.10"
@@ -30,6 +31,7 @@ LRUCache = "1"
 OffsetArrays = "0.11, 1.0"
 OpenSSL = "1"
 URIs = "1"
+ZipArchives = "1"
 julia = "1.2"
 
 [extras]
diff --git a/src/Storage/Storage.jl b/src/Storage/Storage.jl
index d1c6914..e6355b6 100644
--- a/src/Storage/Storage.jl
+++ b/src/Storage/Storage.jl
@@ -5,9 +5,9 @@ abstract type AbstractStore end
 
 #Define the interface
 """
-  storagesize(d::AbstractStore)
+  storagesize(d::AbstractStore, p::AbstractString)
 
-This function shall return the size of all data files in a store.
+This function shall return the size of all data files in a store at path `p`.
 """
 function storagesize end
 
@@ -168,3 +168,4 @@ include("s3store.jl")
 include("gcstore.jl")
 include("consolidated.jl")
 include("http.jl")
+include("zipstore.jl")
diff --git a/src/Storage/zipstore.jl b/src/Storage/zipstore.jl
new file mode 100644
index 0000000..8e8bbd2
--- /dev/null
+++ b/src/Storage/zipstore.jl
@@ -0,0 +1,111 @@
+import ZipArchives
+
+"""
+    ZipStore
+
+A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file.
+"""
+struct ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore
+    r::ZipArchives.ZipBufferReader{T}
+end
+
+
+ZipStore(data::AbstractVector{UInt8}) = ZipStore(ZipArchives.ZipBufferReader(data))
+
+Base.show(io::IO,::ZipStore) = print(io,"Read Only Zip Storage")
+
+# Return the content of entry `k`, or `nothing` when the archive has no entry of that name.
+function Base.getindex(d::ZipStore, k::AbstractString)::Union{Nothing, Vector{UInt8}}
+    i = ZipArchives.zip_findlast_entry(d.r, k)
+    if isnothing(i)
+        nothing
+    else
+        ZipArchives.zip_readentry(d.r, i)
+    end
+end
+
+# Turn a path into a key prefix: append '/' unless `p` is empty or already ends with '/'.
+_make_prefix(p)::String =(isempty(p) || endswith(p,'/')) ? p : p*'/'
+
+# Sum the uncompressed sizes of all entries whose name starts with the prefix of `p`,
+# leaving out the metadata files ".zattrs", ".zarray" and ".zgroup".
+function storagesize(d::ZipStore, p)::Int64
+    prefix::String = _make_prefix(p)
+    s::Int128 = Int128(0)
+    for i in 1:ZipArchives.zip_nentries(d.r)
+        name = ZipArchives.zip_name(d.r, i)
+        if startswith(name, prefix)
+            filename = last(split(name, '/'))
+            if !in(filename,(".zattrs",".zarray",".zgroup"))
+                s += ZipArchives.zip_uncompressed_size(d.r, i)
+            end
+        end
+    end
+    s
+end
+
+# Call `f` on the name, relative to `p`, of every entry below `p`.
+# Entries whose name ends with '/' are skipped.
+function _foreach_relative_name(f, d::ZipStore, p)
+    prefix::String = _make_prefix(p)
+    for i in 1:ZipArchives.zip_nentries(d.r)
+        name = ZipArchives.zip_name(d.r, i)
+        if startswith(name, prefix) && !endswith(name, '/')
+            f(SubString(name, 1+ncodeunits(prefix)))
+        end
+    end
+    nothing
+end
+
+# Names of the directories directly below `p`, each listed once.
+function subdirs(d::ZipStore, p)::Vector{String}
+    o = Set{String}()
+    _foreach_relative_name(d, p) do chopped_name
+        if '/' ∈ chopped_name
+            push!(o, first(split(chopped_name, '/')))
+        end
+    end
+    collect(o)
+end
+
+# Names of the entries located directly in `p`, each listed once.
+function subkeys(d::ZipStore, p)::Vector{String}
+    o = Set{String}()
+    _foreach_relative_name(d, p) do chopped_name
+        if '/' ∉ chopped_name
+            push!(o, chopped_name)
+        end
+    end
+    collect(o)
+end
+
+# Zip archives are generally append only
+# so it doesn't quite work to make ZipStore writable.
+# The idea is if you want a zipfile, you should first use one of the
+# regular mutable stores, then save it to a zip archive.
+"""
+    writezip(io::IO, s::AbstractStore, p=""; kwargs...)
+
+Write the content of `s` found under path `p` to `io` as a zip archive.
+Entries are stored under their full key in `s`, including `p`.
+`kwargs` are forwarded to `ZipArchives.ZipWriter`.
+"""
+function writezip(io::IO, s::AbstractStore, p=""; kwargs...)
+    ZipArchives.ZipWriter(io; kwargs...) do w
+        _writezip(w, s, String(p))
+    end
+end
+
+# Write the keys directly in `p`, then recurse into each sub-directory of `p`.
+function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String)
+    for subkey in subkeys(s, p)
+        fullname = _make_prefix(p)*subkey
+        data = getindex(s, fullname)
+        if !isnothing(data)
+            ZipArchives.zip_writefile(w, fullname, data)
+        end
+    end
+    for subdir in subdirs(s, p)
+        _writezip(w, s, _make_prefix(p)*subdir)
+    end
+end
diff --git a/src/ZGroup.jl b/src/ZGroup.jl
index 8bc54be..acecf2b 100644
--- a/src/ZGroup.jl
+++ b/src/ZGroup.jl
@@ -156,6 +156,7 @@ function zcreate(::Type{T},g::ZGroup, name::AbstractString, addargs...; kwargs..
 end
 
 HTTP.serve(s::Union{ZArray,ZGroup}, args...; kwargs...) = HTTP.serve(s.storage, s.path, args...; kwargs...)
+writezip(io::IO, s::Union{ZArray,ZGroup}; kwargs...) = writezip(io, s.storage, s.path; kwargs...)
 function consolidate_metadata(z::Union{ZArray,ZGroup})
   z.writeable || throw(Base.IOError("Zarr group is not writeable. Please re-open in write mode to create an array",0))
   consolidate_metadata(z.storage,z.path)
diff --git a/test/Project.toml b/test/Project.toml
index 352bb8c..008ae2a 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -3,6 +3,7 @@ Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 Minio = "4281f0d9-7ae0-406e-9172-b7277c1efa20"
+Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/test/python.jl b/test/python.jl
index 8124e6e..86a72ca 100644
--- a/test/python.jl
+++ b/test/python.jl
@@ -5,6 +5,7 @@
 ###
 @testset "Python zarr implementation" begin
 
+import Mmap
 using PyCall
 import PyCall: @py_str
 #If we are on conda, import zarr
@@ -48,10 +49,16 @@ for t in dtypes, co in compressors
     a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp)
     a[] = testzerodimarrays[t]
 end
+#Also save as zip file.
+open(pjulia*".zip";write=true) do io
+    Zarr.writezip(io, g)
+end
+
 # Test reading in python
+for julia_path in (pjulia, pjulia*".zip")
 py"""
 import zarr
-g = zarr.open_group($pjulia)
+g = zarr.open_group($julia_path)
 gatts = g.attrs
 """
 
@@ -111,6 +118,10 @@ for i=1:length(dtypes), co in compressors
     @test py"ar.shape" == ()
     @test convert(t, py"ar[()]") == testzerodimarrays[t]
 end
+py"""
+g.store.close()
+"""
+end
 
 ## Now the other way around, we create a zarr array using the python lib and read back into julia
 data = rand(Int32,2,6,10)
@@ -160,6 +171,37 @@ a1[:,1,1] = 1:10
 @test a1[:,1,1] == 1:10
 # Test reading the string array
 @test String(g["a2"][:])=="hallo"
+
+
+# Test zip file can be read
+ppythonzip = ppython*".zip"
+py"""
+import numcodecs
+import numpy as np
+store = zarr.ZipStore($ppythonzip, mode="w")
+g = zarr.group(store=store)
+g.attrs["groupatt"] = "Hi"
+z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
+z1[:,:,:]=$data
+z1.attrs["test"]={"b": 6}
+z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
+z2[:]=[k for k in 'hallo']
+z3 = g.create_dataset('a3', shape=(2,), dtype=str)
+z3[:]=np.asarray(['test1', 'test234'], dtype='O')
+store.close()
+"""
+
+g = zopen(Zarr.ZipStore(Mmap.mmap(ppythonzip)))
+@test g isa Zarr.ZGroup
+@test g.attrs["groupatt"] == "Hi"
+a1 = g["a1"]
+@test a1 isa ZArray
+@test a1[:,:,:]==permutedims(data,(3,2,1))
+@test a1.attrs["test"]==Dict("b"=>6)
+# Test reading the string array
+@test String(g["a2"][:])=="hallo"
+@test g["a3"] == ["test1", "test234"]
+
 end
 
 @testset "Python datetime types" begin
diff --git a/test/storage.jl b/test/storage.jl
index 1dde7b8..0fd9d0c 100644
--- a/test/storage.jl
+++ b/test/storage.jl
@@ -49,6 +49,77 @@ function test_store_common(ds)
   @test !Zarr.isemptysub(ds,"bar/")
 end
 
+"""
+Function to test the interface of a read only AbstractStore. Every complete implementation should pass this test.
+
+`converter` is a function that takes a Zarr.DictStore, and converts it to a read only store.
+
+`closer` is a function that gets called to close the read only store. It defaults to a function that does nothing.
+"""
+function test_read_only_store_common(converter, closer=(_ -> nothing))
+    ds = Zarr.DictStore()
+    rs = converter(ds)
+    @test !Zarr.is_zgroup(rs,"")
+
+    closer(rs)
+    ds[".zgroup"]=rand(UInt8,50)
+    rs = converter(ds)
+
+    @test haskey(rs,".zgroup")
+
+    @test Zarr.is_zgroup(rs,"")
+    @test !Zarr.is_zarray(rs,"")
+
+    @test isempty(Zarr.subdirs(rs,""))
+    @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"]
+
+    #Add array metadata below the root group
+    @test !Zarr.is_zarray(rs,"bar")
+
+    closer(rs)
+    ds["bar/.zarray"] = rand(UInt8,50)
+    rs = converter(ds)
+
+    @test Zarr.is_zarray(rs,"bar")
+    @test Zarr.subdirs(rs,"") == ["bar"]
+    @test Zarr.subdirs(rs,"bar") == String[]
+    #Test getindex (data is written through the DictStore, the read only store is rebuilt)
+    data = rand(UInt8,50)
+
+    closer(rs)
+    ds["bar/0.0.0"] = data
+    rs = converter(ds)
+
+    @test rs["bar/0.0.0"]==data
+    @test Zarr.storagesize(rs,"bar")==50
+    @test Zarr.isinitialized(rs,"bar/0.0.0")
+    @test !Zarr.isinitialized(rs,"bar/0.0.1")
+
+    closer(rs)
+    Zarr.writeattrs(ds,"bar",Dict("a"=>"b"))
+    rs = converter(ds)
+
+    @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b")
+
+    closer(rs)
+    delete!(ds,"bar/0.0.0")
+    rs = converter(ds)
+
+    @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0)))
+    @test !Zarr.isinitialized(rs,"bar/0.0.0")
+
+    closer(rs)
+    ds["bar/0.0.0"] = data
+    rs = converter(ds)
+
+    #Add tests for empty storage
+    @test Zarr.isemptysub(rs,"ba")
+    @test Zarr.isemptysub(rs,"ba/")
+    @test !Zarr.isemptysub(rs,"bar")
+    @test !Zarr.isemptysub(rs,"bar/")
+    closer(rs)
+end
+
 @testset "DirectoryStore" begin
   A = fill(1.0, 30, 20)
   chunks = (5,10)
@@ -145,5 +216,35 @@ end
   @test g2.attrs == Dict("groupatt"=>5)
   @test g2["a1"].attrs == Dict("arratt"=>2.5)
   @test g2["a1"][:,:] == reshape(1:200,10,20)
+
+  # The following test doesn't pass, but maybe should?
+  # test_read_only_store_common() do ds
+  #   # This converts a DictStore to a read only ConsolidatedStore HTTPStore
+  #   @async HTTP.serve(ds,"",ip,port,server=server)
+  #   Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"),"")
+  # end
   close(server)
 end
+
+@testset "Zip Storage" begin
+    s = Zarr.DictStore()
+    g = zgroup(s, attrs = Dict("groupatt"=>5))
+    a = zcreate(Int,g,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5))
+    a .= reshape(1:200,10,20)
+    io = IOBuffer()
+    Zarr.writezip(io, g)
+    data = take!(io)
+    ds = Zarr.ZipStore(data)
+    @test sprint(show, ds) == "Read Only Zip Storage"
+    g2 = zopen(ds)
+    @test g2.attrs == Dict("groupatt"=>5)
+    @test g2["a1"].attrs == Dict("arratt"=>2.5)
+    @test g2["a1"][:,:] == reshape(1:200,10,20)
+
+    test_read_only_store_common() do ds
+        # This converts a DictStore to a read only ZipStore
+        io = IOBuffer()
+        Zarr.writezip(io, ds)
+        Zarr.ZipStore(take!(io))
+    end
+end