Skip to content

Commit

Permalink
Add read only ZipStore (#123)
Browse files Browse the repository at this point in the history
* Add zip storage

* more testing

* close zipfile in python

* fix spacing

* update version

* Update src/Storage/Storage.jl

Co-authored-by: Steve Kelly <[email protected]>

---------

Co-authored-by: Steve Kelly <[email protected]>
  • Loading branch information
nhz2 and sjkelly authored Feb 23, 2024
1 parent 1dda5f1 commit 8ad5dab
Show file tree
Hide file tree
Showing 7 changed files with 248 additions and 3 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
OpenSSL = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"

[compat]
AWSS3 = "0.10"
Expand All @@ -30,6 +31,7 @@ LRUCache = "1"
OffsetArrays = "0.11, 1.0"
OpenSSL = "1"
URIs = "1"
ZipArchives = "1"
julia = "1.2"

[extras]
Expand Down
5 changes: 3 additions & 2 deletions src/Storage/Storage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ abstract type AbstractStore end

#Define the interface
"""
storagesize(d::AbstractStore)
storagesize(d::AbstractStore, p::AbstractString)
This function shall return the size of all data files in a store.
This function shall return the size of all data files in a store at path `p`.
"""
function storagesize end

Expand Down Expand Up @@ -168,3 +168,4 @@ include("s3store.jl")
include("gcstore.jl")
include("consolidated.jl")
include("http.jl")
include("zipstore.jl")
97 changes: 97 additions & 0 deletions src/Storage/zipstore.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import ZipArchives

"""
ZipStore
A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file.
"""
struct ZipStore{T <: AbstractVector{UInt8}} <: AbstractStore
# Reader over the zip bytes; `T` is the type of the wrapped byte vector.
r::ZipArchives.ZipBufferReader{T}
end


# Convenience constructor: wrap raw zip bytes in a buffer reader and build the store from it.
function ZipStore(data::AbstractVector{UInt8})
    reader = ZipArchives.ZipBufferReader(data)
    return ZipStore(reader)
end

# Print a fixed, human-readable label for any ZipStore.
function Base.show(io::IO, ::ZipStore)
    print(io, "Read Only Zip Storage")
end

# Return the bytes stored under key `k`, or `nothing` when the archive has no such entry.
function Base.getindex(d::ZipStore, k::AbstractString)::Union{Nothing, Vector{UInt8}}
    entry = ZipArchives.zip_findlast_entry(d.r, k)
    # No matching entry: report the key as absent instead of throwing.
    entry === nothing && return nothing
    return ZipArchives.zip_readentry(d.r, entry)
end

# Turn a store path into a key prefix: the empty path and paths that already end
# in '/' are kept as they are, anything else gets a trailing '/'.
# The result is always converted to `String`.
function _make_prefix(p)::String
    if isempty(p) || endswith(p, '/')
        return p
    end
    return p * '/'
end

# Sum the uncompressed sizes of all entries below path `p`, leaving out the
# metadata files `.zattrs`, `.zarray` and `.zgroup`.
function storagesize(d::ZipStore, p)::Int64
    prefix::String = _make_prefix(p)
    # Accumulate in Int128; the declared return type converts the total to Int64.
    total::Int128 = Int128(0)
    for idx in 1:ZipArchives.zip_nentries(d.r)
        entryname = ZipArchives.zip_name(d.r, idx)
        startswith(entryname, prefix) || continue
        # Only the last path component decides whether this is a metadata file.
        lastpart = last(split(entryname, '/'))
        in(lastpart, (".zattrs", ".zarray", ".zgroup")) && continue
        total += ZipArchives.zip_uncompressed_size(d.r, idx)
    end
    return total
end

# List the names of the immediate sub-directories of path `p`.
# A sub-directory is the first path component of any non-directory entry that
# lies at least one level below `p`. Names are gathered in a Set so each one is
# reported once; the order of the returned vector is unspecified.
function subdirs(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    o = Set{String}()
    for i in 1:ZipArchives.zip_nentries(d.r)
        name = ZipArchives.zip_name(d.r, i)
        # Skip entries outside `p` and explicit directory entries (trailing '/').
        if startswith(name, prefix) && !endswith(name, '/')
            chopped_name = SubString(name, 1 + ncodeunits(prefix))
            # A remaining '/' means the entry lives inside a sub-directory of `p`.
            if '/' in chopped_name
                push!(o, first(split(chopped_name, '/')))
            end
        end
    end
    return collect(o)
end
# List the keys stored directly at path `p`, i.e. non-directory entries with no
# further '/' after the prefix. Names are gathered in a Set so each one is
# reported once; the order of the returned vector is unspecified.
function subkeys(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    o = Set{String}()
    for i in 1:ZipArchives.zip_nentries(d.r)
        name = ZipArchives.zip_name(d.r, i)
        # Skip entries outside `p` and explicit directory entries (trailing '/').
        if startswith(name, prefix) && !endswith(name, '/')
            chopped_name = SubString(name, 1 + ncodeunits(prefix))
            # No remaining '/' means the entry sits directly at this level.
            if !('/' in chopped_name)
                push!(o, chopped_name)
            end
        end
    end
    return collect(o)
end

# Zip archives are generally append only
# so it doesn't quite work to make ZipStore writable.
# The idea is if you want a zipfile, you should first use one of the
# regular mutable stores, then save it to a zip archive.
"""
writezip(io::IO, s::AbstractStore, p)
Write an AbstractStore to an IO as a zip archive.
"""
function writezip(io::IO, s::AbstractStore, p=""; kwargs...)
    # Keyword arguments are handed on to the ZipWriter unchanged; the path is
    # normalised to a `String` before the recursive copy starts.
    return ZipArchives.ZipWriter(io; kwargs...) do writer
        _writezip(writer, s, String(p))
    end
end
# Recursively copy everything below path `p` of store `s` into the zip writer `w`:
# first the keys directly at `p`, then each sub-directory in turn.
function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String)
    prefix = _make_prefix(p)
    for key in subkeys(s, p)
        entryname = prefix * key
        bytes = s[entryname]
        # A key whose lookup yields `nothing` is skipped.
        bytes === nothing && continue
        ZipArchives.zip_writefile(w, entryname, bytes)
    end
    for dir in subdirs(s, p)
        _writezip(w, s, prefix * dir)
    end
end
1 change: 1 addition & 0 deletions src/ZGroup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ function zcreate(::Type{T},g::ZGroup, name::AbstractString, addargs...; kwargs..
end

# Serve an array or group by delegating to the method for its storage and path.
function HTTP.serve(s::Union{ZArray,ZGroup}, args...; kwargs...)
    return HTTP.serve(s.storage, s.path, args...; kwargs...)
end
# Write an array or group as a zip archive by delegating to the method for its storage and path.
function writezip(io::IO, s::Union{ZArray,ZGroup}; kwargs...)
    return writezip(io, s.storage, s.path; kwargs...)
end
function consolidate_metadata(z::Union{ZArray,ZGroup})
z.writeable || throw(Base.IOError("Zarr group is not writeable. Please re-open in write mode to create an array",0))
consolidate_metadata(z.storage,z.path)
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Minio = "4281f0d9-7ae0-406e-9172-b7277c1efa20"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
44 changes: 43 additions & 1 deletion test/python.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
###
@testset "Python zarr implementation" begin

import Mmap
using PyCall
import PyCall: @py_str
#If we are on conda, import zarr
Expand Down Expand Up @@ -48,10 +49,16 @@ for t in dtypes, co in compressors
a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp)
a[] = testzerodimarrays[t]
end
#Also save as zip file.
open(pjulia*".zip";write=true) do io
Zarr.writezip(io, g)
end

# Test reading in python
for julia_path in (pjulia, pjulia*".zip")
py"""
import zarr
g = zarr.open_group($pjulia)
g = zarr.open_group($julia_path)
gatts = g.attrs
"""

Expand Down Expand Up @@ -111,6 +118,10 @@ for i=1:length(dtypes), co in compressors
@test py"ar.shape" == ()
@test convert(t, py"ar[()]") == testzerodimarrays[t]
end
py"""
g.store.close()
"""
end

## Now the other way around, we create a zarr array using the python lib and read back into julia
data = rand(Int32,2,6,10)
Expand Down Expand Up @@ -160,6 +171,37 @@ a1[:,1,1] = 1:10
@test a1[:,1,1] == 1:10
# Test reading the string array
@test String(g["a2"][:])=="hallo"


# Test zip file can be read
ppythonzip = ppython*".zip"
py"""
import numcodecs
import numpy as np
store = zarr.ZipStore($ppythonzip, mode="w")
g = zarr.group(store=store)
g.attrs["groupatt"] = "Hi"
z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
z1[:,:,:]=$data
z1.attrs["test"]={"b": 6}
z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
z2[:]=[k for k in 'hallo']
z3 = g.create_dataset('a3', shape=(2,), dtype=str)
z3[:]=np.asarray(['test1', 'test234'], dtype='O')
store.close()
"""

g = zopen(Zarr.ZipStore(Mmap.mmap(ppythonzip)))
@test g isa Zarr.ZGroup
@test g.attrs["groupatt"] == "Hi"
a1 = g["a1"]
@test a1 isa ZArray
@test a1[:,:,:]==permutedims(data,(3,2,1))
@test a1.attrs["test"]==Dict("b"=>6)
# Test reading the string array
@test String(g["a2"][:])=="hallo"
@test g["a3"] == ["test1", "test234"]

end

@testset "Python datetime types" begin
Expand Down
101 changes: 101 additions & 0 deletions test/storage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,77 @@ function test_store_common(ds)
@test !Zarr.isemptysub(ds,"bar/")
end

"""
Function to test the interface of a read only AbstractStore. Every complete implementation should pass this test.
`converter` is a function that takes a Zarr.DictStore, and converts it to a read only store.
`closer` is a function that gets called to close the read only store.
"""
# The default `closer` is a plain anonymous no-op rather than `Returns(nothing)`,
# because `Returns` only exists from Julia 1.7 on.
function test_read_only_store_common(converter, closer=((args...) -> nothing))
    # The read only store is a snapshot: after every change to the mutable
    # DictStore `ds`, the old snapshot is closed and a fresh one is converted.
    ds = Zarr.DictStore()
    rs = converter(ds)
    @test !Zarr.is_zgroup(rs,"")

    closer(rs)
    ds[".zgroup"]=rand(UInt8,50)
    rs = converter(ds)

    @test haskey(rs,".zgroup")

    @test Zarr.is_zgroup(rs,"")
    @test !Zarr.is_zarray(rs,"")

    @test isempty(Zarr.subdirs(rs,""))
    @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"]

    #Create a subgroup
    @test !Zarr.is_zarray(rs,"bar")

    closer(rs)
    ds["bar/.zarray"] = rand(UInt8,50)
    rs = converter(ds)

    @test Zarr.is_zarray(rs,"bar")
    @test Zarr.subdirs(rs,"") == ["bar"]
    @test Zarr.subdirs(rs,"bar") == String[]
    #Test getindex on data written through the mutable store
    data = rand(UInt8,50)

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    @test rs["bar/0.0.0"]==data
    @test Zarr.storagesize(rs,"bar")==50
    @test Zarr.isinitialized(rs,"bar/0.0.0")
    @test !Zarr.isinitialized(rs,"bar/0.0.1")

    closer(rs)
    Zarr.writeattrs(ds,"bar",Dict("a"=>"b"))
    rs = converter(ds)

    @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b")

    #A chunk deleted from the source must be absent from the next snapshot
    closer(rs)
    delete!(ds,"bar/0.0.0")
    rs = converter(ds)

    @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0)))
    @test !Zarr.isinitialized(rs,"bar/0.0.0")

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    #Add tests for empty storage
    @test Zarr.isemptysub(rs,"ba")
    @test Zarr.isemptysub(rs,"ba/")
    @test !Zarr.isemptysub(rs,"bar")
    @test !Zarr.isemptysub(rs,"bar/")
    closer(rs)
end

@testset "DirectoryStore" begin
A = fill(1.0, 30, 20)
chunks = (5,10)
Expand Down Expand Up @@ -145,6 +216,13 @@ end
@test g2.attrs == Dict("groupatt"=>5)
@test g2["a1"].attrs == Dict("arratt"=>2.5)
@test g2["a1"][:,:] == reshape(1:200,10,20)

# The following test doesn't pass, but maybe should?
# test_read_only_store_common() do ds
# # This converts a DictStore to a read only ConsolidatedStore HTTPStore
# @async HTTP.serve(ds,"",ip,port,server=server)
# Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"),"")
# end
close(server)
#Test server that returns 403 instead of 404 for missing chunks
server = Sockets.listen(0)
Expand All @@ -159,3 +237,26 @@ end
@test all(==(-1),g3["a"][:,:])
close(server)
end

@testset "Zip Storage" begin
    # Build a small group holding one chunked array in an in-memory store.
    memstore = Zarr.DictStore()
    grp = zgroup(memstore, attrs = Dict("groupatt"=>5))
    arr = zcreate(Int, grp, "a1", 10, 20, chunks=(5,5), attrs=Dict("arratt"=>2.5))
    arr .= reshape(1:200, 10, 20)

    # Serialise the group to zip bytes and reopen those bytes as a ZipStore.
    buf = IOBuffer()
    Zarr.writezip(buf, grp)
    zipbytes = take!(buf)
    zstore = Zarr.ZipStore(zipbytes)
    @test sprint(show, zstore) == "Read Only Zip Storage"

    # Attributes and data must survive the round trip.
    reopened = zopen(zstore)
    @test reopened.attrs == Dict("groupatt"=>5)
    @test reopened["a1"].attrs == Dict("arratt"=>2.5)
    @test reopened["a1"][:,:] == reshape(1:200, 10, 20)

    test_read_only_store_common() do ds
        # This converts a DictStore to a read only ZipStore
        zipio = IOBuffer()
        Zarr.writezip(zipio, ds)
        Zarr.ZipStore(take!(zipio))
    end
end

0 comments on commit 8ad5dab

Please sign in to comment.