Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add read only ZipStore #123

Merged
merged 6 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
OpenSSL = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"
ZipArchives = "49080126-0e18-4c2a-b176-c102e4b3760c"

[compat]
AWSS3 = "0.10"
Expand All @@ -30,6 +31,7 @@ LRUCache = "1"
OffsetArrays = "0.11, 1.0"
OpenSSL = "1"
URIs = "1"
ZipArchives = "1"
julia = "1.2"

[extras]
Expand Down
5 changes: 3 additions & 2 deletions src/Storage/Storage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ abstract type AbstractStore end

#Define the interface
"""
    storagesize(d::AbstractStore, p::AbstractString)

This function shall return the size of all data files in a store at path `p`.
"""
function storagesize end

Expand Down Expand Up @@ -168,3 +168,4 @@ include("s3store.jl")
include("gcstore.jl")
include("consolidated.jl")
include("http.jl")
include("zipstore.jl")
97 changes: 97 additions & 0 deletions src/Storage/zipstore.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import ZipArchives

"""
    ZipStore(data::AbstractVector{UInt8})

A read only store that wraps an `AbstractVector{UInt8}` that contains a zip file.

The bytes are wrapped in a `ZipArchives.ZipBufferReader`, which is kept in the field `r`.
"""
struct ZipStore{T<:AbstractVector{UInt8}} <: AbstractStore
    r::ZipArchives.ZipBufferReader{T}
end

# Convenience constructor: build the buffer reader from the raw archive bytes first.
function ZipStore(data::AbstractVector{UInt8})
    reader = ZipArchives.ZipBufferReader(data)
    return ZipStore(reader)
end

# Display: every ZipStore prints the same fixed label, independent of its contents.
function Base.show(io::IO, ::ZipStore)
    return print(io, "Read Only Zip Storage")
end

# Look up key `k` in the archive and return the entry's bytes.
# Returns `nothing` when `zip_findlast_entry` yields `nothing` for `k`.
function Base.getindex(d::ZipStore, k::AbstractString)::Union{Nothing, Vector{UInt8}}
    entry = ZipArchives.zip_findlast_entry(d.r, k)
    entry === nothing && return nothing
    return ZipArchives.zip_readentry(d.r, entry)
end

# Turn a path into a key prefix: an empty path or one already ending in '/' is
# returned unchanged, anything else gets a trailing '/'. The result is always a String.
function _make_prefix(p)::String
    if isempty(p) || endswith(p, '/')
        return p
    end
    return p * '/'
end

# Add up `zip_uncompressed_size` for every archive entry whose name starts with the
# prefix derived from `p`, skipping entries whose last path component is one of the
# metadata file names `.zattrs`, `.zarray` or `.zgroup`.
function storagesize(d::ZipStore, p)::Int64
    prefix::String = _make_prefix(p)
    # Wide accumulator; the return annotation converts the final sum to Int64.
    total::Int128 = 0
    for idx in 1:ZipArchives.zip_nentries(d.r)
        entryname = ZipArchives.zip_name(d.r, idx)
        startswith(entryname, prefix) || continue
        basepart = last(split(entryname, '/'))
        basepart in (".zattrs", ".zarray", ".zgroup") && continue
        total += ZipArchives.zip_uncompressed_size(d.r, idx)
    end
    return total
end

# Collect the distinct first path components found below path `p`, considering only
# entries whose name does not end in '/' and that contain a further '/' after the prefix.
function subdirs(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    skip = ncodeunits(prefix)
    found = Set{String}()
    for idx in 1:ZipArchives.zip_nentries(d.r)
        entryname = ZipArchives.zip_name(d.r, idx)
        (startswith(entryname, prefix) && !endswith(entryname, '/')) || continue
        rest = SubString(entryname, skip + 1)
        parts = split(rest, '/'; limit=2)
        # Two parts means `rest` contains a '/', so the entry sits inside a sub-directory.
        length(parts) == 2 && push!(found, first(parts))
    end
    return collect(found)
end
# Collect the distinct entry names that sit directly below path `p`: entries whose
# name does not end in '/' and has no further '/' after the prefix.
function subkeys(d::ZipStore, p)::Vector{String}
    prefix::String = _make_prefix(p)
    skip = ncodeunits(prefix)
    found = Set{String}()
    for idx in 1:ZipArchives.zip_nentries(d.r)
        entryname = ZipArchives.zip_name(d.r, idx)
        (startswith(entryname, prefix) && !endswith(entryname, '/')) || continue
        rest = SubString(entryname, skip + 1)
        # A remaining '/' means the entry belongs to a deeper level, not to `p` itself.
        occursin('/', rest) && continue
        push!(found, rest)
    end
    return collect(found)
end

# Zip archives are generally append only, so ZipStore is not made writable.
# To obtain a zip file, first fill one of the regular mutable stores and
# then save that store to a zip archive with `writezip`.
"""
    writezip(io::IO, s::AbstractStore, p=""; kwargs...)

Write the contents of the store `s` below path `p` to `io` as a zip archive.

All keyword arguments are forwarded to `ZipArchives.ZipWriter`.
"""
function writezip(io::IO, s::AbstractStore, p=""; kwargs...)
    # Function-first form of the do-block call: the writer is handed to `_writezip`.
    return ZipArchives.ZipWriter(zw -> _writezip(zw, s, String(p)), io; kwargs...)
end
# Recursively copy everything below path `p` of store `s` into the zip writer `w`.
function _writezip(w::ZipArchives.ZipWriter, s::AbstractStore, p::String)
    base = _make_prefix(p)
    # Keys directly under `p`; a key whose lookup yields `nothing` is skipped.
    for key in subkeys(s, p)
        path = base * key
        bytes = s[path]
        bytes === nothing || ZipArchives.zip_writefile(w, path, bytes)
    end
    # Then descend into each sub-directory of `p`.
    for dir in subdirs(s, p)
        _writezip(w, s, base * dir)
    end
end
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very nice interface idea to produce zip stores.

1 change: 1 addition & 0 deletions src/ZGroup.jl
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ function zcreate(::Type{T},g::ZGroup, name::AbstractString, addargs...; kwargs..
end

# Serve a ZArray/ZGroup by delegating to the store method with the object's storage and path.
function HTTP.serve(s::Union{ZArray,ZGroup}, args...; kwargs...)
    return HTTP.serve(s.storage, s.path, args...; kwargs...)
end
# Write a ZArray/ZGroup to `io` by delegating to the store method with the object's
# storage and path; keyword arguments are passed through unchanged.
function writezip(io::IO, s::Union{ZArray,ZGroup}; kwargs...)
    return writezip(io, s.storage, s.path; kwargs...)
end
function consolidate_metadata(z::Union{ZArray,ZGroup})
z.writeable || throw(Base.IOError("Zarr group is not writeable. Please re-open in write mode to create an array",0))
consolidate_metadata(z.storage,z.path)
Expand Down
1 change: 1 addition & 0 deletions test/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Minio = "4281f0d9-7ae0-406e-9172-b7277c1efa20"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Expand Down
44 changes: 43 additions & 1 deletion test/python.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
###
@testset "Python zarr implementation" begin

import Mmap
using PyCall
import PyCall: @py_str
#If we are on conda, import zarr
Expand Down Expand Up @@ -48,10 +49,16 @@ for t in dtypes, co in compressors
a = zcreate(t, g,string("azerodim",t,compstr), compressor=comp)
a[] = testzerodimarrays[t]
end
#Also save as zip file.
open(pjulia*".zip";write=true) do io
Zarr.writezip(io, g)
end

# Test reading in python
for julia_path in (pjulia, pjulia*".zip")
py"""
import zarr
g = zarr.open_group($pjulia)
g = zarr.open_group($julia_path)
gatts = g.attrs
"""

Expand Down Expand Up @@ -111,6 +118,10 @@ for i=1:length(dtypes), co in compressors
@test py"ar.shape" == ()
@test convert(t, py"ar[()]") == testzerodimarrays[t]
end
py"""
g.store.close()
"""
end

## Now the other way around, we create a zarr array using the python lib and read back into julia
data = rand(Int32,2,6,10)
Expand Down Expand Up @@ -160,6 +171,37 @@ a1[:,1,1] = 1:10
@test a1[:,1,1] == 1:10
# Test reading the string array
@test String(g["a2"][:])=="hallo"


# Test zip file can be read
ppythonzip = ppython*".zip"
py"""
import numcodecs
import numpy as np
store = zarr.ZipStore($ppythonzip, mode="w")
g = zarr.group(store=store)
g.attrs["groupatt"] = "Hi"
z1 = g.create_dataset("a1", shape=(2,6,10),chunks=(1,2,3), dtype='i4')
z1[:,:,:]=$data
z1.attrs["test"]={"b": 6}
z2 = g.create_dataset("a2", shape=(5,),chunks=(5,), dtype='S1', compressor=numcodecs.Zlib())
z2[:]=[k for k in 'hallo']
z3 = g.create_dataset('a3', shape=(2,), dtype=str)
z3[:]=np.asarray(['test1', 'test234'], dtype='O')
store.close()
"""

g = zopen(Zarr.ZipStore(Mmap.mmap(ppythonzip)))
@test g isa Zarr.ZGroup
@test g.attrs["groupatt"] == "Hi"
a1 = g["a1"]
@test a1 isa ZArray
@test a1[:,:,:]==permutedims(data,(3,2,1))
@test a1.attrs["test"]==Dict("b"=>6)
# Test reading the string array
@test String(g["a2"][:])=="hallo"
@test g["a3"] == ["test1", "test234"]

end

@testset "Python datetime types" begin
Expand Down
101 changes: 101 additions & 0 deletions test/storage.jl
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,77 @@ function test_store_common(ds)
@test !Zarr.isemptysub(ds,"bar/")
end

"""
    test_read_only_store_common(converter, closer = _ -> nothing)

Function to test the interface of a read only AbstractStore. Every complete implementation should pass this test.

`converter` is a function that takes a `Zarr.DictStore` and converts it to a read only store.

`closer` is a function that gets called with the read only store to close it. It defaults to a no-op.
"""
function test_read_only_store_common(converter, closer = _ -> nothing)
    # The default is a plain anonymous no-op rather than `Returns(nothing)`:
    # `Returns` only exists from Julia 1.7 on, older versions would fail here.
    ds = Zarr.DictStore()
    rs = converter(ds)
    @test !Zarr.is_zgroup(rs,"")

    # The read only store is a snapshot: after every change to `ds` the old
    # snapshot is closed and a fresh one is converted.
    closer(rs)
    ds[".zgroup"]=rand(UInt8,50)
    rs = converter(ds)

    @test haskey(rs,".zgroup")

    @test Zarr.is_zgroup(rs,"")
    @test !Zarr.is_zarray(rs,"")

    @test isempty(Zarr.subdirs(rs,""))
    @test sort(collect(Zarr.subkeys(rs,"")))==[".zgroup"]

    #Create a subgroup
    @test !Zarr.is_zarray(rs,"bar")

    closer(rs)
    ds["bar/.zarray"] = rand(UInt8,50)
    rs = converter(ds)

    @test Zarr.is_zarray(rs,"bar")
    @test Zarr.subdirs(rs,"") == ["bar"]
    @test Zarr.subdirs(rs,"bar") == String[]
    #Test getindex (data is written through the mutable source store)
    data = rand(UInt8,50)

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    @test rs["bar/0.0.0"]==data
    @test Zarr.storagesize(rs,"bar")==50
    @test Zarr.isinitialized(rs,"bar/0.0.0")
    @test !Zarr.isinitialized(rs,"bar/0.0.1")

    closer(rs)
    Zarr.writeattrs(ds,"bar",Dict("a"=>"b"))
    rs = converter(ds)

    @test Zarr.getattrs(rs,"bar")==Dict("a"=>"b")

    closer(rs)
    delete!(ds,"bar/0.0.0")
    rs = converter(ds)

    @test !Zarr.isinitialized(rs,"bar",CartesianIndex((0,0,0)))
    @test !Zarr.isinitialized(rs,"bar/0.0.0")

    closer(rs)
    ds["bar/0.0.0"] = data
    rs = converter(ds)

    #Add tests for empty storage
    @test Zarr.isemptysub(rs,"ba")
    @test Zarr.isemptysub(rs,"ba/")
    @test !Zarr.isemptysub(rs,"bar")
    @test !Zarr.isemptysub(rs,"bar/")
    closer(rs)
end

@testset "DirectoryStore" begin
A = fill(1.0, 30, 20)
chunks = (5,10)
Expand Down Expand Up @@ -145,5 +216,35 @@ end
@test g2.attrs == Dict("groupatt"=>5)
@test g2["a1"].attrs == Dict("arratt"=>2.5)
@test g2["a1"][:,:] == reshape(1:200,10,20)

# The following test doesn't pass, but maybe should?
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I will check this later. Maybe we could make a broken test from it, let's see

# test_read_only_store_common() do ds
# # This converts a DictStore to a read only ConsolidatedStore HTTPStore
# @async HTTP.serve(ds,"",ip,port,server=server)
# Zarr.ConsolidatedStore(Zarr.HTTPStore("http://$ip:$port"),"")
# end
close(server)
end

@testset "Zip Storage" begin
    # Build a small group in memory, export it as a zip archive and read it back.
    source = Zarr.DictStore()
    grp = zgroup(source, attrs = Dict("groupatt"=>5))
    arr = zcreate(Int,grp,"a1",10,20,chunks=(5,5),attrs=Dict("arratt"=>2.5))
    arr .= reshape(1:200,10,20)
    buf = IOBuffer()
    Zarr.writezip(buf, grp)
    zipbytes = take!(buf)
    zipstore = Zarr.ZipStore(zipbytes)
    @test sprint(show, zipstore) == "Read Only Zip Storage"
    reopened = zopen(zipstore)
    @test reopened.attrs == Dict("groupatt"=>5)
    @test reopened["a1"].attrs == Dict("arratt"=>2.5)
    @test reopened["a1"][:,:] == reshape(1:200,10,20)

    # Run the generic read only store checks; the converter turns a DictStore
    # into a read only ZipStore by writing it to an in-memory zip archive.
    test_read_only_store_common() do ds
        out = IOBuffer()
        Zarr.writezip(out, ds)
        Zarr.ZipStore(take!(out))
    end
end
Loading