Enable async chunk reading #106

Merged · 9 commits · Apr 12, 2023
3 changes: 2 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "Zarr"
uuid = "0a941bbe-ad1d-11e8-39d9-ab76183a1d99"
authors = ["Fabian Gans <[email protected]>"]
version = "0.8.0"
version = "0.9.0"

[deps]
AWSS3 = "1c724243-ef5b-51ab-93f4-b0a88ac62a95"
@@ -14,6 +14,7 @@ HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
LRUCache = "8ac3fa9e-de4c-5943-b1dc-09c6b5f20637"
OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881"
OpenSSL = "4d8831e6-92b7-49fb-bdf8-b643e874388c"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
URIs = "5c2747f8-b7ea-4ff2-ba2e-563bfd36b1d4"

4 changes: 2 additions & 2 deletions docs/src/tutorial.md
@@ -171,7 +171,7 @@ A number of different compressors can be used with Zarr. In this Julia package w
julia> using Zarr

julia> compressor = Zarr.BloscCompressor(cname="zstd", clevel=3, shuffle=true)
-Zarr.BloscCompressor(0, 3, "zstd", 1)
+Zarr.BloscCompressor(0, 3, "zstd", true)

julia> data = Int32(1):Int32(100000000)
1:100000000
@@ -195,7 +195,7 @@ Shape : (10000, 10000)
Chunk Shape : (1000, 1000)
Order : C
Read-Only : false
-Compressor : Zarr.BloscCompressor(0, 3, "zstd", 1)
+Compressor : Zarr.BloscCompressor(0, 3, "zstd", true)
Filters : nothing
Store type : Dictionary Storage
No. bytes : 400000000
14 changes: 14 additions & 0 deletions src/Compressors.jl
@@ -62,6 +62,20 @@ end

zuncompress(a, ::BloscCompressor, T) = Blosc.decompress(Base.nonmissingtype(T), a)

function zuncompress!(data::DenseArray, compressed, ::BloscCompressor)
    # Decompress directly into the destination buffer, avoiding the extra
    # allocation that zuncompress makes
    Blosc.decompress!(vec(data), compressed)
end


function zcompress(a, c::BloscCompressor)
    itemsize = sizeof(eltype(a))
    shuffle = c.shuffle
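
For readers skimming this diff: the new zuncompress! complements the existing zuncompress by decompressing into a caller-supplied buffer instead of allocating a fresh one. A minimal usage sketch, assuming zcompress returns the compressed byte vector (as its use elsewhere in the package suggests); the sizes are illustrative:

# Sketch: allocating vs. in-place decompression (illustrative sizes)
using Zarr

c = Zarr.BloscCompressor(cname = "zstd", clevel = 3, shuffle = true)
data = collect(Int32, 1:1000)
compressed = Zarr.zcompress(data, c)

out = Zarr.zuncompress(compressed, c, Int32)   # allocates a new vector

buf = zeros(Int32, 1000)                       # preallocated destination
Zarr.zuncompress!(buf, compressed, c)          # fills buf in place
@assert buf == out == data
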
61 changes: 61 additions & 0 deletions src/Storage/Storage.jl
@@ -94,6 +94,67 @@ function writemetadata(s::AbstractStore, p, m::Metadata)
end


## Handling sequential vs parallel IO
struct SequentialRead end
struct ConcurrentRead
    ntasks::Int
end
store_read_strategy(::AbstractStore) = SequentialRead()

channelsize(s) = channelsize(store_read_strategy(s))
channelsize(::SequentialRead) = 0
channelsize(c::ConcurrentRead) = c.ntasks

read_items!(s::AbstractStore, c::AbstractChannel, p, i) = read_items!(s, c, store_read_strategy(s), p, i)
function read_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, p, i)
    for ii in i
        res = s[p, ii]
        put!(c, ii => res)
    end
end
function read_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, p, i)
    # Launch up to ntasks concurrent fetches and push each result into the channel
    ntasks = r.ntasks
    asyncmap(i, ntasks = ntasks) do ii
        res = s[p, ii]
        put!(c, ii => res)
        nothing
    end
end

write_items!(s::AbstractStore, c::AbstractChannel, p, i) = write_items!(s, c, store_read_strategy(s), p, i)
function write_items!(s::AbstractStore, c::AbstractChannel, ::SequentialRead, p, i)
    for _ in 1:length(i)
        ii, data = take!(c)
        if data === nothing
            # A `nothing` payload marks a chunk to be deleted
            if isinitialized(s, p, ii)
                delete!(s, p, ii)
            end
        else
            s[p, ii] = data
        end
    end
    close(c)
end

function write_items!(s::AbstractStore, c::AbstractChannel, r::ConcurrentRead, p, i)
    ntasks = r.ntasks
    asyncmap(i, ntasks = ntasks) do _
        ii, data = take!(c)
        if data === nothing
            # The prefix `p` must be passed here as in the sequential method above
            if isinitialized(s, p, ii)
                delete!(s, p, ii)
            end
        else
            s[p, ii] = data
        end
        nothing
    end
    close(c)
end

isemptysub(s::AbstractStore, p) = isempty(subkeys(s,p)) && isempty(subdirs(s,p))

#Here different storage backends can register regexes that are checked against
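
To sketch how these pieces fit together: read_items! is a producer that pushes index => bytes pairs into a channel, sized via channelsize, while a consumer (the chunk-assembly code, which is not part of this diff) takes results as they arrive. An illustration under stated assumptions — `store` and `p` are placeholders, CartesianIndex chunk ids are an assumption about what the store's getindex accepts, and the real consumer uses a concrete channel element type:

# Producer/consumer sketch for the new read path (names assumed, see above)
chunks = [CartesianIndex(1, 1), CartesianIndex(1, 2), CartesianIndex(2, 1)]
ch = Channel{Pair}(channelsize(store))

@async read_items!(store, ch, p, chunks)   # sequential or concurrent, per store

for _ in 1:length(chunks)
    ii, raw = take!(ch)   # raw === nothing marks an uninitialized chunk
    # ... decompress raw into the output array at position ii ...
end
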
1 change: 1 addition & 0 deletions src/Storage/consolidated.jl
@@ -61,6 +61,7 @@ function Base.delete!(d::ConsolidatedStore, i::String)
    delete!(d.parent, i)
end

store_read_strategy(s::ConsolidatedStore) = store_read_strategy(s.parent)

function consolidate_metadata(s::AbstractStore, d, prefix)
    for k in (".zattrs", ".zarray", ".zgroup")
2 changes: 2 additions & 0 deletions src/Storage/gcstore.jl
@@ -146,3 +146,5 @@ function storefromstring(::Type{<:GCStore}, url, _)
    @debug "path: $p"
    return GCStore(url), p
end

store_read_strategy(::GCStore) = ConcurrentRead(concurrent_io_tasks[])
5 changes: 4 additions & 1 deletion src/Storage/http.jl
@@ -1,4 +1,5 @@
using HTTP
using OpenSSL: OpenSSL

"""
HTTPStore
@@ -13,7 +14,7 @@ struct HTTPStore <: AbstractStore
end

function Base.getindex(s::HTTPStore, k::String)
-    r = HTTP.request("GET", string(s.url, "/", k), status_exception = false)
+    r = HTTP.request("GET", string(s.url, "/", k), status_exception = false, socket_type_tls = OpenSSL.SSLStream, connection_limit = 25)
    if r.status >= 300
        if r.status == 404
            nothing
@@ -25,10 +26,12 @@ else
end
end


push!(storageregexlist,r"^https://"=>HTTPStore)
push!(storageregexlist,r"^http://"=>HTTPStore)
storefromstring(::Type{<:HTTPStore}, s,_) = ConsolidatedStore(HTTPStore(s),""),""

store_read_strategy(::HTTPStore) = ConcurrentRead(concurrent_io_tasks[])
Review comment:
If we are using channels, then I think it would be better to raise the default concurrent_io_tasks size to around 100 rather than the current default (10). That corresponds to the maximum number of concurrent connections in aiohttp, for example, and I think it is better to prioritize runtime over memory usage; users would then set this lower only if they need to do suboptimal chunk access and want to limit their memory usage.

Collaborator Author:
Ok, I have increased this to 50 for now. Do you still see speedups for 100 tasks compared to 50? If yes, then I am happy to increase this further.
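
For reference, the [] dereference in this diff suggests concurrent_io_tasks is a module-level Ref holding the default task count; assuming so, a user could tune the limit themselves (the dataset URL below is hypothetical):

using Zarr

Zarr.concurrent_io_tasks[] = 100   # assumption: a Ref, per the [] dereference
z = Zarr.zopen("https://example.com/data.zarr")   # hypothetical dataset URL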



## This is a server implementation for Zarr datasets
41 changes: 0 additions & 41 deletions src/Storage/lru.jl

This file was deleted.

2 changes: 2 additions & 0 deletions src/Storage/s3store.jl
@@ -79,3 +79,5 @@ function storefromstring(::Type{<:S3Store}, s, _)
    path = join(decomp[3:end], "/")
    S3Store(String(bucket), aws = AWSS3.AWS.global_aws_config()), path
end

store_read_strategy(::S3Store) = ConcurrentRead(concurrent_io_tasks[])
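
The same one-line hook extends to third-party backends: any AbstractStore can opt into the concurrent read path by overloading store_read_strategy, exactly as GCStore, HTTPStore and S3Store do in this PR. A hypothetical sketch:

# Hypothetical custom backend opting into concurrent reads
struct MyRemoteStore <: Zarr.AbstractStore end

Zarr.store_read_strategy(::MyRemoteStore) = Zarr.ConcurrentRead(Zarr.concurrent_io_tasks[])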