Commit: Add docs

asinghvi17 committed Sep 7, 2024
1 parent b105af9 commit e3f5385
Showing 11 changed files with 176 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
/Manifest.toml
/docs/Manifest.toml
/docs/build/

/test/ref.parquet/
/test/real_zarray.zarr/
5 changes: 5 additions & 0 deletions README.md
@@ -35,10 +35,15 @@ YAXArrays.open_dataset(za)
- No support for `gen` references with templates.
- No support for complex Jinja2 templates in `refs`. (Although Kerchunk hardly supports this either...)

## Acknowledgements

This effort was funded by the NASA MEaSUREs program in contribution to the Inter-mission Time Series of Land Ice Velocity and Elevation (ITS_LIVE) project (https://its-live.jpl.nasa.gov/).

## Alternatives and related packages

- You can always use Python's `xarray` directly via PythonCall.jl (see the sketch below).
- [FSSpec.jl](https://github.com/asinghvi17/FSSpec.jl) is an alternative storage backend for Zarr.jl that wraps the same [`fsspec`](https://github.com/fsspec/filesystem_spec) library that `xarray` uses under the hood.
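
For example, a minimal sketch of the PythonCall.jl route (assuming the underlying Python environment has `xarray`, `zarr`, `fsspec`, and `kerchunk` installed; the reference file `ref.json` is a placeholder):

```julia
using PythonCall

xr = pyimport("xarray")

# Open a Kerchunk JSON reference catalog through fsspec's "reference://"
# filesystem, then read it as a Zarr-backed xarray Dataset.
ds = xr.open_dataset(
    "reference://"; engine = "zarr",
    backend_kwargs = pydict(Dict(
        "storage_options" => pydict(Dict("fo" => "ref.json")),
        "consolidated" => false,
    )),
)
```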

This package is of course built on top of [Zarr.jl](https://github.com/JuliaIO/Zarr.jl), which is a pure-Julia Zarr array library.
[YAXArrays.jl](https://github.com/JuliaDataCubes/YAXArrays.jl) is a Julia package that can wrap Zarr arrays in a DimensionalData-compatible interface.

77 changes: 71 additions & 6 deletions docs/make.jl
@@ -1,19 +1,84 @@
using Kerchunk
using Documenter, DocumenterVitepress

DocMeta.setdocmeta!(Kerchunk, :DocTestSetup, :(using Kerchunk); recursive=true)

using Literate


# First, remove any codecov files that may have been generated by the CI run
for (root, dirs, files) in walkdir(dirname(@__DIR__)) # walk through `Kerchunk.jl/*`
    # Iterate through all files in the current directory
    for file in files
        # If the file is a codecov file, remove it
        if splitext(file)[2] == ".cov"
            rm(joinpath(root, file))
        end
    end
end

# Now, we convert the source code to markdown files using Literate.jl
source_path = joinpath(dirname(@__DIR__), "src")
output_path = joinpath(@__DIR__, "src", "source")
mkpath(output_path)

literate_pages = Any[]

# We don't want Literate to convert the code into Documenter blocks, so we use a custom postprocessor
# to add the `@meta` block to the markdown file, which will be used by Documenter to add an edit link.
function _add_meta_edit_link_generator(path)
    return function (input)
        return """
        ```@meta
        EditURL = "$(path).jl"
        ```
        """ * input # we add `.jl` because `relpath` eats the file extension, apparently :shrug:
    end
end

# First letter of `str` is made uppercase and returned
ucfirst(str::String) = string(uppercase(str[1]), str[2:end])

function process_literate_recursive!(pages::Vector{Any}, path::String)
    global source_path
    global output_path
    if isdir(path)
        contents = []
        process_literate_recursive!.((contents,), normpath.(readdir(path; join = true)))
        push!(pages, ucfirst(splitdir(path)[2]) => contents)
    elseif isfile(path)
        if endswith(path, ".jl")
            relative_path = relpath(path, source_path)
            output_dir = joinpath(output_path, splitdir(relative_path)[1])
            Literate.markdown(
                path, output_dir;
                flavor = Literate.CommonMarkFlavor(),
                postprocess = _add_meta_edit_link_generator(joinpath(relpath(source_path, output_dir), relative_path))
            )
            push!(pages, joinpath("source", splitext(relative_path)[1] * ".md"))
        end
    end
end

withenv("JULIA_DEBUG" => "Literate") do # allow Literate debug output to escape to the terminal!
global literate_pages
vec = []
process_literate_recursive!(vec, source_path)
literate_pages = vec[1][2] # this is a hack to get the pages in the correct order, without an initial "src" folder.
# TODO: We should probably fix the above in `process_literate_recursive!`.
end

makedocs(;
    modules=[Kerchunk],
    authors="Anshul Singhvi <[email protected]> and contributors",
    sitename="Kerchunk.jl",
    format=DocumenterVitepress.MarkdownVitepress(
        repo = "https://github.com/JuliaIO/Kerchunk.jl",
    ),
    pages=[
        "Home" => "index.md",
        "What is Kerchunk?" => "what_the_heck.md",
        "API" => "api.md",
        "Source code" => literate_pages,
    ],
)

6 changes: 6 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,6 @@
```@index
```

```@autodocs
Modules = [Kerchunk]
```
4 changes: 2 additions & 2 deletions docs/src/index.md
@@ -9,6 +9,6 @@ Documentation for [Kerchunk](https://github.com/JuliaIO/Kerchunk.jl).
```@index
```

```@docs
ReferenceStore
```
4 changes: 4 additions & 0 deletions docs/src/what_the_heck.md
@@ -0,0 +1,4 @@
# What is Kerchunk?

## Available data sources

20 changes: 20 additions & 0 deletions src/parquet.jl
@@ -0,0 +1,20 @@
#=
Kerchunk has two file formats - JSON, as discussed earlier, and Parquet.
The Parquet format is a bit more involved: files are nested in
a directory structure, and row indices are computable from
the chunk index.
The files are also paginated, based on the `record_size` parameter described below.
Files might look something like this:
```
ref.parquet/deep/name/refs.0.parq
ref.parquet/name/refs.0.parq
ref.parquet/.zmetadata
```
One must first parse `.zmetadata`, a JSON file, which has two fields:
- A `dict[str, str]` that encodes the zmetadata; this may also contain inlined files.
- A field `record_size` that encodes how many records may be stored in a single Parquet file.
=#
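
# A minimal sketch of how a chunk's reference could be located under that
# layout (a hypothetical helper, not this package's actual API; assumes JSON3):
#
#     using JSON3
#
#     function locate_record(root, varpath, linear_chunk_index)
#         meta = JSON3.read(read(joinpath(root, ".zmetadata"), String))
#         record_size = meta.record_size
#         # References are paginated `record_size` rows per Parquet file, so the
#         # target file and the row within it fall out of integer division.
#         file_index = fld(linear_chunk_index, record_size)
#         row_index = mod(linear_chunk_index, record_size)
#         return joinpath(root, varpath, "refs.$(file_index).parq"), row_index
#     end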
49 changes: 35 additions & 14 deletions src/referencestore.jl
@@ -57,6 +57,11 @@ Files can also be generated, so we have to parse that and then actually material
```
=#

"""
    ReferenceStore(filename_or_dict)

A `ReferenceStore` is a `Zarr.AbstractStore` backed by a Kerchunk reference
catalog: a mapping from Zarr keys to inline data or to byte ranges within
local or remote files. It can be constructed from the path to a reference
JSON file or from an already-parsed dictionary of references.
"""
struct ReferenceStore{MapperType <: AbstractDict, HasTemplates} <: Zarr.AbstractStore
mapper::MapperType
zmetadata::Dict{String, Any}
Expand Down Expand Up @@ -127,24 +132,40 @@ end

# Implement the Zarr interface

# Utility functions copied from Zarr.jl
function _pdict(d::AbstractDict{<: Symbol, Any}, path)
    p = (isempty(path) || endswith(path, '/')) ? path : path * '/'
    return filter(((k, v),) -> startswith(string(k), p), d)
end

function _searchsubdict(d2, p, condition)
    o = Set{String}()
    pspl = split(rstrip(p, '/'), '/')
    lp = if length(pspl) == 1 && isempty(pspl[1])
        0
    else
        length(pspl)
    end
    for k in Iterators.map(string, keys(d2))
        sp = split(k, '/')
        if condition(sp, lp)
            push!(o, sp[lp + 1])
        end
    end
    return collect(o)
end


# The actual Zarr store API implementation follows.

function Zarr.subdirs(store::ReferenceStore, key)
    d2 = _pdict(store.mapper, key)
    return _searchsubdict(d2, key, (sp, lp) -> length(sp) > lp + 1)
end

function Zarr.subkeys(store::ReferenceStore, key::String)
    d2 = _pdict(store.mapper, key)
    return _searchsubdict(d2, key, (sp, lp) -> length(sp) == lp + 1)
end
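
# A quick sketch of what these return, using a plain `Dict` to stand in for
# a reference catalog (keys and values here are purely illustrative):
#
#     refs = Dict{Symbol, Any}(
#         Symbol(".zgroup")       => "{}",
#         Symbol("vx/.zarray")    => "{}",
#         Symbol("vx/0.0")        => "base64:...",
#         Symbol("deep/name/0.0") => "base64:...",
#     )
#     _searchsubdict(_pdict(refs, ""), "", (sp, lp) -> length(sp) > lp + 1)
#     # -> ["vx", "deep"] (order not guaranteed, since it comes from a Set)
#     _searchsubdict(_pdict(refs, "vx"), "vx", (sp, lp) -> length(sp) == lp + 1)
#     # -> [".zarray", "0.0"]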

function Zarr.storagesize(store::ReferenceStore, key::String)
3 changes: 3 additions & 0 deletions test/its_live_catalog.jl
@@ -8,3 +8,6 @@ using Test
za = Zarr.zopen(st)
@test_nowarn za["vx"][1, 1] # test that reading works
end

# test ICESAT2 data
# p"s3://mymdtemp/icesat2-4.01.json"
21 changes: 21 additions & 0 deletions test/parquet.jl
@@ -0,0 +1,21 @@
using CondaPkg, Parquet

# Generate the Parquet reference file
CondaPkg.withenv() do
    run(```
    $(CondaPkg.which("python")) -c "
import numpy as np
import fsspec
import fsspec.implementations.reference
import zarr
lz = fsspec.implementations.reference.LazyReferenceMapper.create(\"ref.parquet\")
z = zarr.open_group(lz, mode=\"w\")
d = z.create_dataset(\"name\", shape=(10,10))
d[:, :] = np.random.randn(10, 10)
g2 = z.create_group(\"deep\")
d = g2.create_dataset(\"name\", shape=(15, 15))
d[:, :] = np.random.randn(15, 15)
"
    ```)
end

6 changes: 6 additions & 0 deletions test/real_zarr.jl
@@ -0,0 +1,6 @@
# One way to benchmark how much performance the Kerchunk implementation
# is costing us is to use an actual Zarr file.
# We can simulate a Kerchunk catalog but use an actual Zarr array,
# so the difference in benchmark speeds between Kerchunk and Zarr
# should provide useful data.
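
# A rough sketch of that setup with Zarr.jl (the reference-building step is
# elided, and all names here are illustrative):
#
#     using Zarr
#     g = zgroup(DirectoryStore("real_zarray.zarr"))
#     a = zcreate(Float64, g, "name", 10, 10; chunks = (5, 5))
#     a[:, :] = rand(10, 10)
#     # Next: build a JSON reference catalog whose entries point at the
#     # chunk files under "real_zarray.zarr/name/", open it as a
#     # ReferenceStore, and compare read timings against reading the
#     # DirectoryStore directly with Zarr.zopen.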
