Commit: Add docs

asinghvi17 committed Sep 7, 2024
1 parent b105af9 commit e3f5385
Showing 11 changed files with 176 additions and 22 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,3 +1,6 @@
/Manifest.toml
/docs/Manifest.toml
/docs/build/

/test/ref.parquet/
/test/real_zarray.zarr/
5 changes: 5 additions & 0 deletions README.md
@@ -35,10 +35,15 @@ YAXArrays.open_dataset(za)
- No support for `gen` references with templates.
- No support for complex Jinja2 templates in `refs`. (Although Kerchunk hardly supports this either...)

## Acknowledgements

This effort was funded by the NASA MEaSUREs program in contribution to the Inter-mission Time Series of Land Ice Velocity and Elevation (ITS_LIVE) project (https://its-live.jpl.nasa.gov/).

## Alternatives and related packages

- You can always use Python's `xarray` directly via PythonCall.jl (see the sketch below).
- [FSSpec.jl](https://github.com/asinghvi17/FSSpec.jl) is an alternative storage backend for Zarr.jl that wraps the same [`fsspec`](https://github.com/fsspec/filesystem_spec) library that `xarray` uses under the hood.
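
For example, a minimal sketch of the PythonCall.jl route (assuming the underlying Python environment has `xarray`, `zarr`, `fsspec`, and `kerchunk` installed; the reference file `ref.json` is a placeholder):

```julia
using PythonCall

xr = pyimport("xarray")

# Open a Kerchunk JSON reference catalog through fsspec's "reference://"
# filesystem, then read it as a Zarr-backed xarray Dataset.
ds = xr.open_dataset(
    "reference://"; engine = "zarr",
    backend_kwargs = pydict(Dict(
        "storage_options" => pydict(Dict("fo" => "ref.json")),
        "consolidated" => false,
    )),
)
```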

This package is of course built on top of [Zarr.jl](https://github.com/JuliaIO/Zarr.jl), which is a pure-Julia Zarr array library.
[YAXArrays.jl](https://github.com/JuliaDataCubes/YAXArrays.jl) is a Julia package that can wrap Zarr arrays in a DimensionalData-compatible interface.

77 changes: 71 additions & 6 deletions docs/make.jl
@@ -1,19 +1,84 @@
using Kerchunk
using Documenter, DocumenterVitepress

DocMeta.setdocmeta!(Kerchunk, :DocTestSetup, :(using Kerchunk); recursive=true)

using Literate


# First, remove any codecov files that may have been generated by the CI run
for (root, dirs, files) in walkdir(dirname(@__DIR__)) # walk through `Kerchunk.jl/*`
    # Iterate through all files in the current directory
    for file in files
        # If the file is a codecov file, remove it
        if splitext(file)[2] == ".cov"
            rm(joinpath(root, file))
        end
    end
end

# Now, we convert the source code to markdown files using Literate.jl
source_path = joinpath(dirname(@__DIR__), "src")
output_path = joinpath(@__DIR__, "src", "source")
mkpath(output_path)

literate_pages = Any[]

# We don't want Literate to convert the code into Documenter blocks, so we use a custom postprocessor
# to add the `@meta` block to the markdown file, which will be used by Documenter to add an edit link.
function _add_meta_edit_link_generator(path)
    return function (input)
        return """
        ```@meta
        EditURL = "$(path).jl"
        ```
        """ * input # we add `.jl` because `relpath` eats the file extension, apparently :shrug:
    end
end

# First letter of `str` is made uppercase and returned
ucfirst(str::String) = string(uppercase(str[1]), str[2:end])

function process_literate_recursive!(pages::Vector{Any}, path::String)
    global source_path
    global output_path
    if isdir(path)
        contents = []
        process_literate_recursive!.((contents,), normpath.(readdir(path; join = true)))
        push!(pages, ucfirst(splitdir(path)[2]) => contents)
    elseif isfile(path)
        if endswith(path, ".jl")
            relative_path = relpath(path, source_path)
            output_dir = joinpath(output_path, splitdir(relative_path)[1])
            Literate.markdown(
                path, output_dir;
                flavor = Literate.CommonMarkFlavor(),
                postprocess = _add_meta_edit_link_generator(joinpath(relpath(source_path, output_dir), relative_path))
            )
            push!(pages, joinpath("source", splitext(relative_path)[1] * ".md"))
        end
    end
end

withenv("JULIA_DEBUG" => "Literate") do # allow Literate debug output to escape to the terminal!
global literate_pages
vec = []
process_literate_recursive!(vec, source_path)
literate_pages = vec[1][2] # this is a hack to get the pages in the correct order, without an initial "src" folder.
# TODO: We should probably fix the above in `process_literate_recursive!`.
end

makedocs(;
    modules=[Kerchunk],
    authors="Anshul Singhvi <[email protected]> and contributors",
    sitename="Kerchunk.jl",
    format=DocumenterVitepress.MarkdownVitepress(
        repo = "https://github.com/JuliaIO/Kerchunk.jl",
    ),
    pages=[
        "Home" => "index.md",
        "What is Kerchunk?" => "what_the_heck.md",
        "API" => "api.md",
        "Source code" => literate_pages,
    ],
)

6 changes: 6 additions & 0 deletions docs/src/api.md
@@ -0,0 +1,6 @@
```@index
```

```@autodocs
Modules = [Kerchunk]
```
4 changes: 2 additions & 2 deletions docs/src/index.md
@@ -9,6 +9,6 @@ Documentation for [Kerchunk](https://github.com/JuliaIO/Kerchunk.jl).
```@index
```

```@docs
ReferenceStore
```
4 changes: 4 additions & 0 deletions docs/src/what_the_heck.md
@@ -0,0 +1,4 @@
# What is Kerchunk?

## Available data sources

20 changes: 20 additions & 0 deletions src/parquet.jl
@@ -0,0 +1,20 @@
#=
Kerchunk has two file formats - JSON, as discussed earlier, and Parquet.
The Parquet format is a bit more involved: files are nested in
a directory structure, and row indices are computable from
the chunk index.
The files are also paginated, based on the `record_size` parameter described below.
Files might look something like this:
```
ref.parquet/deep/name/refs.0.parq
ref.parquet/name/refs.0.parq
ref.parquet/.zmetadata
```
One must first parse `.zmetadata`, a JSON file, which has two fields:
- A `dict[str, str]` that encodes the zmetadata; this may also contain inlined files.
- A field `record_size` that encodes how many records may be stored in a single Parquet file.
=#
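
# A minimal sketch of how a chunk's reference could be located under that
# layout (a hypothetical helper, not this package's actual API; assumes JSON3):
#
#     using JSON3
#
#     function locate_record(root, varpath, linear_chunk_index)
#         meta = JSON3.read(read(joinpath(root, ".zmetadata"), String))
#         record_size = meta.record_size
#         # References are paginated `record_size` rows per Parquet file, so the
#         # target file and the row within it fall out of integer division.
#         file_index = fld(linear_chunk_index, record_size)
#         row_index = mod(linear_chunk_index, record_size)
#         return joinpath(root, varpath, "refs.$(file_index).parq"), row_index
#     end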
49 changes: 35 additions & 14 deletions src/referencestore.jl
@@ -57,6 +57,11 @@ Files can also be generated, so we have to parse that and then actually material
```
=#

"""
    ReferenceStore(filename_or_dict)

A `ReferenceStore` is a `Zarr.AbstractStore` backed by a Kerchunk reference
catalog: a mapping from Zarr keys to inline data or to byte ranges within
local or remote files. It can be constructed from the path to a reference
JSON file or from an already-parsed dictionary of references.
"""
struct ReferenceStore{MapperType <: AbstractDict, HasTemplates} <: Zarr.AbstractStore
mapper::MapperType
zmetadata::Dict{String, Any}
Expand Down Expand Up @@ -127,24 +132,40 @@ end

# Implement the Zarr interface

# Utility functions copied from Zarr.jl
function _pdict(d::AbstractDict{<: Symbol, Any}, path)
    p = (isempty(path) || endswith(path, '/')) ? path : path * '/'
    return filter(((k, v),) -> startswith(string(k), p), d)
end

function _searchsubdict(d2, p, condition)
    o = Set{String}()
    pspl = split(rstrip(p, '/'), '/')
    lp = if length(pspl) == 1 && isempty(pspl[1])
        0
    else
        length(pspl)
    end
    for k in Iterators.map(string, keys(d2))
        sp = split(k, '/')
        if condition(sp, lp)
            push!(o, sp[lp + 1])
        end
    end
    return collect(o)
end


# The actual Zarr store API implementation follows.

function Zarr.subdirs(store::ReferenceStore, key)
    d2 = _pdict(store.mapper, key)
    return _searchsubdict(d2, key, (sp, lp) -> length(sp) > lp + 1)
end

function Zarr.subkeys(store::ReferenceStore, key::String)
    d2 = _pdict(store.mapper, key)
    return _searchsubdict(d2, key, (sp, lp) -> length(sp) == lp + 1)
end
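
# A quick sketch of what these return, using a plain `Dict` to stand in for
# a reference catalog (keys and values here are purely illustrative):
#
#     refs = Dict{Symbol, Any}(
#         Symbol(".zgroup")       => "{}",
#         Symbol("vx/.zarray")    => "{}",
#         Symbol("vx/0.0")        => "base64:...",
#         Symbol("deep/name/0.0") => "base64:...",
#     )
#     _searchsubdict(_pdict(refs, ""), "", (sp, lp) -> length(sp) > lp + 1)
#     # -> ["vx", "deep"] (order not guaranteed, since it comes from a Set)
#     _searchsubdict(_pdict(refs, "vx"), "vx", (sp, lp) -> length(sp) == lp + 1)
#     # -> [".zarray", "0.0"]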

function Zarr.storagesize(store::ReferenceStore, key::String)
3 changes: 3 additions & 0 deletions test/its_live_catalog.jl
@@ -8,3 +8,6 @@ using Test
za = Zarr.zopen(st)
@test_nowarn za["vx"][1, 1] # test that reading works
end

# test ICESAT2 data
# p"s3://mymdtemp/icesat2-4.01.json"
21 changes: 21 additions & 0 deletions test/parquet.jl
@@ -0,0 +1,21 @@
using CondaPkg, Parquet

# Generate the Parquet reference file
CondaPkg.withenv() do
    run(```
    $(CondaPkg.which("python")) -c "
import numpy as np
import fsspec
import fsspec.implementations.reference
import zarr
lz = fsspec.implementations.reference.LazyReferenceMapper.create(\"ref.parquet\")
z = zarr.open_group(lz, mode=\"w\")
d = z.create_dataset(\"name\", shape=(10,10))
d[:, :] = np.random.randn(10, 10)
g2 = z.create_group(\"deep\")
d = g2.create_dataset(\"name\", shape=(15, 15))
d[:, :] = np.random.randn(15, 15)
"
    ```)
end

6 changes: 6 additions & 0 deletions test/real_zarr.jl
@@ -0,0 +1,6 @@
# One way to benchmark how much performance the Kerchunk implementation
# is costing us is to use an actual Zarr file.
# We can simulate a Kerchunk catalog but use an actual Zarr array,
# so the difference in benchmark speeds between Kerchunk and Zarr
# should provide useful data.
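
# A rough sketch of that setup with Zarr.jl (the reference-building step is
# elided, and all names here are illustrative):
#
#     using Zarr
#     g = zgroup(DirectoryStore("real_zarray.zarr"))
#     a = zcreate(Float64, g, "name", 10, 10; chunks = (5, 5))
#     a[:, :] = rand(10, 10)
#     # Next: build a JSON reference catalog whose entries point at the
#     # chunk files under "real_zarray.zarr/name/", open it as a
#     # ReferenceStore, and compare read timings against reading the
#     # DirectoryStore directly with Zarr.zopen.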
