Use a temporary file for Zarr group JSON #135

Merged
merged 1 commit into from Sep 14, 2023
58 changes: 26 additions & 32 deletions activestorage/netcdf_to_zarr.py
@@ -4,40 +4,31 @@
 import ujson
 import fsspec
 import s3fs
+import tempfile
 
 from activestorage.config import *
 from kerchunk.hdf import SingleHdf5ToZarr
 
 
-def gen_json(file_url, fs, fs2, varname, **so):
+def gen_json(file_url, fs, fs2, outf, **so):
     """Generate a json file that contains the kerchunk-ed data for Zarr."""
-    # set some name for the output json file
-    fname = os.path.splitext(file_url)[0]
-    if "s3:" in fname:
-        fname = os.path.basename(fname)
-    outf = f'{fname}_{varname}.json'  # vanilla file name
-
-    # write it out if it's not there
-    if not os.path.isfile(outf):
-        with fs.open(file_url, **so) as infile:
-            # FIXME need to disentangle HDF5 errors if not OSError (most are)
-            try:
-                h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0)
-            except OSError as exc:
-                raiser_1 = f"Unable to open file {file_url}. "
-                raiser_2 = "Check if file is netCDF3 or netCDF-classic"
-                print(raiser_1 + raiser_2)
-                raise exc
-
-            # inline threshold adjusts the Size below which binary blocks are
-            # included directly in the output
-            # a higher inline threshold can result in a larger json file but
-            # faster loading time
-            # for active storage, we don't want anything inline
-            # fname = os.path.splitext(file_url)[0]
-            # outf = f'{fname}_{varname}.json'  # vanilla file name
-            with fs2.open(outf, 'wb') as f:
-                f.write(ujson.dumps(h5chunks.translate()).encode())
+    with fs.open(file_url, **so) as infile:
+        # FIXME need to disentangle HDF5 errors if not OSError (most are)
+        try:
+            h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0)
+        except OSError as exc:
+            raiser_1 = f"Unable to open file {file_url}. "
+            raiser_2 = "Check if file is netCDF3 or netCDF-classic"
+            print(raiser_1 + raiser_2)
+            raise exc
+
+        # inline threshold adjusts the Size below which binary blocks are
+        # included directly in the output
+        # a higher inline threshold can result in a larger json file but
+        # faster loading time
+        # for active storage, we don't want anything inline
+        with fs2.open(outf, 'wb') as f:
+            f.write(ujson.dumps(h5chunks.translate()).encode())
 
     return outf

@@ -85,10 +76,13 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True):
     so = {}
 
     fs2 = fsspec.filesystem('')  # local file system to save final json to
-    out_json = gen_json(fileloc, fs, fs2, varname)
 
-    # open this monster
-    print(f"Attempting to open and convert {fileloc}.")
-    ref_ds = open_zarr_group(out_json, varname)
+    # Write the Zarr group JSON to a temporary file.
+    with tempfile.NamedTemporaryFile() as out_json:
+        gen_json(fileloc, fs, fs2, out_json.name)
+
+        # open this monster
+        print(f"Attempting to open and convert {fileloc}.")
+        ref_ds = open_zarr_group(out_json.name, varname)
 
     return ref_ds
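
For context, the pattern this PR adopts can be sketched in isolation as below. This is an illustrative example, not code from the repository: write_zarr_json is a hypothetical stand-in for gen_json's fsspec-based write, and the payload is a dummy kerchunk-style reference dict.

import json
import tempfile

def write_zarr_json(payload, outf):
    # Stand-in for gen_json: serialise kerchunk-style references to the path `outf`.
    with open(outf, "w") as f:
        json.dump(payload, f)
    return outf

# NamedTemporaryFile provides a real path (`.name`) that other code can open,
# and the file is removed automatically when the context manager exits, so no
# per-variable JSON files accumulate next to the source data.
# (On Windows, reopening the file while it is still open requires delete=False.)
with tempfile.NamedTemporaryFile(suffix=".json") as out_json:
    write_zarr_json({"version": 1, "refs": {}}, out_json.name)
    with open(out_json.name) as f:
        refs = json.load(f)

print(refs["version"])  # -> 1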