Skip to content

Commit

Permalink
Use a temporary file for Zarr group JSON
Browse files Browse the repository at this point in the history
When using multiple netCDF files with the same name, the Zarr group
JSON file would previously not be overwritten after it was first
written. Subsequent operations could therefore read stale Zarr group
metadata belonging to a different file.

This change switches to a temporary file for storing the Zarr group
metadata. The file's short lifetime is not a problem because the Zarr
datasource is cached in the Active object as the _zds member between
operations.

Closes #134
  • Loading branch information
markgoddard committed Aug 15, 2023
1 parent 8adf0f0 commit 4ed6003
Showing 1 changed file with 26 additions and 32 deletions.
58 changes: 26 additions & 32 deletions activestorage/netcdf_to_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,31 @@
import ujson
import fsspec
import s3fs
import tempfile

from activestorage.config import *
from kerchunk.hdf import SingleHdf5ToZarr


def gen_json(file_url, fs, fs2, outf, **so):
    """Generate a JSON file containing the kerchunk-ed data for Zarr.

    Parameters
    ----------
    file_url : str
        Location of the source netCDF/HDF5 file (local path or s3:// URL).
    fs : fsspec filesystem
        Filesystem used to read the source file.
    fs2 : fsspec filesystem
        Filesystem used to write the output JSON (local in current callers).
    outf : str
        Path of the JSON file to write. Always (re)written, so stale
        metadata from a previously-seen file of the same name cannot be
        picked up — callers pass a fresh temporary file (see #134).
    **so
        Storage options forwarded to ``fs.open`` (e.g. S3 credentials).

    Returns
    -------
    str
        ``outf``, for caller convenience.

    Raises
    ------
    OSError
        If the source file cannot be opened as HDF5 (e.g. it is
        netCDF3/netCDF-classic).
    """
    with fs.open(file_url, **so) as infile:
        # FIXME need to disentangle HDF5 errors if not OSError (most are)
        try:
            h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0)
        except OSError as exc:
            raiser_1 = f"Unable to open file {file_url}. "
            raiser_2 = "Check if file is netCDF3 or netCDF-classic"
            print(raiser_1 + raiser_2)
            raise exc

        # inline threshold adjusts the Size below which binary blocks are
        # included directly in the output
        # a higher inline threshold can result in a larger json file but
        # faster loading time
        # for active storage, we don't want anything inline
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())

    return outf

Expand Down Expand Up @@ -85,10 +76,13 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True):
so = {}

fs2 = fsspec.filesystem('') # local file system to save final json to
out_json = gen_json(fileloc, fs, fs2, varname)

# open this monster
print(f"Attempting to open and convert {fileloc}.")
ref_ds = open_zarr_group(out_json, varname)
# Write the Zarr group JSON to a temporary file.
with tempfile.NamedTemporaryFile() as out_json:
gen_json(fileloc, fs, fs2, out_json.name)

# open this monster
print(f"Attempting to open and convert {fileloc}.")
ref_ds = open_zarr_group(out_json.name, varname)

return ref_ds

0 comments on commit 4ed6003

Please sign in to comment.