From 4ed6003840e1694502acf5d00be5df40642ed230 Mon Sep 17 00:00:00 2001 From: Mark Goddard Date: Tue, 15 Aug 2023 12:56:51 +0100 Subject: [PATCH] Use a temporary file for Zarr group JSON When using multiple netCDF files with the same names, the Zarr group JSON file would previously not be overwritten after it was first written. This would lead to subsequent uses potentially using an invalid Zarr group metadata file. This change switches to use a temporary file to store the Zarr group metadata. This should not be a problem because the Zarr datasource is cached in the Active object as the _zds member between operations. Closes #134 --- activestorage/netcdf_to_zarr.py | 58 +++++++++++++++------------------ 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/activestorage/netcdf_to_zarr.py b/activestorage/netcdf_to_zarr.py index c0605419..ccfbe49e 100644 --- a/activestorage/netcdf_to_zarr.py +++ b/activestorage/netcdf_to_zarr.py @@ -4,40 +4,31 @@ import ujson import fsspec import s3fs +import tempfile from activestorage.config import * from kerchunk.hdf import SingleHdf5ToZarr -def gen_json(file_url, fs, fs2, varname, **so): +def gen_json(file_url, fs, fs2, outf, **so): """Generate a json file that contains the kerchunk-ed data for Zarr.""" - # set some name for the output json file - fname = os.path.splitext(file_url)[0] - if "s3:" in fname: - fname = os.path.basename(fname) - outf = f'{fname}_{varname}.json' # vanilla file name - - # write it out if it's not there - if not os.path.isfile(outf): - with fs.open(file_url, **so) as infile: - # FIXME need to disentangle HDF5 errors if not OSError (most are) - try: - h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0) - except OSError as exc: - raiser_1 = f"Unable to open file {file_url}. " - raiser_2 = "Check if file is netCDF3 or netCDF-classic" - print(raiser_1 + raiser_2) - raise exc - - # inline threshold adjusts the Size below which binary blocks are - # included directly in the output - # a higher inline threshold can result in a larger json file but - # faster loading time - # for active storage, we don't want anything inline -# fname = os.path.splitext(file_url)[0] -# outf = f'{fname}_{varname}.json' # vanilla file name - with fs2.open(outf, 'wb') as f: - f.write(ujson.dumps(h5chunks.translate()).encode()) + with fs.open(file_url, **so) as infile: + # FIXME need to disentangle HDF5 errors if not OSError (most are) + try: + h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0) + except OSError as exc: + raiser_1 = f"Unable to open file {file_url}. " + raiser_2 = "Check if file is netCDF3 or netCDF-classic" + print(raiser_1 + raiser_2) + raise exc + + # inline threshold adjusts the Size below which binary blocks are + # included directly in the output + # a higher inline threshold can result in a larger json file but + # faster loading time + # for active storage, we don't want anything inline + with fs2.open(outf, 'wb') as f: + f.write(ujson.dumps(h5chunks.translate()).encode()) return outf @@ -85,10 +76,13 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True): so = {} fs2 = fsspec.filesystem('') # local file system to save final json to - out_json = gen_json(fileloc, fs, fs2, varname) - # open this monster - print(f"Attempting to open and convert {fileloc}.") - ref_ds = open_zarr_group(out_json, varname) + # Write the Zarr group JSON to a temporary file. + with tempfile.NamedTemporaryFile() as out_json: + gen_json(fileloc, fs, fs2, out_json.name) + + # open this monster + print(f"Attempting to open and convert {fileloc}.") + ref_ds = open_zarr_group(out_json.name, varname) return ref_ds