diff --git a/activestorage/netcdf_to_zarr.py b/activestorage/netcdf_to_zarr.py index c0605419..ccfbe49e 100644 --- a/activestorage/netcdf_to_zarr.py +++ b/activestorage/netcdf_to_zarr.py @@ -4,40 +4,31 @@ import ujson import fsspec import s3fs +import tempfile from activestorage.config import * from kerchunk.hdf import SingleHdf5ToZarr -def gen_json(file_url, fs, fs2, varname, **so): +def gen_json(file_url, fs, fs2, outf, **so): """Generate a json file that contains the kerchunk-ed data for Zarr.""" - # set some name for the output json file - fname = os.path.splitext(file_url)[0] - if "s3:" in fname: - fname = os.path.basename(fname) - outf = f'{fname}_{varname}.json' # vanilla file name - - # write it out if it's not there - if not os.path.isfile(outf): - with fs.open(file_url, **so) as infile: - # FIXME need to disentangle HDF5 errors if not OSError (most are) - try: - h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0) - except OSError as exc: - raiser_1 = f"Unable to open file {file_url}. " - raiser_2 = "Check if file is netCDF3 or netCDF-classic" - print(raiser_1 + raiser_2) - raise exc - - # inline threshold adjusts the Size below which binary blocks are - # included directly in the output - # a higher inline threshold can result in a larger json file but - # faster loading time - # for active storage, we don't want anything inline -# fname = os.path.splitext(file_url)[0] -# outf = f'{fname}_{varname}.json' # vanilla file name - with fs2.open(outf, 'wb') as f: - f.write(ujson.dumps(h5chunks.translate()).encode()) + with fs.open(file_url, **so) as infile: + # FIXME need to disentangle HDF5 errors if not OSError (most are) + try: + h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0) + except OSError as exc: + raiser_1 = f"Unable to open file {file_url}. " + raiser_2 = "Check if file is netCDF3 or netCDF-classic" + print(raiser_1 + raiser_2) + raise exc + + # inline threshold adjusts the Size below which binary blocks are + # included directly in the output + # a higher inline threshold can result in a larger json file but + # faster loading time + # for active storage, we don't want anything inline + with fs2.open(outf, 'wb') as f: + f.write(ujson.dumps(h5chunks.translate()).encode()) return outf @@ -85,10 +76,13 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True): so = {} fs2 = fsspec.filesystem('') # local file system to save final json to - out_json = gen_json(fileloc, fs, fs2, varname) - # open this monster - print(f"Attempting to open and convert {fileloc}.") - ref_ds = open_zarr_group(out_json, varname) + # Write the Zarr group JSON to a temporary file. + with tempfile.NamedTemporaryFile() as out_json: + gen_json(fileloc, fs, fs2, out_json.name) + + # open this monster + print(f"Attempting to open and convert {fileloc}.") + ref_ds = open_zarr_group(out_json.name, varname) return ref_ds