Skip to content

Commit

Permalink
Use a temporary file for Zarr group JSON
Browse files Browse the repository at this point in the history
When using multiple netCDF files with the same name, the Zarr group
JSON file would previously not be overwritten after it was first
written. Subsequent operations could therefore read stale Zarr group
metadata belonging to a different file.

This change switches to a temporary file for storing the Zarr group
metadata. The file's short lifetime is not a problem because the Zarr
datasource is cached in the Active object as the _zds member between
operations.

Closes #134
  • Loading branch information
markgoddard committed Aug 15, 2023
1 parent 8adf0f0 commit 4ed6003
Showing 1 changed file with 26 additions and 32 deletions.
58 changes: 26 additions & 32 deletions activestorage/netcdf_to_zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,31 @@
import ujson
import fsspec
import s3fs
import tempfile

from activestorage.config import *
from kerchunk.hdf import SingleHdf5ToZarr


def gen_json(file_url, fs, fs2, outf, **so):
    """Generate a JSON file containing the kerchunk-ed data for Zarr.

    Parameters
    ----------
    file_url : str
        Location of the source netCDF/HDF5 file (local path or s3:// URL).
    fs : fsspec filesystem
        Filesystem used to read the source file.
    fs2 : fsspec filesystem
        Filesystem used to write the output JSON (local in current callers).
    outf : str
        Path of the JSON file to write. Always (re)written, so stale
        metadata from a previously-seen file of the same name cannot be
        picked up — callers pass a fresh temporary file (see #134).
    **so
        Storage options forwarded to ``fs.open`` (e.g. S3 credentials).

    Returns
    -------
    str
        ``outf``, for caller convenience.

    Raises
    ------
    OSError
        If the source file cannot be opened as HDF5 (e.g. it is
        netCDF3/netCDF-classic).
    """
    with fs.open(file_url, **so) as infile:
        # FIXME need to disentangle HDF5 errors if not OSError (most are)
        try:
            h5chunks = SingleHdf5ToZarr(infile, file_url, inline_threshold=0)
        except OSError as exc:
            raiser_1 = f"Unable to open file {file_url}. "
            raiser_2 = "Check if file is netCDF3 or netCDF-classic"
            print(raiser_1 + raiser_2)
            raise exc

        # inline threshold adjusts the Size below which binary blocks are
        # included directly in the output
        # a higher inline threshold can result in a larger json file but
        # faster loading time
        # for active storage, we don't want anything inline
        with fs2.open(outf, 'wb') as f:
            f.write(ujson.dumps(h5chunks.translate()).encode())

    return outf

Expand Down Expand Up @@ -85,10 +76,13 @@ def load_netcdf_zarr_generic(fileloc, varname, storage_type, build_dummy=True):
so = {}

fs2 = fsspec.filesystem('') # local file system to save final json to
out_json = gen_json(fileloc, fs, fs2, varname)

# open this monster
print(f"Attempting to open and convert {fileloc}.")
ref_ds = open_zarr_group(out_json, varname)
# Write the Zarr group JSON to a temporary file.
with tempfile.NamedTemporaryFile() as out_json:
gen_json(fileloc, fs, fs2, out_json.name)

# open this monster
print(f"Attempting to open and convert {fileloc}.")
ref_ds = open_zarr_group(out_json.name, varname)

return ref_ds

0 comments on commit 4ed6003

Please sign in to comment.