diff --git a/activestorage/active.py b/activestorage/active.py index 734321f..da6e549 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -1,5 +1,6 @@ import concurrent.futures import os +import fsspec import numpy as np import pathlib import urllib @@ -50,6 +51,18 @@ def load_from_s3(uri, storage_options=None): return ds +def load_from_https(uri): + """ + Load a Dataset from a netCDF4 file on an https server (NGINX). + """ + # TODO: needs testing with an NGINX server behind https:// + fs = fsspec.filesystem('http') + http_file = fs.open(uri, 'rb') # not closed here: the open handle is handed to pyfive.File + ds = pyfive.File(http_file) + print(f"Dataset loaded from https with Pyfive: {uri}") + return ds + + def get_missing_attributes(ds): """" Load all the missing attributes we need from a netcdf file @@ -151,6 +164,8 @@ def __init__( self.filename = self.ds elif input_variable and self.storage_type == "s3": self.filename = self.ds.id._filename + elif input_variable and self.storage_type == "https": + self.filename = self.ds # get storage_options self.storage_options = storage_options @@ -187,6 +202,8 @@ def __load_nc_file(self): nc = pyfive.File(self.uri) elif self.storage_type == "s3": nc = load_from_s3(self.uri, self.storage_options) + elif self.storage_type == "https": + nc = load_from_https(self.uri) self.filename = self.uri self.ds = nc[ncvar] diff --git a/activestorage/storage.py b/activestorage/storage.py index 56a1ff8..c03e1dc 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -1,4 +1,5 @@ """Active storage module.""" +import fsspec import numpy as np import pyfive @@ -34,18 +35,33 @@ def reduce_chunk(rfile, #FIXME: for the moment, open the file every time ... 
we might want to do that, or not # we could just use an instance of pyfive.high_level.Dataset.id # passed directly from active.py, as below - with open(rfile,'rb') as open_file: - # get the data - chunk = read_block(open_file, offset, size) - # reverse any compression and filters - chunk = filter_pipeline(chunk, compression, filters) - # make it a numpy array of bytes - chunk = ensure_ndarray(chunk) - # convert to the appropriate data type - chunk = chunk.view(dtype) - # sort out ordering and convert to the parent hyperslab dimensions - chunk = chunk.reshape(-1, order='A') - chunk = chunk.reshape(shape, order=order) + try: + with open(rfile,'rb') as open_file: + # get the data + chunk = read_block(open_file, offset, size) + # reverse any compression and filters + chunk = filter_pipeline(chunk, compression, filters) + # make it a numpy array of bytes + chunk = ensure_ndarray(chunk) + # convert to the appropriate data type + chunk = chunk.view(dtype) + # sort out ordering and convert to the parent hyperslab dimensions + chunk = chunk.reshape(-1, order='A') + chunk = chunk.reshape(shape, order=order) + except FileNotFoundError: # rfile may be an https URL rather than a local path; retry via fsspec + fs = fsspec.filesystem('http') + with fs.open(rfile, 'rb') as open_file: + # get the data + chunk = read_block(open_file, offset, size) + # reverse any compression and filters + chunk = filter_pipeline(chunk, compression, filters) + # make it a numpy array of bytes + chunk = ensure_ndarray(chunk) + # convert to the appropriate data type + chunk = chunk.view(dtype) + # sort out ordering and convert to the parent hyperslab dimensions + chunk = chunk.reshape(-1, order='A') + chunk = chunk.reshape(shape, order=order) else: class storeinfo: pass storeinfo.byte_offset = offset diff --git a/tests/test_real_https.py b/tests/test_real_https.py new file mode 100644 index 0000000..f4a5b8c --- /dev/null +++ b/tests/test_real_https.py @@ -0,0 +1,31 @@ +import os +import numpy as np + +from activestorage.active import Active +from 
activestorage.active import load_from_https + + +def test_https(): + """Run a real test against an https FILE: min of variable cl over a small slice, opened by URI.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + + active = Active(test_file_uri, "cl", storage_type="https") + active._version = 2 + active._method = "min" + result = active[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + +def test_https_dataset(): + """Run a real test against an https DATASET: variable cl is loaded with load_from_https, then passed to Active.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + dataset = load_from_https(test_file_uri) + av = dataset['cl'] + + active = Active(av, storage_type="https") + active._version = 2 + active._method = "min" + result = active[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32")