From 42c5cd492fe1036081c208cf1700525baf2d41b2 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:04:19 +0000 Subject: [PATCH 1/6] start implementing https functionality --- activestorage/active.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/activestorage/active.py b/activestorage/active.py index 734321f..c275ce7 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -1,5 +1,6 @@ import concurrent.futures import os +import fsspec import numpy as np import pathlib import urllib @@ -50,6 +51,18 @@ def load_from_s3(uri, storage_options=None): return ds +def load_from_https(uri): + """ + Load a Dataset from a netCDF4 file on an https server (NGINX). + """ + #TODO need to test if NGINX server behind https:// + fs = fsspec.filesystem('http') + http_file = fs.open(uri, 'rb') + ds = pyfive.File(http_file) + print(f"Dataset loaded from https with Pyfive: {uri}") + return ds + + def get_missing_attributes(ds): """" Load all the missing attributes we need from a netcdf file @@ -187,6 +200,8 @@ def __load_nc_file(self): nc = pyfive.File(self.uri) elif self.storage_type == "s3": nc = load_from_s3(self.uri, self.storage_options) + elif self.storage_type == "https": + nc = load_from_https(self.uri) self.filename = self.uri self.ds = nc[ncvar] From 3c3325f193f3156eba6ed997e238ebcb8191c16f Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:04:39 +0000 Subject: [PATCH 2/6] add dedicated https test module --- tests/test_real_https.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 tests/test_real_https.py diff --git a/tests/test_real_https.py b/tests/test_real_https.py new file mode 100644 index 0000000..d5501a9 --- /dev/null +++ b/tests/test_real_https.py @@ -0,0 +1,19 @@ +import os +import numpy as np + +from activestorage.active import Active +from activestorage.active import load_from_https + + +def test_https_dataset(): + """Run a true test with a https dataset.""" + 
test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + dataset = load_from_https(test_file_uri) + av = dataset['cl'] + + active = Active(av, storage_type="https") + active._version = 2 + active._method = "min" + result = active[0:3, 4:6, 7:9] + print("Result is", result) + assert result == 5098.625 From b0c96843510b4246f9043882293da72c4d711764 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:20:27 +0000 Subject: [PATCH 3/6] working test with local reduction --- tests/test_real_https.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_real_https.py b/tests/test_real_https.py index d5501a9..78787e2 100644 --- a/tests/test_real_https.py +++ b/tests/test_real_https.py @@ -16,4 +16,4 @@ def test_https_dataset(): active._method = "min" result = active[0:3, 4:6, 7:9] print("Result is", result) - assert result == 5098.625 + assert result == np.array([0.6909787], dtype="float32") From 61bf1712a0754a68823402eba8e7def81ef7d4d1 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:20:50 +0000 Subject: [PATCH 4/6] first working https prototype --- activestorage/active.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/activestorage/active.py b/activestorage/active.py index c275ce7..da6e549 100644 --- a/activestorage/active.py +++ b/activestorage/active.py @@ -164,6 +164,8 @@ def __init__( self.filename = self.ds elif input_variable and self.storage_type == "s3": self.filename = self.ds.id._filename + elif input_variable and self.storage_type == "https": + self.filename = self.ds # get storage_options self.storage_options = storage_options From 3f9d619bdd1886807ebbf974665a0075a932fab3 Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:37:05 +0000 Subject: [PATCH 5/6] openhttps file --- activestorage/storage.py | 40 
++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/activestorage/storage.py b/activestorage/storage.py index 56a1ff8..c03e1dc 100644 --- a/activestorage/storage.py +++ b/activestorage/storage.py @@ -1,4 +1,5 @@ """Active storage module.""" +import fsspec import numpy as np import pyfive @@ -34,18 +35,33 @@ def reduce_chunk(rfile, #FIXME: for the moment, open the file every time ... we might want to do that, or not # we could just use an instance of pyfive.high_level.Dataset.id # passed directly from active.py, as below - with open(rfile,'rb') as open_file: - # get the data - chunk = read_block(open_file, offset, size) - # reverse any compression and filters - chunk = filter_pipeline(chunk, compression, filters) - # make it a numpy array of bytes - chunk = ensure_ndarray(chunk) - # convert to the appropriate data type - chunk = chunk.view(dtype) - # sort out ordering and convert to the parent hyperslab dimensions - chunk = chunk.reshape(-1, order='A') - chunk = chunk.reshape(shape, order=order) + try: + with open(rfile,'rb') as open_file: + # get the data + chunk = read_block(open_file, offset, size) + # reverse any compression and filters + chunk = filter_pipeline(chunk, compression, filters) + # make it a numpy array of bytes + chunk = ensure_ndarray(chunk) + # convert to the appropriate data type + chunk = chunk.view(dtype) + # sort out ordering and convert to the parent hyperslab dimensions + chunk = chunk.reshape(-1, order='A') + chunk = chunk.reshape(shape, order=order) + except FileNotFoundError: # could be an https file + fs = fsspec.filesystem('http') + with fs.open(rfile, 'rb') as open_file: + # get the data + chunk = read_block(open_file, offset, size) + # reverse any compression and filters + chunk = filter_pipeline(chunk, compression, filters) + # make it a numpy array of bytes + chunk = ensure_ndarray(chunk) + # convert to the appropriate data type + chunk = chunk.view(dtype) + # sort out ordering and 
convert to the parent hyperslab dimensions + chunk = chunk.reshape(-1, order='A') + chunk = chunk.reshape(shape, order=order) else: class storeinfo: pass storeinfo.byte_offset = offset From 2fd00f003bc2e1f5c4c75591effd2a5a89c66f8a Mon Sep 17 00:00:00 2001 From: Valeriu Predoi Date: Mon, 10 Mar 2025 16:37:20 +0000 Subject: [PATCH 6/6] test for actual https file --- tests/test_real_https.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_real_https.py b/tests/test_real_https.py index 78787e2..f4a5b8c 100644 --- a/tests/test_real_https.py +++ b/tests/test_real_https.py @@ -5,8 +5,20 @@ from activestorage.active import load_from_https +def test_https(): + """Run a true test with a https FILE.""" + test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" + + active = Active(test_file_uri, "cl", storage_type="https") + active._version = 2 + active._method = "min" + result = active[0:3, 4:6, 7:9] + print("Result is", result) + assert result == np.array([0.6909787], dtype="float32") + + def test_https_dataset(): - """Run a true test with a https dataset.""" + """Run a true test with a https DATASET.""" test_file_uri = "https://esgf.ceda.ac.uk/thredds/fileServer/esg_cmip6/CMIP6/AerChemMIP/MOHC/UKESM1-0-LL/ssp370SST-lowNTCF/r1i1p1f2/Amon/cl/gn/latest/cl_Amon_UKESM1-0-LL_ssp370SST-lowNTCF_r1i1p1f2_gn_205001-209912.nc" dataset = load_from_https(test_file_uri) av = dataset['cl']