Skip to content

Commit

Permalink
Add nczarr support
Browse files Browse the repository at this point in the history
  • Loading branch information
openSourcerer9000 authored and ocefpaf committed May 22, 2024
1 parent b639de2 commit c9d1ebe
Show file tree
Hide file tree
Showing 28 changed files with 781 additions and 499 deletions.
28 changes: 18 additions & 10 deletions compliance_checker/protocols/netcdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,26 @@ def is_netcdf(url):
if url.endswith("nc"):
return True

# Brute force
with open(url, "rb") as f:
magic_number = f.read(4)
if len(magic_number) < 4:
return False
if is_classic_netcdf(magic_number):
return True
elif is_hdf5(magic_number):
return True

try:
# Brute force
with open(url, "rb") as f:
magic_number = f.read(4)
if len(magic_number) < 4:
return False
if is_classic_netcdf(magic_number):
return True
elif is_hdf5(magic_number):
return True
except PermissionError:
# open() fails for both a directory and a local URL, either of which may point to a Zarr dataset
# directory
return False
except OSError:
# local file url
return False

return False


def is_classic_netcdf(file_buffer):
"""
Expand Down
25 changes: 15 additions & 10 deletions compliance_checker/protocols/opendap.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,19 @@ def is_opendap(url):
das_url = url.replace("#fillmismatch", ".das")
else:
das_url = url + ".das"
response = requests.get(das_url, allow_redirects=True)
if "xdods-server" in response.headers:
return True
# Check if it is an access restricted ESGF thredds service
if (
response.status_code == 401
and "text/html" in response.headers["content-type"]
and "The following URL requires authentication:" in response.text
):
return True

try:
response = requests.get(das_url, allow_redirects=True)

if "xdods-server" in response.headers:
return True
# Check if it is an access restricted ESGF thredds service
if (
response.status_code == 401
and "text/html" in response.headers["content-type"]
and "The following URL requires authentication:" in response.text
):
return True
except requests.exceptions.InvalidSchema:
return False # not opendap if url + ".das" isn't found
return False
81 changes: 81 additions & 0 deletions compliance_checker/protocols/zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import zipfile
from pathlib import Path
from urllib.parse import urlparse
from urllib.request import url2pathname
from zipfile import ZipFile

from compliance_checker.protocols import netcdf

#


def is_zarr(url):
    """
    Check if a URL is a zarr dataset.

    :param url: str or Path pointing at the candidate dataset; it is
        converted with ``str()`` before any check is made.
    :return: ``False`` when ``netcdf.is_netcdf`` claims the location.
        Otherwise ``True`` when any of the following holds, checked in order:
        the string contains ``.zarr``; its URL scheme is ``https``, ``s3`` or
        ``file``; it is a zip archive with a ``.zmetadata`` member; it is a
        directory containing a ``.zmetadata`` entry. ``False`` in all other
        cases.
    """
    url = str(url)

    # netCDF detection takes precedence over every zarr heuristic below.
    if netcdf.is_netcdf(url):
        return False

    if ".zarr" in url:
        return True

    # NOTE(review): any URL with one of these schemes is accepted without
    # looking at what it points to - confirm this is intentional.
    if urlparse(url).scheme in ("https", "s3", "file"):
        return True

    if zipfile.is_zipfile(url):
        # Open the archive in a context manager so its file handle is
        # released as soon as the member listing has been read.
        with ZipFile(url) as archive:
            if ".zmetadata" in archive.namelist():
                return True

    candidate = Path(url)
    if candidate.is_dir() and (candidate / ".zmetadata").exists():
        return True

    return False


def as_zarr(url):
    """
    Transform a pointer to a zarr dataset into a valid nczarr url, as
    described in
    https://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in

    Distinct from is_cdl etc in that it returns the appropriate URI rather
    than a boolean.

    Not tested on Windows paths at the moment, as NCZarr is not supported in
    Windows.

    A valid Zarr dataset could be provided in any of the following forms:
        "http://s3.amazonaws.com/bucket/dataset.zarr"
        "http://s3.amazonaws.com/bucket/dataset.zarr#mode=nczarr,s3"
        "/home/path/to/dataset.zarr"
        Path('/home/path/to/dataset.zarr')
        "file:///home/path/to/dataset.zarr"
        "file:///home/path/to/dataset.randomExt#mode=nczarr,file"
        "file:///home/path/to/dataset.zarr#mode=nczarr,zip"

    :param url: str or Path to valid zarr dataset
    :return: ``str(url)`` unchanged when its fragment already contains
        ``mode=nczarr`` and it has a network location or the ``file``
        scheme; otherwise the location with ``#mode=nczarr,<mode>``
        appended, where ``<mode>`` is ``s3`` (url has a network location),
        ``zip`` (path is a zip archive) or ``file`` (path is a directory).
    :raises ValueError: if none of the three modes can be identified.
    """
    original = str(url)
    parsed = urlparse(original)

    # An explicit nczarr fragment on a remote or file:// url is already valid.
    if "mode=nczarr" in parsed.fragment and (
        parsed.netloc or parsed.scheme == "file"
    ):
        return original

    # url2pathname necessary to avoid urlparse bug in windows
    local_path = Path(url2pathname(parsed.path)).resolve()

    # Remote locations keep the caller's url verbatim as the base.
    if parsed.netloc:
        return f"{url}#mode=nczarr,s3"

    if zipfile.is_zipfile(local_path):
        storage = "zip"
    elif local_path.is_dir():
        storage = "file"
    else:
        raise ValueError(
            f"Could not identify {url},\nif #mode=nczarr,zarr, please pass this explicitly\nValid url options are described here\nhttps://www.unidata.ucar.edu/blogs/developer/entry/overview-of-zarr-support-in",
        )

    # Local datasets are expressed as an absolute file:// URI.
    return f"{local_path.as_uri()}#mode=nczarr,{storage}"
10 changes: 9 additions & 1 deletion compliance_checker/suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import inspect
import itertools
import os
import platform
import re
import subprocess
import sys
Expand All @@ -27,7 +28,7 @@

from compliance_checker import __version__, tempnc
from compliance_checker.base import BaseCheck, GenericFile, Result, fix_return_value
from compliance_checker.protocols import cdl, netcdf, opendap
from compliance_checker.protocols import cdl, netcdf, opendap, zarr

# Ensure output is encoded as Unicode when checker output is redirected or piped
if sys.stdout.encoding is None:
Expand Down Expand Up @@ -890,6 +891,13 @@ def load_local_dataset(self, ds_str):
if cdl.is_cdl(ds_str):
ds_str = self.generate_dataset(ds_str)

if zarr.is_zarr(ds_str):
if platform.system() != "Linux":
print(
f"WARNING: {platform.system()} OS detected. NCZarr is not officially supported for your OS as of when this API was written. Your mileage may vary.",
)
return Dataset(zarr.as_zarr(ds_str))

if netcdf.is_netcdf(ds_str):
return Dataset(ds_str)

Expand Down
Loading

0 comments on commit c9d1ebe

Please sign in to comment.