diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 95aa5a57438..043d47ffca5 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -52,6 +52,8 @@ New Features
 ~~~~~~~~~~~~
 - Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`).
   By `Kai Mühlbauer `_ and `Spencer Clark `_.
+- Add new ``errors`` argument to :py:func:`open_mfdataset` to better handle invalid files
+  (:issue:`6736`, :pull:`9955`). By `Pratiman Patel `_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 3211b9efbae..30a74e779ab 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import warnings
 from collections.abc import (
     Callable,
     Hashable,
@@ -60,6 +61,7 @@
 from xarray.core.types import (
     CombineAttrsOptions,
     CompatOptions,
+    ErrorOptionsWithWarn,
     JoinOptions,
     NestedSequence,
     ReadBuffer,
@@ -103,8 +105,7 @@ def _get_default_engine_remote_uri() -> Literal["netcdf4", "pydap"]:
             engine = "pydap"
         except ImportError as err:
             raise ValueError(
-                "netCDF4 or pydap is required for accessing "
-                "remote datasets via OPeNDAP"
+                "netCDF4 or pydap is required for accessing remote datasets via OPeNDAP"
             ) from err
     return engine
 
@@ -1393,6 +1394,7 @@
     join: JoinOptions = "outer",
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
+    errors: ErrorOptionsWithWarn = "raise",
     **kwargs,
 ) -> Dataset:
     """Open multiple files as a single dataset.
@@ -1519,7 +1521,12 @@
         If a callable, it must expect a sequence of ``attrs`` dicts and a context object
         as its only parameters.
-    **kwargs : optional
+    errors : {'raise', 'warn', 'ignore'}, default 'raise'
+        - If 'raise', an exception is raised if any dataset cannot be opened.
+        - If 'warn', a warning is issued for each dataset that cannot be opened.
+        - If 'ignore', datasets that cannot be opened are skipped.
+
+    **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`. For an
         overview of some of the possible options, see the documentation of
         :py:func:`xarray.open_dataset`
 
@@ -1611,7 +1618,31 @@
     open_ = open_dataset
     getattr_ = getattr
 
-    datasets = [open_(p, **open_kwargs) for p in paths1d]
+    if errors not in ("raise", "warn", "ignore"):
+        raise ValueError(
+            f"'errors' must be 'raise', 'warn' or 'ignore', got '{errors}'"
+        )
+
+    datasets = []
+    invalid_indices = []
+    for i, p in enumerate(paths1d):
+        try:
+            datasets.append(open_(p, **open_kwargs))
+        except Exception:
+            if errors == "raise":
+                raise
+            if errors == "warn":
+                warnings.warn(
Ignoring.", UserWarning, stacklevel=2 + ) + continue + else: + continue + closers = [getattr_(ds, "_close") for ds in datasets] if preprocess is not None: datasets = [preprocess(ds) for ds in datasets] @@ -1649,8 +1680,7 @@ def open_mfdataset( ) else: raise ValueError( - f"{combine} is an invalid option for the keyword argument" - " ``combine``" + f"{combine} is an invalid option for the keyword argument ``combine``" ) except ValueError: for ds in datasets: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 72078da11b9..dce99ccc624 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -4831,6 +4831,68 @@ def test_open_mfdataset_2(self) -> None: ) as actual: assert_identical(original, actual) + def test_open_mfdataset_with_ignore(self) -> None: + original = Dataset({"foo": ("x", np.random.randn(10))}) + with create_tmp_files(2) as (tmp1, tmp2): + ds1 = original.isel(x=slice(5)) + ds2 = original.isel(x=slice(5, 10)) + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset( + [tmp1, tmp2, "non-existent-file.nc"], + concat_dim="x", + combine="nested", + errors="ignore", + ) as actual: + assert_identical(original, actual) + + def test_open_mfdataset_with_warn(self) -> None: + original = Dataset({"foo": ("x", np.random.randn(10))}) + with pytest.warns(UserWarning, match="Ignoring."): + with create_tmp_files(2) as (tmp1, tmp2): + ds1 = original.isel(x=slice(5)) + ds2 = original.isel(x=slice(5, 10)) + ds1.to_netcdf(tmp1) + ds2.to_netcdf(tmp2) + with open_mfdataset( + [tmp1, tmp2, "non-existent-file.nc"], + concat_dim="x", + combine="nested", + errors="warn", + ) as actual: + assert_identical(original, actual) + + def test_open_mfdataset_2d_with_ignore(self) -> None: + original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) + with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4): + original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset( + [[tmp1, tmp2], [tmp3, tmp4, "non-existent-file.nc"]], + combine="nested", + concat_dim=["y", "x"], + errors="ignore", + ) as actual: + assert_identical(original, actual) + + def test_open_mfdataset_2d_with_warn(self) -> None: + original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))}) + with pytest.warns(UserWarning, match="Ignoring."): + with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4): + original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1) + original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2) + original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3) + original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4) + with open_mfdataset( + [[tmp1, tmp2], [tmp3, tmp4, "non-existent-file.nc"]], + combine="nested", + concat_dim=["y", "x"], + errors="warn", + ) as actual: + assert_identical(original, actual) + def test_attrs_mfdataset(self) -> None: original = Dataset({"foo": ("x", np.random.randn(10))}) with create_tmp_file() as tmp1: