diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index ad83cfac531..8a80296f6fd 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -12,6 +12,10 @@ v2025.07.1 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
+- Added new ``errors`` argument to ``xarray.open_mfdataset`` to better handle invalid files:
+  {'raise', 'warn', 'ignore'}, default 'raise'. If 'raise', an invalid dataset raises an exception;
+  if 'warn', a warning is issued for each invalid dataset; if 'ignore', invalid datasets are ignored.
+  By `Pratiman Patel `_.
 
 
 Breaking changes
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index b80ec927b1e..06b640ba619 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -42,7 +42,7 @@
 from xarray.core.indexes import Index
 from xarray.core.treenode import group_subtrees
 from xarray.core.types import NetcdfWriteModes, ZarrWriteModes
-from xarray.core.utils import is_remote_uri
+from xarray.core.utils import emit_user_level_warning, is_remote_uri
 from xarray.namedarray.daskmanager import DaskManager
 from xarray.namedarray.parallelcompat import guess_chunkmanager
 from xarray.structure.chunks import _get_chunk, _maybe_chunk
@@ -62,6 +62,7 @@
     from xarray.core.types import (
         CombineAttrsOptions,
         CompatOptions,
+        ErrorOptionsWithWarn,
         JoinOptions,
         NestedSequence,
         ReadBuffer,
@@ -1390,6 +1391,23 @@ def open_groups(
     return groups
 
 
+def _remove_path(paths, path_to_remove) -> list:
+    # Initialize an empty list to store the result
+    result = []
+
+    for item in paths:
+        if isinstance(item, list):
+            # If the current item is a list, recursively call _remove_path on it
+            nested_result = _remove_path(item, path_to_remove)
+            if nested_result:  # Only add non-empty lists to avoid adding empty lists
+                result.append(nested_result)
+        elif item != path_to_remove:
+            # Add the item to the result if it is not the path being removed
+            result.append(item)
+
+    return result
+
+
 def open_mfdataset(
     paths: str
     | os.PathLike
@@ -1415,6 +1433,7 @@
     join: JoinOptions = "outer",
     attrs_file: str | os.PathLike | None = None,
     combine_attrs: CombineAttrsOptions = "override",
+    errors: ErrorOptionsWithWarn = "raise",
     **kwargs,
 ) -> Dataset:
     """Open multiple files as a single dataset.
@@ -1541,6 +1560,12 @@
 
         If a callable, it must expect a sequence of ``attrs`` dicts and a context object
         as its only parameters.
+    errors : {"raise", "warn", "ignore"}, default: "raise"
+        String indicating how to handle errors when opening datasets.
+
+        - "raise": an invalid dataset will raise an exception.
+        - "warn": a warning will be issued for each invalid dataset.
+        - "ignore": invalid datasets will be ignored.
     **kwargs : optional
         Additional arguments passed on to :py:func:`xarray.open_dataset`. For an
         overview of some of the possible options, see the documentation of
@@ -1633,7 +1658,27 @@
         open_ = open_dataset
         getattr_ = getattr
 
-    datasets = [open_(p, **open_kwargs) for p in paths1d]
+    if errors not in ("raise", "warn", "ignore"):
+        raise ValueError(
+            f"'errors' must be 'raise', 'warn' or 'ignore', got '{errors}'"
+        )
+
+    datasets = []
+    remove_paths = False
+    for p in paths1d:
+        try:
+            ds = open_(p, **open_kwargs)
+            datasets.append(ds)
+        except Exception as e:
+            if errors == "raise":
+                raise
+            elif errors == "warn":
+                emit_user_level_warning(f"Could not open {p} due to {e}. Ignoring.")
+            # remove invalid paths
+            if combine == "nested":
+                paths = _remove_path(paths, p)
+                remove_paths = True
+
     closers = [getattr_(ds, "_close") for ds in datasets]
     if preprocess is not None:
         datasets = [preprocess(ds) for ds in datasets]
@@ -1646,6 +1691,11 @@
     # Combine all datasets, closing them in case of a ValueError
     try:
         if combine == "nested":
+            # Create new ids and paths based on removed items
+            if remove_paths:
+                combined_ids_paths = _infer_concat_order_from_positions(paths)
+                ids = list(combined_ids_paths.keys())
+
             # Combined nested list by successive concat and merge operations
             # along each dimension, using structure given by "ids"
             combined = _nested_combine(
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 785b06a26fd..2701fa8d001 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -5118,6 +5118,68 @@ def test_open_mfdataset_2(self) -> None:
         ) as actual:
             assert_identical(original, actual)
 
+    def test_open_mfdataset_with_ignore(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with create_tmp_files(2) as (tmp1, tmp2):
+            ds1 = original.isel(x=slice(5))
+            ds2 = original.isel(x=slice(5, 10))
+            ds1.to_netcdf(tmp1)
+            ds2.to_netcdf(tmp2)
+            with open_mfdataset(
+                [tmp1, "non-existent-file.nc", tmp2],
+                concat_dim="x",
+                combine="nested",
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_with_warn(self) -> None:
+        original = Dataset({"foo": ("x", np.random.randn(10))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(2) as (tmp1, tmp2):
+                ds1 = original.isel(x=slice(5))
+                ds2 = original.isel(x=slice(5, 10))
+                ds1.to_netcdf(tmp1)
+                ds2.to_netcdf(tmp2)
+                with open_mfdataset(
+                    [tmp1, "non-existent-file.nc", tmp2],
+                    concat_dim="x",
+                    combine="nested",
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_ignore(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+            original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+            original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+            original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+            original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+            with open_mfdataset(
+                [[tmp1, tmp2], ["non-existent-file.nc", tmp3, tmp4]],
+                combine="nested",
+                concat_dim=["y", "x"],
+                errors="ignore",
+            ) as actual:
+                assert_identical(original, actual)
+
+    def test_open_mfdataset_2d_with_warn(self) -> None:
+        original = Dataset({"foo": (["x", "y"], np.random.randn(10, 8))})
+        with pytest.warns(UserWarning, match="Ignoring."):
+            with create_tmp_files(4) as (tmp1, tmp2, tmp3, tmp4):
+                original.isel(x=slice(5), y=slice(4)).to_netcdf(tmp1)
+                original.isel(x=slice(5, 10), y=slice(4)).to_netcdf(tmp2)
+                original.isel(x=slice(5), y=slice(4, 8)).to_netcdf(tmp3)
+                original.isel(x=slice(5, 10), y=slice(4, 8)).to_netcdf(tmp4)
+                with open_mfdataset(
+                    [[tmp1, tmp2, "non-existent-file.nc"], [tmp3, tmp4]],
+                    combine="nested",
+                    concat_dim=["y", "x"],
+                    errors="warn",
+                ) as actual:
+                    assert_identical(original, actual)
+
     def test_attrs_mfdataset(self) -> None:
         original = Dataset({"foo": ("x", np.random.randn(10))})
         with create_tmp_file() as tmp1: