diff --git a/dfm_tools/download.py b/dfm_tools/download.py index 609d061a..2e39b9e6 100644 --- a/dfm_tools/download.py +++ b/dfm_tools/download.py @@ -71,6 +71,9 @@ def download_ERA5(varkey, raise KeyError(f'"{varkey}" not available, choose from: {", ".join(variables_dict.keys())}') period_range = pd.period_range(date_min,date_max,freq='M') + if len(period_range) == 0: + raise ValueError(f"requested time extents ({date_min} to {date_max}) " + "resulted in empty period_range") print(f'retrieving data from {period_range[0]} to {period_range[-1]} (freq={period_range.freq})') #make sure the data fully covers the desired spatial extent. Download 1 additional grid cell (resolution is 1/4 degrees) in both directions diff --git a/dfm_tools/xarray_helpers.py b/dfm_tools/xarray_helpers.py index 05790670..d2fac439 100644 --- a/dfm_tools/xarray_helpers.py +++ b/dfm_tools/xarray_helpers.py @@ -96,25 +96,49 @@ def preprocess_hisnc(ds): def preprocess_ERA5(ds): """ - Reduces the expver dimension in some of the ERA5 data (mtpr and other variables), which occurs in files with very recent data. The dimension contains the unvalidated data from the latest month in the second index in the expver dimension. The reduction is done with mean, but this is arbitrary, since there is only one valid value per timestep and the other one is nan. + Aligning ERA5 datasets before merging them. These operations are currently + (2025) only required when (also) using previously retrieved ERA5 data. + + In recent datasets retrieved from ERA5 the time dimension and variable are + now called valid_time. This is inconvenient since it causes issues when + merging with previously retrieved datasets. However, it is not necessary + for successfully running a Delft3D FM simulation. + + Reducing the expver dimension: In the past, the expver dimension was + present if you downloaded ERA5 data that consisted of a mix of ERA5 and + ERA5T data. 
This dimension was also present in the data variables, so it + broke code. Therefore this dimension is reduced with a mean operation. + Any reduction operation would do the trick since there is only one valid + value per timestep and the other one is nan. In datasets downloaded + currently (2025) the expver dimension is not present anymore, + but an expver variable is present defining whether the data comes + from ERA5 (1) or ERA5T (5). + + Removing scale_factor and add_offset: In the past, the ERA5 data was + supplied as integers with a scaling and offset that was different for + each downloaded file. This caused serious issues with merging files, + since the scaling/offset from the first file was assumed to be valid + for the others also, leading to invalid values. Only relevant for old + files. More info at https://github.com/Deltares/dfm_tools/issues/239. """ - if 'expver' in ds.dims: - # TODO: this drops int encoding which leads to unzipped float32 netcdf files: https://github.com/Deltares/dfm_tools/issues/781 - ds = ds.mean(dim='expver') - # datasets retrieved with new cds-beta have valid_time instead of time dimn/varn - # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra - # TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed + # datasets retrieved with new CDS have valid_time instead of time dim/var + # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5 if 'valid_time' in ds.coords: ds = ds.rename({'valid_time':'time'}) - # Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239) - # By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset) - # ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files. 
+ # reduce the expver dimension (not present in newly retrieved files) + if 'expver' in ds.dims: + ds = ds.mean(dim='expver') + + # drop scaling/offset encoding if present and converting to float32. Not + # present in newly retrieved files, variables are zipped float32 instead for var in ds.data_vars.keys(): - if not set(['dtype','scale_factor','add_offset']).issubset(ds.variables[var].encoding.keys()): + list_attrs = ['dtype','scale_factor','add_offset'] + if not set(list_attrs).issubset(ds.variables[var].encoding.keys()): continue - # the _FillValue will still be -32767 (int default), but this is no issue for float32 + # the _FillValue will still be -32767 (int default) + # this is no issue for float32 ds[var].encoding.pop('scale_factor') ds[var].encoding.pop('add_offset') ds[var].encoding["dtype"] = "float32" diff --git a/docs/notebooks/modelbuilder_example.ipynb b/docs/notebooks/modelbuilder_example.ipynb index dce5d14c..26c02c2f 100644 --- a/docs/notebooks/modelbuilder_example.ipynb +++ b/docs/notebooks/modelbuilder_example.ipynb @@ -70,7 +70,7 @@ "dir_output = os.path.abspath(f'./{model_name}_model')\n", "# path_style = 'windows' # windows / unix\n", "overwrite = False # overwrite the downloaded forcing data or not. 
Always set to True when changing the domain\n", - "crs = 'EPSG:4326' # coordinate reference system\n", + "crs = 'EPSG:4326' # coordinate reference system, only EPSG 4326 (WGS84) is currently supported by the ERA5 and CMEMS download scripts.\n", "\n", "# domain and resolution\n", "# the actual maximum extents can slightly vary: see dfmt.meshkernel_get_bbox() below\n", @@ -144,7 +144,7 @@ ], "source": [ "# generate spherical regular grid\n", - "mk_object = dfmt.make_basegrid(lon_min, lon_max, lat_min, lat_max, dx=dxy, dy=dxy, crs=crs)\n", + "mk_object = dfmt.make_basegrid(lon_min=lon_min, lon_max=lon_max, lat_min=lat_min, lat_max=lat_max, dx=dxy, dy=dxy, crs=crs)\n", "\n", "# retrieve actual lat/lon bounds from grid, the lon_max and lat_max are likely larger than requested\n", "lon_min, lat_min, lon_max, lat_max = dfmt.meshkernel_get_bbox(mk_object)\n", diff --git a/tests/test_download.py b/tests/test_download.py index b2ad0375..f190d022 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -351,16 +351,35 @@ def test_download_hycom(tmp_path): def test_download_era5_unsupported_varkey(): date_min = '2010-01-31' date_max = '2010-02-01' - longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52 #test domain + longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52 varkey = 'unexisting' with pytest.raises(KeyError) as e: - dfmt.download_ERA5(varkey, - longitude_min=longitude_min, longitude_max=longitude_max, latitude_min=latitude_min, latitude_max=latitude_max, - date_min=date_min, date_max=date_max, - dir_output='.', overwrite=True) + dfmt.download_ERA5( + varkey, + longitude_min=longitude_min, longitude_max=longitude_max, + latitude_min=latitude_min, latitude_max=latitude_max, + date_min=date_min, date_max=date_max, + dir_output='.', overwrite=True) assert '"unexisting" not available' in str(e.value) +@pytest.mark.requiressecrets +@pytest.mark.unittest +def test_download_era5_incorrect_times(): + date_min = '2010-01-01' + 
date_max = '2009-01-01' + longitude_min, longitude_max, latitude_min, latitude_max = 2, 3, 51, 52 + varkey = 'msl' + with pytest.raises(ValueError) as e: + dfmt.download_ERA5( + varkey, + longitude_min=longitude_min, longitude_max=longitude_max, + latitude_min=latitude_min, latitude_max=latitude_max, + date_min=date_min, date_max=date_max, + dir_output='.', overwrite=True) + assert 'resulted in empty period_range' in str(e.value) + + @pytest.mark.requiressecrets @pytest.mark.unittest @pytest.mark.era5slow # temporarily skip these on github @@ -372,21 +391,19 @@ def test_download_era5(file_nc_era5_pattern): ds = xr.open_mfdataset(file_nc_era5_pattern) - assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5 + # datasets retrieved with intermediate CDS had expver dimension causing issues + assert 'expver' not in ds.dims # TODO: if this fails, update the docstring of preprocess_ERA5 - timedim = 'time' - # datasets retrieved with new cds-beta have valid_time instead of time dimn/varn - # https://forum.ecmwf.int/t/new-time-format-in-era5-netcdf-files/3796/5?u=jelmer_veenstra - # TODO: can be removed after https://github.com/Unidata/netcdf4-python/issues/1357 or https://forum.ecmwf.int/t/3796 is fixed - if 'valid_time' in ds.dims: - timedim = 'valid_time' + # datasets retrieved with new CDS have valid_time instead of time dim/var + assert 'valid_time' in ds.dims # TODO: if this fails, remove the exception below and in preprocess_ERA5 + timedim = 'valid_time' assert ds.sizes[timedim] == 1416 assert ds[timedim].to_pandas().iloc[0] == pd.Timestamp('2010-01-01') assert ds[timedim].to_pandas().iloc[-1] == pd.Timestamp('2010-02-28 23:00') - # check if there are no integers in the dataset anymore - # this was the case before CDS-beta in https://github.com/Deltares/dfm_tools/issues/239 + # datasets retrieved with new CDS are float32 instead of scaled ints + # ints raised problem in https://github.com/Deltares/dfm_tools/issues/239 
msl_encoding = ds['msl'].encoding assert str(msl_encoding['dtype']) == 'float32' assert 'scale_factor' not in msl_encoding.keys()