diff --git a/copernicusmarine/command_line_interface/group_subset.py b/copernicusmarine/command_line_interface/group_subset.py index c2b66530..89e3b49f 100644 --- a/copernicusmarine/command_line_interface/group_subset.py +++ b/copernicusmarine/command_line_interface/group_subset.py @@ -249,10 +249,10 @@ def cli_subset() -> None: "--netcdf-compression-level", type=click.IntRange(0, 9), is_flag=False, - flag_value=4, + flag_value=1, default=0, help=documentation_utils.SUBSET["NETCDF_COMPRESSION_LEVEL_HELP"] - + " If used as a flag, the assigned value will be 4.", + + " If used as a flag, the assigned value will be 1.", ) @click.option( "--netcdf3-compatible", diff --git a/copernicusmarine/core_functions/models.py b/copernicusmarine/core_functions/models.py index ee5abc4c..ca29a691 100644 --- a/copernicusmarine/core_functions/models.py +++ b/copernicusmarine/core_functions/models.py @@ -180,6 +180,8 @@ class ResponseSubset(BaseModel): #: File name. filename: str #: Estimation of the size of the final result file in MB. + #: This estimation may not be accurate if you save the result as + #: a compressed NetCDF file. file_size: Optional[float] #: Estimation of the maximum amount of data needed to #: get the final result in MB. diff --git a/copernicusmarine/download_functions/download_arco_series.py b/copernicusmarine/download_functions/download_arco_series.py index 3f3c44de..b04785e5 100644 --- a/copernicusmarine/download_functions/download_arco_series.py +++ b/copernicusmarine/download_functions/download_arco_series.py @@ -539,22 +539,8 @@ def _download_dataset_as_netcdf( shuffle=True, ) keys_to_keep = { - "_FillValue", - "blosc_shuffle", - "chunksizes", - "complevel", - "compression", - "compression_opts", - "contiguous", - "dtype", - "endian", - "fletcher32", - "quantize_mode", - "shuffle", - "significant_digits", - "szip_coding", - "szip_pixels_per_block", - "zlib", + "scale_factor", + "add_offset", } encoding = { name: { diff --git a/doc/usage/subset-usage.rst b/doc/usage/subset-usage.rst index efb29725..dd6cce8a 100644 --- a/doc/usage/subset-usage.rst +++ b/doc/usage/subset-usage.rst @@ -58,11 +58,11 @@ About ``--netcdf-compression-level`` options If writing data to a NetCDF file (the default format), the ``--netcdf-compression-level`` option can be set to compress the downloaded file. This reduces file size but increases writing time. Without this option, the file is written faster but with a larger size. For Zarr format ('.zarr' extension), the default compression of the Copernicus Marine Data Store is applied, making the download fast and compressed without using ``--netcdf-compression-level``. -Default NetCDF compression settings for xarray: +Default NetCDF compression settings for the toolbox are: .. code-block:: text - {'zlib': True, 'complevel': 4, 'contiguous': False, 'shuffle': True} + {'zlib': True, 'complevel': 1, 'contiguous': False, 'shuffle': True} Set the ``--netcdf-compression-level`` to a custom compression level between 0 (no compression, by default) and 9 (maximum compression). diff --git a/tests/__snapshots__/test_help_command_interface.ambr b/tests/__snapshots__/test_help_command_interface.ambr index 4999b502..482302f8 100644 --- a/tests/__snapshots__/test_help_command_interface.ambr +++ b/tests/__snapshots__/test_help_command_interface.ambr @@ -365,7 +365,7 @@ ' NetCDF output file. A value of 0 means no', ' compression, and 9 is the highest level of', ' compression available. If used as a flag,', - ' the assigned value will be 4. [0<=x<=9]', + ' the assigned value will be 1. [0<=x<=9]', ' --netcdf3-compatible Enable downloading the dataset in a netCDF3', ' compatible format.', ' --chunk-size-limit INTEGER RANGE', diff --git a/tests/test_command_line_interface.py b/tests/test_command_line_interface.py index a858d54c..bac9659a 100644 --- a/tests/test_command_line_interface.py +++ b/tests/test_command_line_interface.py @@ -1337,7 +1337,7 @@ def test_netcdf_compression_option(self, tmp_path): assert dataset_without_option.uo.encoding["complevel"] == 0 assert dataset_with_option.uo.encoding["zlib"] is True - assert dataset_with_option.uo.encoding["complevel"] == 4 + assert dataset_with_option.uo.encoding["complevel"] == 1 assert dataset_with_option.uo.encoding["contiguous"] is False assert dataset_with_option.uo.encoding["shuffle"] is True @@ -1387,7 +1387,7 @@ def test_netcdf_compression_with_optimised_files(self, tmp_path): size_without_option = get_file_size(filepath_without_option) size_with_option = get_file_size(filepath_with_option) logger.info(f"{size_without_option=}, {size_with_option=}") - assert 4 * size_with_option < size_without_option + assert 1.4 * size_with_option < size_without_option def test_omi_arco_service(self, tmp_path): base_command = [ diff --git a/tests/test_python_interface.py b/tests/test_python_interface.py index 14e74732..07fba1fe 100644 --- a/tests/test_python_interface.py +++ b/tests/test_python_interface.py @@ -283,3 +283,37 @@ def test_subset_keeps_fillvalue_empty_w_compression(self, tmp_path): assert "_FillValue" not in subsetdata.depth.attrs assert subsetdata.time.attrs["calendar"] == "gregorian" assert subsetdata.time.attrs["units"] == "hours since 1950-01-01" + + def test_compressed_and_uncompressed_no_diff(self, tmp_path): + data_query = { + "dataset_id": "cmems_mod_glo_phy_my_0.083deg_P1D-m", + "start_datetime": "2019-01-31", + "end_datetime": "2019-01-31", + "minimum_depth": 0, + "maximum_depth": 1, + "variables": ["sea_water_potential_temperature"], + "output_directory": tmp_path, + } + subset(**data_query, output_filename="uncompressed_data.nc") + subset( + **data_query, + netcdf_compression_level=1, + output_filename="compressed_data.nc", + ) + dataset_uncompressed = xarray.open_dataset( + tmp_path / "uncompressed_data.nc" + ) + dataset_compressed = xarray.open_dataset( + tmp_path / "compressed_data.nc" + ) + size_uncompressed = (tmp_path / "uncompressed_data.nc").stat().st_size + size_compressed = (tmp_path / "compressed_data.nc").stat().st_size + assert size_uncompressed > 1.5 * size_compressed + diff = dataset_uncompressed - dataset_compressed + diff.attrs = dataset_uncompressed.attrs + for var in diff.data_vars: + diff[var].attrs = dataset_uncompressed[var].attrs + + diff.to_netcdf(tmp_path / "diff.nc") + diff = xarray.open_dataset(tmp_path / "diff.nc") + assert diff.thetao.mean().values == 0.0