Skip to content

Commit

Permalink
fix: correcting issues with compression (#241)
Browse files Browse the repository at this point in the history
Co-authored-by: Renaud <[email protected]>
No longer keeping the `dtype` and `_FillValue` encodings if set; only `add_offset` and `scale_factor` are kept on the datasets. In some cases this reduces the achievable compression ratio, but files still compress, and compression is now lossless.
  • Loading branch information
uriii3 authored Dec 4, 2024
1 parent 73c0ed8 commit 2bf4b02
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 23 deletions.
4 changes: 2 additions & 2 deletions copernicusmarine/command_line_interface/group_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,10 @@ def cli_subset() -> None:
"--netcdf-compression-level",
type=click.IntRange(0, 9),
is_flag=False,
flag_value=4,
flag_value=1,
default=0,
help=documentation_utils.SUBSET["NETCDF_COMPRESSION_LEVEL_HELP"]
+ " If used as a flag, the assigned value will be 4.",
+ " If used as a flag, the assigned value will be 1.",
)
@click.option(
"--netcdf3-compatible",
Expand Down
2 changes: 2 additions & 0 deletions copernicusmarine/core_functions/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ class ResponseSubset(BaseModel):
#: File name.
filename: str
#: Estimation of the size of the final result file in MB.
#: This estimation may not be accurate if you save the result as
#: a compressed NetCDF file.
file_size: Optional[float]
#: Estimation of the maximum amount of data needed to
#: get the final result in MB.
Expand Down
18 changes: 2 additions & 16 deletions copernicusmarine/download_functions/download_arco_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,22 +539,8 @@ def _download_dataset_as_netcdf(
shuffle=True,
)
keys_to_keep = {
"_FillValue",
"blosc_shuffle",
"chunksizes",
"complevel",
"compression",
"compression_opts",
"contiguous",
"dtype",
"endian",
"fletcher32",
"quantize_mode",
"shuffle",
"significant_digits",
"szip_coding",
"szip_pixels_per_block",
"zlib",
"scale_factor",
"add_offset",
}
encoding = {
name: {
Expand Down
4 changes: 2 additions & 2 deletions doc/usage/subset-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ About ``--netcdf-compression-level`` options

If writing data to a NetCDF file (the default format), the ``--netcdf-compression-level`` option can be set to compress the downloaded file. This reduces file size but increases writing time. Without this option, the file is written faster but with a larger size. For Zarr format ('.zarr' extension), the default compression of the Copernicus Marine Data Store is applied, making the download fast and compressed without using ``--netcdf-compression-level``.

Default NetCDF compression settings for xarray:
Default NetCDF compression settings for the toolbox are:

.. code-block:: text
{'zlib': True, 'complevel': 4, 'contiguous': False, 'shuffle': True}
{'zlib': True, 'complevel': 1, 'contiguous': False, 'shuffle': True}
Set the ``--netcdf-compression-level`` to a custom compression level between 0 (no compression, by default) and 9 (maximum compression).

Expand Down
2 changes: 1 addition & 1 deletion tests/__snapshots__/test_help_command_interface.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@
' NetCDF output file. A value of 0 means no',
' compression, and 9 is the highest level of',
' compression available. If used as a flag,',
' the assigned value will be 4. [0<=x<=9]',
' the assigned value will be 1. [0<=x<=9]',
' --netcdf3-compatible Enable downloading the dataset in a netCDF3',
' compatible format.',
' --chunk-size-limit INTEGER RANGE',
Expand Down
4 changes: 2 additions & 2 deletions tests/test_command_line_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,7 +1337,7 @@ def test_netcdf_compression_option(self, tmp_path):
assert dataset_without_option.uo.encoding["complevel"] == 0

assert dataset_with_option.uo.encoding["zlib"] is True
assert dataset_with_option.uo.encoding["complevel"] == 4
assert dataset_with_option.uo.encoding["complevel"] == 1
assert dataset_with_option.uo.encoding["contiguous"] is False
assert dataset_with_option.uo.encoding["shuffle"] is True

Expand Down Expand Up @@ -1387,7 +1387,7 @@ def test_netcdf_compression_with_optimised_files(self, tmp_path):
size_without_option = get_file_size(filepath_without_option)
size_with_option = get_file_size(filepath_with_option)
logger.info(f"{size_without_option=}, {size_with_option=}")
assert 4 * size_with_option < size_without_option
assert 1.4 * size_with_option < size_without_option

def test_omi_arco_service(self, tmp_path):
base_command = [
Expand Down
34 changes: 34 additions & 0 deletions tests/test_python_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,37 @@ def test_subset_keeps_fillvalue_empty_w_compression(self, tmp_path):
assert "_FillValue" not in subsetdata.depth.attrs
assert subsetdata.time.attrs["calendar"] == "gregorian"
assert subsetdata.time.attrs["units"] == "hours since 1950-01-01"

def test_compressed_and_uncompressed_no_diff(self, tmp_path):
    """Download the same subset with and without NetCDF compression.

    Checks that the compressed file is noticeably smaller than the
    uncompressed one, and that the compressed data is lossless: the
    element-wise difference between the two datasets must be zero.
    """
    request = {
        "dataset_id": "cmems_mod_glo_phy_my_0.083deg_P1D-m",
        "start_datetime": "2019-01-31",
        "end_datetime": "2019-01-31",
        "minimum_depth": 0,
        "maximum_depth": 1,
        "variables": ["sea_water_potential_temperature"],
        "output_directory": tmp_path,
    }
    subset(**request, output_filename="uncompressed_data.nc")
    subset(
        **request,
        netcdf_compression_level=1,
        output_filename="compressed_data.nc",
    )
    uncompressed_path = tmp_path / "uncompressed_data.nc"
    compressed_path = tmp_path / "compressed_data.nc"
    dataset_uncompressed = xarray.open_dataset(uncompressed_path)
    dataset_compressed = xarray.open_dataset(compressed_path)
    # Compression should shrink the file by a meaningful factor.
    assert (
        uncompressed_path.stat().st_size
        > 1.5 * compressed_path.stat().st_size
    )
    # Build the element-wise difference; copy the attributes over so the
    # result can be written back out as a valid NetCDF file.
    difference = dataset_uncompressed - dataset_compressed
    difference.attrs = dataset_uncompressed.attrs
    for variable_name in difference.data_vars:
        difference[variable_name].attrs = dataset_uncompressed[
            variable_name
        ].attrs

    difference.to_netcdf(tmp_path / "diff.nc")
    reloaded_difference = xarray.open_dataset(tmp_path / "diff.nc")
    # Lossless compression: the round-tripped difference must be all zeros.
    assert reloaded_difference.thetao.mean().values == 0.0

0 comments on commit 2bf4b02

Please sign in to comment.