Skip to content

Commit

Permalink
fix: correcting issues with compression (#241)
Browse files Browse the repository at this point in the history
Co-authored-by: Renaud <[email protected]>
No longer keeping the `dtype` and `_FillValue` encodings if set; only `add_offset` and `scale_factor` are kept on the datasets. In some cases this reduces the achievable compression ratio, but files still compress, and compression is now lossless.
  • Loading branch information
uriii3 authored Dec 4, 2024
1 parent 73c0ed8 commit 2bf4b02
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 23 deletions.
4 changes: 2 additions & 2 deletions copernicusmarine/command_line_interface/group_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,10 +249,10 @@ def cli_subset() -> None:
"--netcdf-compression-level",
type=click.IntRange(0, 9),
is_flag=False,
flag_value=4,
flag_value=1,
default=0,
help=documentation_utils.SUBSET["NETCDF_COMPRESSION_LEVEL_HELP"]
+ " If used as a flag, the assigned value will be 4.",
+ " If used as a flag, the assigned value will be 1.",
)
@click.option(
"--netcdf3-compatible",
Expand Down
2 changes: 2 additions & 0 deletions copernicusmarine/core_functions/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ class ResponseSubset(BaseModel):
#: File name.
filename: str
#: Estimation of the size of the final result file in MB.
#: This estimation may not be accurate if you save the result as
#: a compressed NetCDF file.
file_size: Optional[float]
#: Estimation of the maximum amount of data needed to
#: get the final result in MB.
Expand Down
18 changes: 2 additions & 16 deletions copernicusmarine/download_functions/download_arco_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,22 +539,8 @@ def _download_dataset_as_netcdf(
shuffle=True,
)
keys_to_keep = {
"_FillValue",
"blosc_shuffle",
"chunksizes",
"complevel",
"compression",
"compression_opts",
"contiguous",
"dtype",
"endian",
"fletcher32",
"quantize_mode",
"shuffle",
"significant_digits",
"szip_coding",
"szip_pixels_per_block",
"zlib",
"scale_factor",
"add_offset",
}
encoding = {
name: {
Expand Down
4 changes: 2 additions & 2 deletions doc/usage/subset-usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,11 @@ About ``--netcdf-compression-level`` options

If writing data to a NetCDF file (the default format), the ``--netcdf-compression-level`` option can be set to compress the downloaded file. This reduces file size but increases writing time. Without this option, the file is written faster but with a larger size. For Zarr format ('.zarr' extension), the default compression of the Copernicus Marine Data Store is applied, making the download fast and compressed without using ``--netcdf-compression-level``.

Default NetCDF compression settings for xarray:
Default NetCDF compression settings for the toolbox are:

.. code-block:: text
{'zlib': True, 'complevel': 4, 'contiguous': False, 'shuffle': True}
{'zlib': True, 'complevel': 1, 'contiguous': False, 'shuffle': True}
Set the ``--netcdf-compression-level`` to a custom compression level between 0 (no compression, by default) and 9 (maximum compression).

Expand Down
2 changes: 1 addition & 1 deletion tests/__snapshots__/test_help_command_interface.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,7 @@
' NetCDF output file. A value of 0 means no',
' compression, and 9 is the highest level of',
' compression available. If used as a flag,',
' the assigned value will be 4. [0<=x<=9]',
' the assigned value will be 1. [0<=x<=9]',
' --netcdf3-compatible Enable downloading the dataset in a netCDF3',
' compatible format.',
' --chunk-size-limit INTEGER RANGE',
Expand Down
4 changes: 2 additions & 2 deletions tests/test_command_line_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1337,7 +1337,7 @@ def test_netcdf_compression_option(self, tmp_path):
assert dataset_without_option.uo.encoding["complevel"] == 0

assert dataset_with_option.uo.encoding["zlib"] is True
assert dataset_with_option.uo.encoding["complevel"] == 4
assert dataset_with_option.uo.encoding["complevel"] == 1
assert dataset_with_option.uo.encoding["contiguous"] is False
assert dataset_with_option.uo.encoding["shuffle"] is True

Expand Down Expand Up @@ -1387,7 +1387,7 @@ def test_netcdf_compression_with_optimised_files(self, tmp_path):
size_without_option = get_file_size(filepath_without_option)
size_with_option = get_file_size(filepath_with_option)
logger.info(f"{size_without_option=}, {size_with_option=}")
assert 4 * size_with_option < size_without_option
assert 1.4 * size_with_option < size_without_option

def test_omi_arco_service(self, tmp_path):
base_command = [
Expand Down
34 changes: 34 additions & 0 deletions tests/test_python_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,3 +283,37 @@ def test_subset_keeps_fillvalue_empty_w_compression(self, tmp_path):
assert "_FillValue" not in subsetdata.depth.attrs
assert subsetdata.time.attrs["calendar"] == "gregorian"
assert subsetdata.time.attrs["units"] == "hours since 1950-01-01"

def test_compressed_and_uncompressed_no_diff(self, tmp_path):
    """Download the same subset with and without NetCDF compression.

    Checks that the compressed file is noticeably smaller than the
    uncompressed one, and that the compressed data is lossless: the
    element-wise difference between the two datasets must be zero.
    """
    request = {
        "dataset_id": "cmems_mod_glo_phy_my_0.083deg_P1D-m",
        "start_datetime": "2019-01-31",
        "end_datetime": "2019-01-31",
        "minimum_depth": 0,
        "maximum_depth": 1,
        "variables": ["sea_water_potential_temperature"],
        "output_directory": tmp_path,
    }
    subset(**request, output_filename="uncompressed_data.nc")
    subset(
        **request,
        netcdf_compression_level=1,
        output_filename="compressed_data.nc",
    )
    uncompressed_path = tmp_path / "uncompressed_data.nc"
    compressed_path = tmp_path / "compressed_data.nc"
    dataset_uncompressed = xarray.open_dataset(uncompressed_path)
    dataset_compressed = xarray.open_dataset(compressed_path)
    # Compression should shrink the file by a meaningful factor.
    assert (
        uncompressed_path.stat().st_size
        > 1.5 * compressed_path.stat().st_size
    )
    # Build the element-wise difference; copy the attributes over so the
    # result can be written back out as a valid NetCDF file.
    difference = dataset_uncompressed - dataset_compressed
    difference.attrs = dataset_uncompressed.attrs
    for variable_name in difference.data_vars:
        difference[variable_name].attrs = dataset_uncompressed[
            variable_name
        ].attrs

    difference.to_netcdf(tmp_path / "diff.nc")
    reloaded_difference = xarray.open_dataset(tmp_path / "diff.nc")
    # Lossless compression: the round-tripped difference must be all zeros.
    assert reloaded_difference.thetao.mean().values == 0.0

0 comments on commit 2bf4b02

Please sign in to comment.