better estimating the file size (#275)
improve the function that estimates the size of the resulting file.
uriii3 authored Feb 10, 2025
1 parent 916e077 commit f4f722c
Showing 4 changed files with 83 additions and 14 deletions.
18 changes: 11 additions & 7 deletions copernicusmarine/download_functions/utils.py
@@ -234,14 +234,18 @@ def get_approximation_size_final_result(
     dataset: xarray.Dataset,
 ) -> Optional[float]:
     coordinates_size = 1
+    variables_size = 0
+    baseline_size = 0.013
+
+    for variable in dataset.data_vars:
+        variables_size += dataset[variable].encoding["dtype"].itemsize
+
     for coordinate_name in dataset.sizes:
-        coordinates_size *= dataset[coordinate_name].size
-    estimate_size = (
-        coordinates_size
-        * len(list(dataset.data_vars))
-        * dataset[list(dataset.data_vars)[0]].dtype.itemsize
-        / 1048e3
-    )
+        for coord_label in COORDINATES_LABEL:
+            if coordinate_name in COORDINATES_LABEL[coord_label]:
+                coordinates_size *= dataset[coordinate_name].size
+    estimate_size = baseline_size + coordinates_size * variables_size / 1048e3
+
     return estimate_size


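For readers skimming the diff: the estimator now sums the on-disk dtype size of every requested variable, multiplies only over coordinate dimensions it recognises, and adds a small constant for file overhead, instead of assuming every variable shares the first variable's dtype. A minimal, self-contained sketch of that logic follows (not code from this commit): the COORDINATES_LABEL mapping and the encoding fallback are assumptions standing in for the package's internals, and the 1048e3 divisor simply mirrors the constant used above.

import numpy as np
import xarray as xr

# Hypothetical stand-in for copernicusmarine's COORDINATES_LABEL mapping:
# only coordinates matching one of these labels contribute to the grid size.
COORDINATES_LABEL = {
    "time": ["time"],
    "depth": ["depth", "elevation"],
    "latitude": ["latitude"],
    "longitude": ["longitude"],
}


def approximate_size_of_result(dataset: xr.Dataset) -> float:
    # Mirrors the new estimator: fixed baseline + grid points * bytes per point.
    # Result is roughly in megabytes (the package divides by 1048e3).
    coordinates_size = 1
    variables_size = 0
    baseline_size = 0.013  # constant overhead for file headers/metadata

    for variable in dataset.data_vars:
        # Per-value byte width of each requested variable; falling back to the
        # in-memory dtype when no on-disk encoding is set is an assumption here.
        encoded_dtype = dataset[variable].encoding.get(
            "dtype", dataset[variable].dtype
        )
        variables_size += np.dtype(encoded_dtype).itemsize

    for coordinate_name in dataset.sizes:
        for coord_label in COORDINATES_LABEL:
            if coordinate_name in COORDINATES_LABEL[coord_label]:
                coordinates_size *= dataset[coordinate_name].size

    return baseline_size + coordinates_size * variables_size / 1048e3


# Toy dataset: 10 time steps x 20 latitudes x 30 longitudes of float32 values.
ds = xr.Dataset(
    {
        "thetao": (
            ("time", "latitude", "longitude"),
            np.zeros((10, 20, 30), dtype="float32"),
        )
    },
    coords={
        "time": np.arange(10),
        "latitude": np.linspace(-10.0, 10.0, 20),
        "longitude": np.linspace(0.0, 30.0, 30),
    },
)
print(approximate_size_of_result(ds))  # 0.013 + 6000 * 4 / 1048e3 ≈ 0.036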
10 changes: 8 additions & 2 deletions tests/test_cf_compliance.py
@@ -3,7 +3,10 @@
 import xarray
 
 from copernicusmarine import subset
-from tests.test_utils import execute_in_terminal
+from tests.test_utils import (
+    execute_in_terminal,
+    main_checks_when_file_is_downloaded,
+)
 
 
 class TestCFCompliance:
@@ -31,7 +34,7 @@ def test_subset_with_warns(self, tmp_path, snapshot):
     def if_I_subset_a_dataset(
         self, dataset_id, tmp_path, output_filename, variable
     ):
-        subset(
+        response = subset(
             dataset_id=dataset_id,
             variables=[variable],
             output_directory=tmp_path,
@@ -40,6 +43,9 @@ def if_I_subset_a_dataset(
             end_datetime="2022-01-05T00:00:00",
         )
         assert (tmp_path / output_filename).exists()
+        main_checks_when_file_is_downloaded(
+            tmp_path / output_filename, dict(response)
+        )
 
     def then_it_is_cf_compliant(
         self, dataset_id, tmp_path, snapshot, output_filename
48 changes: 43 additions & 5 deletions tests/test_command_line_interface.py
@@ -14,6 +14,7 @@
 
 from tests.test_utils import (
     execute_in_terminal,
+    main_checks_when_file_is_downloaded,
     remove_extra_logging_prefix_info,
 )

@@ -167,14 +168,19 @@ def test_retention_period_works(self, tmp_path):
             "48.13780081656672",
             "--output-directory",
             tmp_path,
+            "--output-filename",
+            "dataset.nc",
             "--log-level",
             "DEBUG",
         ]
 
         self.output = execute_in_terminal(self.command)
         assert self.output.returncode == 0
         assert (
             b"time (time) datetime64[ns] 2023" not in self.output.stderr
         )
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(tmp_path / "dataset.nc", response)
 
     def test_retention_period_works_when_only_values_in_metadata(
         self, tmp_path
@@ -196,20 +202,27 @@ def test_retention_period_works_when_only_values_in_metadata(
             "48.13780081656672",
             "--output-directory",
             tmp_path,
+            "--output-filename",
+            "dataset.nc",
             "--log-level",
             "DEBUG",
         ]
 
         self.output = execute_in_terminal(self.command)
         assert self.output.returncode == 0
         assert (
             b"time (time) datetime64[ns] 2023" not in self.output.stderr
         )
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(tmp_path / "dataset.nc", response)
 
     # -------------------------#
     # Test on get requests #
     # -------------------------#
 
-    def test_get_original_files_functionality(self, tmp_path):
+    def test_get_original_files_functionality(
+        self, tmp_path
+    ):  # TODO: check this test and what does it do
         command = [
             "copernicusmarine",
             "get",
@@ -462,7 +475,10 @@ def test_subset_with_dry_run_option(self, tmp_path):
         assert str(tmp_path) in returned_value["file_path"]
         assert not os.path.exists(returned_value["file_path"])
 
-    def test_subset_by_default_returns_status_message(self, tmp_path):
+    def test_subset_by_default_returns_status_message(
+        self, tmp_path
+    ):  # TODO: it feels like we can just add this test into another one!
+        # so, do we need to download a whole dataset for this?
         command = [
             "copernicusmarine",
             "subset",
@@ -544,6 +560,10 @@ def test_subset_output_file_as_netcdf(self, tmp_path):
 
         self.output = execute_in_terminal(command)
         is_file = pathlib.Path(tmp_path, output_filename).is_file()
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(
+            tmp_path / output_filename, response
+        )
         assert self.output.returncode == 0
         assert is_file

@@ -645,7 +665,7 @@ def then_I_got_a_clear_output_with_available_service_for_subset(self):
 
     def when_I_request_subset_dataset_with_zarr_service(
         self,
-        output_path,
+        tmp_path,
         vertical_axis: Literal["depth", "elevation"] = "depth",
     ):
         command = [
@@ -676,7 +696,7 @@ def when_I_request_subset_dataset_with_zarr_service(
             "--service",
             "arco-time-series",
             "-o",
-            f"{output_path}",
+            f"{tmp_path}",
             "-f",
             "data.zarr",
         ]
@@ -819,7 +839,9 @@ def test_get_2023_08_original_files(self):
         assert self.output.returncode == 0
         assert b"No data to download" not in self.output.stderr
 
-    def test_subset_with_chunking(self, tmp_path):
+    def test_subset_with_chunking(
+        self, tmp_path
+    ):  # TODO: it says subset with chunking but looks kind of 'normal'
         command = [
             "copernicusmarine",
             "subset",
@@ -845,11 +867,15 @@ def test_subset_with_chunking(
             "8",
             "-o",
             f"{tmp_path}",
+            "-f",
+            "output.nc",
         ]
 
         self.output = execute_in_terminal(command)
 
         assert self.output.returncode == 0
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(tmp_path / "output.nc", response)
 
     def test_short_option_for_copernicus_marine_command_helper(self):
         short_option_command = [
@@ -1030,6 +1056,10 @@ def test_subset_filter_by_standard_name(self, tmp_path):
             "thetao"
             in xarray.open_zarr(f"{tmp_path}/{output_filename}").variables
         )
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(
+            tmp_path / output_filename, response
+        )
 
     def test_log_level_debug(self, tmp_path):
         dataset_id = "cmems_mod_ibi_phy_my_0.083deg-3D_P1Y-m"
@@ -1067,6 +1097,10 @@ def test_log_level_debug(self, tmp_path):
         self.output = execute_in_terminal(command)
         assert self.output.returncode == 0
         assert b"DEBUG - " in self.output.stderr
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(
+            tmp_path / output_filename, response
+        )
 
     def test_arco_subset_is_fast(self, tmp_path):
         command = [
@@ -1203,6 +1237,10 @@ def test_dataset_has_always_every_dimensions(self, tmp_path):
             )
             == 4
         )
+        response = loads(self.output.stdout)
+        main_checks_when_file_is_downloaded(
+            tmp_path / output_filename, response
+        )
 
     def test_netcdf_compression_option(self, tmp_path):
         filename_without_option = "without_option.nc"
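For context on the pattern these updated tests rely on: the subset command prints a JSON response on stdout, the tests parse it with loads, and the new main_checks_when_file_is_downloaded helper (added in tests/test_utils.py below) compares the reported file_size against the file that actually landed on disk. Below is a rough, hypothetical sketch of that flow outside pytest; the dataset id, variable, dates and paths are placeholders, and the exact CLI flag names should be checked against the installed copernicusmarine version rather than taken from this commit.

import json
import pathlib
import subprocess

from tests.test_utils import main_checks_when_file_is_downloaded  # run from the repo root

output_dir = pathlib.Path("/tmp/copernicus_test")  # placeholder path
completed = subprocess.run(
    [
        "copernicusmarine",
        "subset",
        "--dataset-id", "SOME_DATASET_ID",          # placeholder, not from this commit
        "--variable", "thetao",                     # placeholder variable
        "--start-datetime", "2022-01-01T00:00:00",
        "--end-datetime", "2022-01-05T00:00:00",
        "--output-directory", str(output_dir),
        "--output-filename", "dataset.nc",
    ],
    capture_output=True,
    text=True,
)

# The JSON response includes the estimated "file_size" and "data_transfer_size"
# fields that the helper checks against the downloaded file.
response = json.loads(completed.stdout)
main_checks_when_file_is_downloaded(output_dir / "dataset.nc", response)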
21 changes: 21 additions & 0 deletions tests/test_utils.py
@@ -1,4 +1,6 @@
 import logging
+import os
+import pathlib
 import subprocess
 import time
 from subprocess import CompletedProcess
@@ -41,3 +43,22 @@ def execute_in_terminal(
     duration_second = t2 - t1
     logger.info(f"Command executed in {duration_second} s: {command_to_print}")
     return output
+
+
+def main_checks_when_file_is_downloaded(
+    file_path: pathlib.Path,
+    response: dict,
+):
+    size_variance = 0.2
+    offset_size = 0.05  # small datasets are hard to predict
+    file_size = os.path.getsize(file_path)
+    assert (
+        file_size / 1048e3
+        <= response["file_size"] * (1 + size_variance) + offset_size
+    )
+    assert (
+        file_size / 1048e3
+        >= response["file_size"] * (1 - size_variance) - offset_size
+    )
+    assert response["file_size"] <= response["data_transfer_size"]
+    return
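As a quick illustration of the band this helper accepts (not part of the commit), assume the response reports a file_size of 1.0 in the same 1048e3-byte units used above:

# Worked example of the acceptance check, with illustrative numbers.
reported_size = 1.0        # response["file_size"], in units of 1048e3 bytes
size_variance = 0.2        # +/- 20 % relative tolerance
offset_size = 0.05         # absolute slack for very small files

lower_bound = reported_size * (1 - size_variance) - offset_size  # 0.75
upper_bound = reported_size * (1 + size_variance) + offset_size  # 1.25

actual_size = 1.1          # os.path.getsize(file_path) / 1048e3, say
assert lower_bound <= actual_size <= upper_bound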
