From 55c63b14d840747afc6e5da6b125a52aa219a9fd Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Wed, 11 Dec 2024 14:09:53 -0500 Subject: [PATCH 1/4] remove variable value comparison logic and args --- ncompare/console.py | 2 - ncompare/core.py | 150 +------------------------------------------- tests/test_core.py | 131 -------------------------------------- 3 files changed, 1 insertion(+), 282 deletions(-) diff --git a/ncompare/console.py b/ncompare/console.py index e530043..76a5251 100755 --- a/ncompare/console.py +++ b/ncompare/console.py @@ -51,8 +51,6 @@ def _cli(args: Optional[Sequence[str]]) -> argparse.Namespace: ) parser.add_argument("nc_a", help="First NetCDF file") parser.add_argument("nc_b", help="Second NetCDF file") - parser.add_argument("-v", "--comparison_var_name", help="Comparison variable name") - parser.add_argument("-g", "--comparison_var_group", help="Comparison variable group") parser.add_argument( "--only-diffs", action="store_true", diff --git a/ncompare/core.py b/ncompare/core.py index 5528f3b..a2e6cb4 100644 --- a/ncompare/core.py +++ b/ncompare/core.py @@ -29,17 +29,14 @@ """Compare the structure of two NetCDF files.""" -import random -import traceback from collections import namedtuple from collections.abc import Iterable, Iterator from pathlib import Path from typing import Optional, Union import netCDF4 -import numpy as np import xarray as xr -from colorama import Fore, Style +from colorama import Fore from ncompare.printing import Outputter from ncompare.sequence_operations import common_elements, count_diffs @@ -57,8 +54,6 @@ def compare( nc_a: Union[str, Path], nc_b: Union[str, Path], - comparison_var_group: Optional[str] = None, - comparison_var_name: Optional[str] = None, only_diffs: bool = False, no_color: bool = False, show_chunks: bool = False, @@ -76,10 +71,6 @@ def compare( filepath to NetCDF4 nc_b : str filepath to NetCDF4 - comparison_var_group : str, optional - The name of a group which contains a desired comparison 
variable - comparison_var_name : str, optional - The name of a variable for which we want to compare values only_diffs : bool, optional Whether to show only the variables/attributes that are different between the two files no_color : bool, default False @@ -127,8 +118,6 @@ def compare( out, nc_a, nc_b, - comparison_var_group=comparison_var_group, - comparison_var_name=comparison_var_name, show_chunks=show_chunks, show_attributes=show_attributes, ) @@ -146,8 +135,6 @@ def run_through_comparisons( out: Outputter, nc_a: Union[str, Path], nc_b: Union[str, Path], - comparison_var_group: Optional[str], - comparison_var_name: Optional[str], show_chunks: bool, show_attributes: bool, ) -> None: @@ -158,8 +145,6 @@ def run_through_comparisons( out nc_a nc_b - comparison_var_group - comparison_var_name show_chunks show_attributes """ @@ -175,92 +160,12 @@ def run_through_comparisons( list_b = _get_groups(nc_b) _, _, _ = out.lists_diff(list_a, list_b) - if comparison_var_group: - # Show the variables within the selected group. - out.print( - Fore.LIGHTBLUE_EX + f"\nVariables within specified group <{comparison_var_group}>:", - add_to_history=True, - ) - vlist_a = _get_vars(nc_a, comparison_var_group) - vlist_b = _get_vars(nc_b, comparison_var_group) - _, _, _ = out.lists_diff(vlist_a, vlist_b) - - # TODO: Remove comparison variable/val? - if comparison_var_name: - try: - # Print the first part of the values array for the selected variable. 
- out.print( - Fore.LIGHTBLUE_EX - + f"\nSample values within specified variable <{comparison_var_name}>:" - ) - _print_sample_values(out, nc_a, comparison_var_group, comparison_var_name) - _print_sample_values(out, nc_b, comparison_var_group, comparison_var_name) - # compare_sample_values(nc_a, nc_b, groupname=comparison_var_group, varname=comparison_var_name) - - out.print( - Fore.LIGHTBLUE_EX - + f"\nChecking multiple random values within specified variable <{comparison_var_name}>:" - ) - compare_multiple_random_values( - out, - nc_a, - nc_b, - groupname=comparison_var_group, - varname=comparison_var_name, - ) - - except KeyError: - out.print( - Style.BRIGHT - + Fore.RED - + f"\nError when comparing values for variable <{comparison_var_name}> " - f"in group <{comparison_var_group}>." - ) - out.print(traceback.format_exc()) - out.print("\n") - else: - out.print(Fore.LIGHTBLACK_EX + "\nNo variable selected for comparison. Skipping..") - else: - out.print(Fore.LIGHTBLACK_EX + "\nNo variable group selected for comparison. 
Skipping..") - out.print(Fore.LIGHTBLUE_EX + "\nAll variables:", add_to_history=True) _, _, _ = compare_two_nc_files( out, nc_a, nc_b, show_chunks=show_chunks, show_attributes=show_attributes ) -def compare_multiple_random_values( - out: Outputter, - nc_a: Union[str, Path], - nc_b: Union[str, Path], - groupname: str, - varname: str, - num_comparisons: int = 100, -): - """Iterate through N random samples, and evaluate whether the differences exceed a threshold.""" - # Open a variable from each NetCDF - nc_var_a = xr.open_dataset(nc_a, backend_kwargs={"group": groupname}).variables[varname] - nc_var_b = xr.open_dataset(nc_b, backend_kwargs={"group": groupname}).variables[varname] - - num_mismatches = 0 - for _ in range(num_comparisons): - match_result = _match_random_value(out, nc_var_a, nc_var_b) - if match_result is True: - out.print(".", colors=False, end="") - elif match_result is None: - out.print("n", colors=False, end="") - num_mismatches += 1 - else: - out.print("x", colors=False, end="") - num_mismatches += 1 - - if num_mismatches > 0: - out.print(Fore.RED + f" {num_mismatches} mismatches, out of {num_comparisons} samples.") - else: - out.print(Fore.CYAN + " No mismatches.") - out.print("Done.", colors=False) - - def walk_common_groups_tree( top_a_name: str, top_a: Union[netCDF4.Dataset, netCDF4.Group], @@ -587,59 +492,6 @@ def _var_properties(group: Union[netCDF4.Dataset, netCDF4.Group], varname: str) return VarProperties(varname, the_variable, v_dtype, v_shape, v_chunking, v_attributes) -def _match_random_value( - out: Outputter, nc_var_a: xr.Variable, nc_var_b: xr.Variable, thresh: float = 1e-6 -) -> Union[bool, None]: - """Check whether a randomly selected data point matches between two variables. 
- - Returns - ------- - None or bool - None if data point is null for one and only one of the variables - True if values match - False if the difference exceeds the given threshold - """ - # Get a random indexer - rand_index = [] - for dim_length in nc_var_a.shape: - rand_index.append(random.randint(0, dim_length - 1)) - rand_index_tuple = tuple(rand_index) - - # Get the values from each variable - value_a = nc_var_a.values[rand_index_tuple] - value_b = nc_var_b.values[rand_index_tuple] - - # Check whether null - if np.isnan(value_a) and np.isnan(value_b): - return True - elif np.isnan(value_a) or np.isnan(value_b): - return None - - # Evaluate difference between values - diff = value_b - value_a - if abs(diff) > thresh: - out.print() - out.print(Fore.RED + f"Difference exceeded threshold (diff == {diff}") - out.print(f"var shape: {nc_var_a.shape}", colors=False) - out.print(f"indices: {rand_index_tuple}", colors=False) - out.print(f"value a: {value_a}", colors=False) - out.print(f"value b: {value_b}", colors=False, end="\n\n") - return False - - return True - - -def _print_sample_values(out: Outputter, nc_filepath, groupname: str, varname: str) -> None: - comparison_variable = xr.open_dataset(nc_filepath, backend_kwargs={"group": groupname})[varname] - vector_of_values = comparison_variable.values.flatten() - n_values = len(vector_of_values) - if n_values > 100: - sample_length = 100 - else: - sample_length = n_values - out.print(str(vector_of_values[:sample_length]), colors=False) - - def _get_attribute_value_as_str(varprops: VarProperties, attribute_key: str) -> str: if attribute_key and (attribute_key in varprops.attributes): attr = varprops.attributes[attribute_key] diff --git a/tests/test_core.py b/tests/test_core.py index 16c9da4..51455e5 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -33,12 +33,9 @@ import netCDF4 as nc import pytest -import xarray as xr from ncompare.core import ( _get_vars, - _match_random_value, - _print_sample_values, 
_var_properties, compare, ) @@ -76,134 +73,6 @@ def test_no_error_compare_2groupsTo1Subgroup( compare_ba(ds_3dims_3vars_4coords_2groups, ds_3dims_3vars_4coords_1subgroup) -def test_matching_random_values( - ds_3dims_2vars_4coords, - ds_4dims_3vars_5coords, - ds_3dims_3vars_4coords_1group, - ds_1dim_1var_1coord, - ds_1dim_1var_allnan_1coord, - outputter_to_console, -): - variable_array_1 = xr.open_dataset(ds_3dims_2vars_4coords).variables["z1"] - variable_array_2 = xr.open_dataset(ds_4dims_3vars_5coords).variables["z1"] - variable_array_3 = xr.open_dataset(ds_1dim_1var_1coord).variables["z1"] - variable_array_allnan = xr.open_dataset(ds_1dim_1var_allnan_1coord).variables["z1"] - - assert ( - _match_random_value( - outputter_to_console, - variable_array_1, - variable_array_1, - ) - is True - ) - assert ( - _match_random_value( - outputter_to_console, - variable_array_1, - variable_array_2, - ) - is False - ) - assert ( - _match_random_value( - outputter_to_console, - variable_array_3, - variable_array_3, - ) - is True - ) - # NaN to non-NaN is NOT considered a match - assert ( - _match_random_value( - outputter_to_console, - variable_array_3, - variable_array_allnan, - ) - is None - ) - # NaN to NaN is considered a match - assert ( - _match_random_value( - outputter_to_console, - variable_array_allnan, - variable_array_allnan, - ) - is True - ) - - -def test_print_values_runs_with_no_error(ds_3dims_3vars_4coords_1group, outputter_to_console): - with does_not_raise(): - _print_sample_values( - outputter_to_console, - ds_3dims_3vars_4coords_1group, - groupname="Group1", - varname="step", - ) - - -def test_print_values_to_text_file_runs_with_no_error( - ds_3dims_3vars_4coords_1group, outputter_to_text_file, temp_test_text_file_path -): - _print_sample_values( - outputter_to_text_file, - ds_3dims_3vars_4coords_1group, - groupname="Group1", - varname="step", - ) - outputter_to_text_file._text_file_obj.close() - - comparison_variable = xr.open_dataset( - 
ds_3dims_3vars_4coords_1group, backend_kwargs={"group": "Group1"} - )["step"] - - with open(temp_test_text_file_path) as f: - lines = f.readlines() - assert lines[0].strip().replace("[", "").replace("]", "").split() == [ - str(round(x, 1)) for x in comparison_variable[:].values - ] - - -def test_comparison_group_no_error_for_duplicate_dataset( - ds_3dims_3vars_4coords_1group, temp_test_text_file_path -): - compare( - ds_3dims_3vars_4coords_1group, - ds_3dims_3vars_4coords_1group, - comparison_var_group="Group1", - file_text=temp_test_text_file_path, - ) - - found_expected = False - with open(temp_test_text_file_path) as f: - for line in f.readlines(): - if "Variables within specified group :" in line: - found_expected = True - - assert found_expected - - -def test_comparison_var_no_error_for_duplicate_dataset( - ds_3dims_3vars_4coords_1group, temp_test_text_file_path -): - compare( - ds_3dims_3vars_4coords_1group, - ds_3dims_3vars_4coords_1group, - comparison_var_group="Group1", - comparison_var_name="var1", - file_text=temp_test_text_file_path, - ) - - found_expected = False - with open(temp_test_text_file_path) as f: - for line in f.readlines(): - if "Sample values within specified variable :" in line: - found_expected = True - - assert found_expected - - def test_get_vars_with_group(ds_3dims_3vars_4coords_1group): result = _get_vars(ds_3dims_3vars_4coords_1group, groupname="Group1") assert set(result) == {"step", "var1", "var2", "w"} From 17446282add3867d4c1522f405a0c597c10cd31d Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Wed, 11 Dec 2024 14:10:39 -0500 Subject: [PATCH 2/4] remove variable value comparison from README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 57a0834..5c91f6f 100644 --- a/README.md +++ b/README.md @@ -90,8 +90,6 @@ ncompare S001G01.nc S001G01_SUBSET.nc --file-text subset_comparison.txt - `--no-color` : Turn off all colorized output. 
- `--show-attributes` : Include variable attributes in the table that compares variables. - `--show-chunks` : Include chunk sizes in the table that compares variables. -- `-v` (`--comparison_var_name`) [VAR_NAME]: Compare specific values for this variable. -- `-g` (`--comparison_var_group`) [VAR_GROUP]: Group that contains the `comparison_var_name`. - `--column-widths` [WIDTH, WIDTH, WIDTH]: Width, in number of characters, of the three columns in the comparison report - `--version` : Show the current version and then exit. From d4f12a100341dc81952fcb58503dabec23d5f2af Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Wed, 11 Dec 2024 14:16:02 -0500 Subject: [PATCH 3/4] update text golden file --- tests/data/a-b_test_golden_file.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/data/a-b_test_golden_file.txt b/tests/data/a-b_test_golden_file.txt index 54166ff..1efe5f5 100644 --- a/tests/data/a-b_test_golden_file.txt +++ b/tests/data/a-b_test_golden_file.txt @@ -9,8 +9,6 @@ Root-level Groups: Are all items the same? ---> True. ['Data', 'Position', 'Statistics'] -No variable group selected for comparison. Skipping.. 
- All variables: File A File B All Variables From 021396642e6f9093e2b0f47709b3efad96b4cd42 Mon Sep 17 00:00:00 2001 From: danielfromearth Date: Wed, 11 Dec 2024 14:20:54 -0500 Subject: [PATCH 4/4] update CHANGELOG.md --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc16216..6f31242 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Catch "unsupported datatype" exception from netCDF library ([#268](https://github.com/nasa/ncompare/pull/268)) ([**@danielfromearth**](https://github.com/danielfromearth)) +### Removed + +- **Breaking:** remove randomized value check logic and its arguments from the API ([#271](https://github.com/nasa/ncompare/pull/271)) ([**@danielfromearth**](https://github.com/danielfromearth)) + ## [1.11.0] - 2024-11-14 ### Added