Merge branch 'main' into feature/add-error-collector-to-iso3-codes-validator
dc-almeida · Sep 17, 2024 · 1e9faca · 1e9faca
2 parents 5f99c87 + 367e10c
commit 1e9faca
Show file tree

Hide file tree

Showing 10 changed files with 101 additions and 45 deletions.
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -15,7 +15,7 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        os: ["macos", "ubuntu"]
+        os: ["macos", "ubuntu", "windows"]
         # keep consistent with py-version badge in README.md and docs/index.rst
         python-version: ["3.10", "3.11", "3.12"]
       fail-fast: false
@@ -44,10 +44,10 @@ jobs:
     #       load cached venv if cache exists
     #----------------------------------------------
     - name: Load cached venv
-      id: cached-poetry-dependencies
+      id: cached-pip-wheels
       uses: actions/cache@v4
       with:
-        path: .venv
+        path: ~/.cache
         key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
     #----------------------------------------------
     # install dependencies if cache does not exist

diff --git a/nomenclature/__init__.py b/nomenclature/__init__.py
@@ -4,13 +4,6 @@
 
 import yaml
 
-
-def log_error(dimension, error_list):
-    """Compile an error message and write to log"""
-    msg = f"The following {dimension}(s) are not defined in the {dimension} codelist:"
-    logging.error("\n - ".join(map(str, [msg] + error_list)))
-
-
 from nomenclature.cli import cli  # noqa
 from nomenclature.codelist import CodeList  # noqa
 from nomenclature.core import process  # noqa

diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py
@@ -12,10 +12,9 @@
 from pydantic_core import PydanticCustomError
 
 import nomenclature
-from nomenclature import log_error
 from nomenclature.code import Code, MetaCode, RegionCode, VariableCode
 from nomenclature.config import CodeListConfig, NomenclatureConfig
-from nomenclature.error import ErrorCollector, custom_pydantic_errors
+from nomenclature.error import ErrorCollector, custom_pydantic_errors, log_error
 
 here = Path(__file__).parent.absolute()
 
@@ -98,9 +97,14 @@ def keys(self):
     def values(self):
         return self.mapping.values()
 
-    def validate_data(self, df: IamDataFrame, dimension: str) -> bool:
+    def validate_data(
+        self,
+        df: IamDataFrame,
+        dimension: str,
+        project: str | None = None,
+    ) -> bool:
         if invalid := self.validate_items(getattr(df, dimension)):
-            log_error(dimension, invalid)
+            log_error(dimension, invalid, project)
             return False
         return True
 
@@ -600,7 +604,11 @@ def vars_kwargs(self, variables: List[str]) -> List[VariableCode]:
             if self[var].agg_kwargs and not self[var].skip_region_aggregation
         ]
 
-    def validate_units(self, unit_mapping) -> bool:
+    def validate_units(
+        self,
+        unit_mapping,
+        project: None | str = None,
+    ) -> bool:
         if invalid_units := [
             (variable, unit, self.mapping[variable].unit)
             for variable, unit in unit_mapping.items()
@@ -613,14 +621,28 @@ def validate_units(self, unit_mapping) -> bool:
                 for v, u, e in invalid_units
             ]
             msg = "The following variable(s) are reported with the wrong unit:"
-            logging.error("\n - ".join([msg] + lst))
+            file_service_address = "https://files.ece.iiasa.ac.at"
+            logging.error(
+                "\n - ".join([msg] + lst)
+                + (
+                    f"\n\nPlease refer to {file_service_address}/{project}/"
+                    f"{project}-template.xlsx for the list of allowed units."
+                    if project is not None
+                    else ""
+                )
+            )
             return False
         return True
 
-    def validate_data(self, df: IamDataFrame, dimension: str) -> bool:
+    def validate_data(
+        self,
+        df: IamDataFrame,
+        dimension: str,
+        project: str | None = None,
+    ) -> bool:
         # validate variables
-        all_variables_valid = super().validate_data(df, dimension)
-        all_units_valid = self.validate_units(df.unit_mapping)
+        all_variables_valid = super().validate_data(df, dimension, project)
+        all_units_valid = self.validate_units(df.unit_mapping, project)
         return all_variables_valid and all_units_valid
 
     def list_missing_variables(

diff --git a/nomenclature/definition.py b/nomenclature/definition.py
@@ -44,6 +44,7 @@ def __init__(self, path, dimensions=None):
             path = Path(path)
 
         self.project_folder = path.parent
+        self.project = self.project_folder.name.split("-workflow")[0]
 
         if (file := self.project_folder / "nomenclature.yaml").exists():
             self.config = NomenclatureConfig.from_file(file=file)
@@ -98,7 +99,12 @@ def validate(self, df: IamDataFrame, dimensions: list | None = None) -> None:
         """
 
         if any(
-            getattr(self, dimension).validate_data(df, dimension) is False
+            getattr(self, dimension).validate_data(
+                df,
+                dimension,
+                self.project,
+            )
+            is False
             for dimension in (dimensions or self.dimensions)
         ):
             raise ValueError("The validation failed. Please check the log for details.")

diff --git a/nomenclature/error.py b/nomenclature/error.py
@@ -1,3 +1,4 @@
+import logging
 import textwrap
 from collections import namedtuple
 from typing import Optional
@@ -71,3 +72,23 @@ def __repr__(self) -> str:
 
     def __bool__(self) -> bool:
         return bool(self.errors)
+
+
+def log_error(
+    dimension: str,
+    error_list,
+    project: str | None = None,
+) -> None:
+    """Compile an error message and write to log"""
+    file_service_address = "https://files.ece.iiasa.ac.at"
+    msg = f"The following {dimension}(s) are not defined in the {dimension} codelist:"
+
+    logging.error(
+        "\n - ".join(map(str, [msg] + error_list))
+        + (
+            f"\n\nPlease refer to {file_service_address}/{project}/{project}"
+            f"-template.xlsx for the list of allowed {dimension}s."
+            if project is not None
+            else ""
+        )
+    )
diff --git a/nomenclature/processor/region.py b/nomenclature/processor/region.py
@@ -22,10 +22,9 @@
 from pydantic.types import DirectoryPath, FilePath
 from pydantic_core import PydanticCustomError
 
-from nomenclature import log_error
 from nomenclature.codelist import RegionCodeList, VariableCodeList
 from nomenclature.definition import DataStructureDefinition
-from nomenclature.error import custom_pydantic_errors, ErrorCollector
+from nomenclature.error import custom_pydantic_errors, ErrorCollector, log_error
 from nomenclature.processor import Processor
 from nomenclature.processor.utils import get_relative_path
 

diff --git a/nomenclature/processor/required_data.py b/nomenclature/processor/required_data.py
@@ -178,7 +178,13 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:
             for model, data_list in missing_data.items():
                 missing_data_log_info += f"Missing for '{model}':\n"
                 for data in data_list:
-                    missing_data_log_info += f"{data}\n\n"
+                    missing_data_log_info += (
+                        data.to_string(
+                            index=False,
+                            justify="left",
+                        )
+                        + "\n\n"
+                    )
             logger.error(
                 "Missing required data.\nFile: %s\n\n%s",
                 get_relative_path(self.file),
@@ -212,6 +218,7 @@ def check_required_data_per_model(
                         .to_frame()
                         .reset_index()
                         .drop(columns=["model"])
+                        .rename(columns={"year": "year(s)"})
                     )
         return missing_data
 

diff --git a/tests/data/required_data/required_data/requiredData_apply_error.yaml b/tests/data/required_data/required_data/requiredData_apply_error.yaml
@@ -1,7 +1,7 @@
 model: model_a
 required_data:
   - measurand:
-    - Primary Energy:
+    - Primary Energy|Making sure that a really long variable is displayed completely:
         unit: [GWh/yr, Mtoe]
     year: [2005, 2010, 2015] # 2015 is missing from simple_df for all models
   - variable: Final Energy

diff --git a/tests/test_required_data.py b/tests/test_required_data.py
@@ -100,24 +100,20 @@ def test_RequiredData_apply_raises(simple_df, caplog):
         required_data_validator.apply(simple_df)
 
     missing_data = [
-        """
-  scenario        variable    unit            year
-0   scen_a  Primary Energy  GWh/yr  2005,2010,2015
-1   scen_a  Primary Energy    Mtoe  2005,2010,2015
-2   scen_b  Primary Energy  GWh/yr  2005,2010,2015
-3   scen_b  Primary Energy    Mtoe  2005,2010,2015""",
-        """
-  scenario      variable
-0   scen_a  Final Energy
-1   scen_b  Final Energy""",
-        """
-  scenario       variable       unit
-0   scen_a  Emissions|CO2  Mt CO2/yr
-1   scen_b  Emissions|CO2  Mt CO2/yr""",
-        """
-  scenario region      variable
-0   scen_a  World  Final Energy
-1   scen_b  World  Final Energy""",
+        """scenario variable                                                                       unit   year(s)""",
+        """scen_a   Primary Energy|Making sure that a really long variable is displayed completely GWh/yr 2005,2010,2015
+scen_a   Primary Energy|Making sure that a really long variable is displayed completely   Mtoe 2005,2010,2015
+scen_b   Primary Energy|Making sure that a really long variable is displayed completely GWh/yr 2005,2010,2015
+scen_b   Primary Energy|Making sure that a really long variable is displayed completely   Mtoe 2005,2010,2015""",
+        """scenario variable""",
+        """scen_a   Final Energy
+scen_b   Final Energy""",
+        """scenario variable      unit""",
+        """scen_a   Emissions|CO2 Mt CO2/yr
+scen_b   Emissions|CO2 Mt CO2/yr""",
+        """scenario region variable""",
+        """scen_a   World  Final Energy
+scen_b   World  Final Energy""",
     ]
     # check if the log message contains the correct information
     assert all(

diff --git a/tests/test_validation.py b/tests/test_validation.py
@@ -28,28 +28,40 @@ def test_validation_brackets(extras_definition, simple_df):
     extras_definition.validate(simple_df)
 
 
-def test_validation_fails_variable(simple_definition, simple_df):
+def test_validation_fails_variable(simple_definition, simple_df, caplog):
     """Changing a variable name raises"""
     simple_df.rename(variable={"Primary Energy": "foo"}, inplace=True)
 
     with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
         simple_definition.validate(simple_df)
+    assert (
+        "Please refer to https://files.ece.iiasa.ac.at/data/data-template.xlsx"
+        " for the list of allowed variables." in caplog.text
+    )
 
 
-def test_validation_fails_unit(simple_definition, simple_df):
+def test_validation_fails_unit(simple_definition, simple_df, caplog):
     """Changing a unit raises"""
     simple_df.rename(unit={"EJ/yr": "GWh/yr"}, inplace=True)
 
     with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
         simple_definition.validate(simple_df)
+    assert (
+        "Please refer to https://files.ece.iiasa.ac.at/data/data-template.xlsx"
+        " for the list of allowed units." in caplog.text
+    )
 
 
-def test_validation_fails_region(simple_definition, simple_df):
+def test_validation_fails_region(simple_definition, simple_df, caplog):
     """Changing a region name raises"""
     simple_df.rename(region={"World": "foo"}, inplace=True)
 
     with pytest.raises(ValueError, match=MATCH_FAIL_VALIDATION):
         simple_definition.validate(simple_df)
+    assert (
+        "Please refer to https://files.ece.iiasa.ac.at/data/data-template.xlsx"
+        " for the list of allowed regions." in caplog.text
+    )
 
 
 def test_validation_fails_region_as_int(simple_definition, simple_df):