From d2760eca82b05c70438c044bbaad965bfa094aca Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Wed, 17 Nov 2021 20:22:40 +0100
Subject: [PATCH] Reimplement validation for subannual/datetime data (#129)

---
 .github/workflows/pytest.yml               | 33 ++++++++
 .github/workflows/validation.yml           |  6 +-
 {openentrance/tests => tests}/test_core.py |  6 +-
 tests/test_definitions.py                  | 54 +++++++++++++
 tests/test_validate.py                     | 90 ++++++++++++++++++++++
 workflow.py                                | 33 +++++++-
 6 files changed, 213 insertions(+), 9 deletions(-)
 create mode 100644 .github/workflows/pytest.yml
 rename {openentrance/tests => tests}/test_core.py (58%)
 create mode 100644 tests/test_definitions.py
 create mode 100644 tests/test_validate.py

diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
new file mode 100644
index 00000000..c42402be
--- /dev/null
+++ b/.github/workflows/pytest.yml
@@ -0,0 +1,33 @@
+# This workflow will install Python dependencies and run the tests
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Pytest
+
+on:
+  push:
+    branches: [ '**' ]
+  pull_request:
+    branches: [ '**' ]
+
+jobs:
+  tests:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v1
+      with:
+        python-version: 3.9
+
+    - name: Install dependencies
+      run: |
+        pip install -r requirements.txt
+        pip install pytest
+
+    - name: Install the package and run the tests
+      run: |
+        pip install --editable .
+        pytest tests
diff --git a/.github/workflows/validation.yml b/.github/workflows/validation.yml
index 0d1520cb..c0c6f3d7 100644
--- a/.github/workflows/validation.yml
+++ b/.github/workflows/validation.yml
@@ -1,7 +1,7 @@
 # This workflow will install Python dependencies and validate the project
 # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 
-name: Validate the project
+name: Nomenclature
 
 on:
   push:
@@ -10,7 +10,7 @@ on:
     branches: [ '**' ]
 
 jobs:
-  pytest:
+  validation:
 
     runs-on: ubuntu-latest
 
@@ -22,7 +22,7 @@ jobs:
       with:
         python-version: 3.9
 
-    - name: Install requirements
+    - name: Install dependencies
       run: pip install -r requirements.txt
 
     - name: Run the nomenclature project validation
diff --git a/openentrance/tests/test_core.py b/tests/test_core.py
similarity index 58%
rename from openentrance/tests/test_core.py
rename to tests/test_core.py
index 233fc702..e1809ed8 100644
--- a/openentrance/tests/test_core.py
+++ b/tests/test_core.py
@@ -1,12 +1,12 @@
-import openentrance as oe
+from openentrance import iso_mapping, nuts_hierarchy
 
 
 def test_iso_mapping():
     # check that iso-mapping dictionary is not empty and has specific elements
     for name in ["GR", "GRC", "EL"]:
-        assert oe.iso_mapping[name] == "Greece"
+        assert iso_mapping[name] == "Greece"
 
 
 def test_nuts_hierarchy():
     # check that nuts-hierarchy is not empty and has specific elements
-    assert oe.nuts_hierarchy["Belgium"]["BE2"]["BE24"] == ["BE241", "BE242"]
+    assert nuts_hierarchy["Belgium"]["BE2"]["BE24"] == ["BE241", "BE242"]
diff --git a/tests/test_definitions.py b/tests/test_definitions.py
new file mode 100644
index 00000000..3139da57
--- /dev/null
+++ b/tests/test_definitions.py
@@ -0,0 +1,54 @@
+import nomenclature
+
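+# parse the codelists from the "definitions" folder once for use by all tests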
+definition = nomenclature.DataStructureDefinition("definitions")
+
+
+def test_variables():
+    # check that variables dictionary is not empty and has a specific element
+    assert "Emissions|CO2" in definition.variable
+
+
+def test_variables_fuel_types():
+    # check that exploding of <Fuel> to fuels works (including CCS subcategory)
+    obs = definition.variable["Secondary Energy|Electricity|Gas"]
+    exp = (
+        "Net electricity production from natural gas "
+        "(including methane from biomass or hydrogenation)"
+    )
+    assert obs["description"] == exp
+
+    obs = definition.variable["Secondary Energy|Electricity|Gas|w/ CCS"]
+    exp = (
+        "Net electricity production from natural gas (including methane "
+        "from biomass or hydrogenation) with a CO2 capture component"
+    )
+    assert obs["description"] == exp
+
+
+def test_variables_industry_types():
+    # check that exploding of <industry> to industries works
+    obs = definition.variable["Capital|iAGRI"]
+    exp = "Total capital costs spend by agriculture"
+    assert obs["description"] == exp
+
+
+def test_variables_transport_types():
+    # check that exploding of <transport> to transportation modes works
+    obs = definition.variable["Energy Service|Transportation|Freight|Rail"]
+    exp = (
+        "Provision of energy services related to freight "
+        "rail-based transportation technologies"
+    )
+    assert obs["description"] == exp
+
+
+def test_variables_product_types():
+    # check that exploding of <product> to products works
+    obs = definition.variable["Consumption|Households|pAGRI|Imported"]
+    exp = "Consumption of imported agriculture by households"
+    assert obs["description"] == exp
+
+
+def test_regions():
+    # check that regions dictionary is not empty and has a specific element
+    assert "Europe" in definition.region
diff --git a/tests/test_validate.py b/tests/test_validate.py
new file mode 100644
index 00000000..bf1571c9
--- /dev/null
+++ b/tests/test_validate.py
@@ -0,0 +1,90 @@
+import pandas as pd
+from pyam import IamDataFrame
+import pytest
+
+import sys
+
+sys.path.append("..")
+
+from workflow import main as workflow
+
+
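+# minimal single-row scenario data with yearly values for 2005 and 2010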
+TEST_DF = pd.DataFrame(
+    [
+        ["model_a", "scen_a", "Europe", "Primary Energy", "EJ/yr", 1, 6.0],
+    ],
+    columns=["model", "scenario", "region", "variable", "unit", 2005, 2010],
+)
+df = IamDataFrame(TEST_DF)
+
+
+def validate(df):
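+    """Run the workflow on `df`; return True if it passes, False on a ValueError."""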
+    try:
+        workflow(df)
+        return True
+    except ValueError as e:
+        print(e)
+        return False
+
+
+def test_validate():
+    # test simple validation
+    assert validate(df)
+
+
+def test_validate_fail():
+    # test that simple validation fails on variable and region dimension
+    assert not validate(df.rename(variable={"Primary Energy": "foo"}))
+    assert not validate(df.rename(region={"Europe": "foo"}))
+
+
+def _test_validate_directional():
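+    # (leading underscore: not collected by pytest, so these checks are currently skipped)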
+    # test that validation works as expected with directional data
+    assert validate(df.rename(region={"Europe": "Austria>Germany"}))
+    assert not validate(df.rename(region={"Europe": "Austria>foo"}))
+
+    # test that directional data with more than one `>` fails
+    assert not validate(df.rename(region={"Europe": "Austria>Italy>France"}))
+
+
+def test_validate_subannual_months():
+    # test that validation works as expected with months
+    # (and representative timeslices generally)
+    assert validate(IamDataFrame(TEST_DF, subannual="January"))
+    assert not validate(IamDataFrame(TEST_DF, subannual="foo"))
+
+
+@pytest.mark.parametrize(
+    "subannual, status",
+    [
+        ("01-01 00:00+01:00", True),
+        ("01-01 00:00", False),
+        ("01-01 00:00+02:00", False),
+        ("01-32 00:00+01:00", False),
+    ],
+)
+def test_validate_subannual_datetime(subannual, status):
+    # test that validation works as expected with continuous time as subannual
+    assert validate(IamDataFrame(TEST_DF, subannual=subannual)) == status
+
+
+@pytest.mark.parametrize(
+    "rename_mapping, status",
+    [
+        ({2005: "2005-06-17 00:00+01:00", 2010: "2010-06-17 00:00+01:00"}, True),
+        ({2005: "2005-06-17 00:00+02:00", 2010: "2010-06-17 00:00+02:00"}, False),
+        ({2005: "2005-06-17 00:00", 2010: "2010-06-17 00:00"}, False),
+    ],
+)
+def test_validate_time_entry(rename_mapping, status):
+    # test that validation works as expected with a datetime "time" column
+    _df = IamDataFrame(
+        IamDataFrame(TEST_DF)
+        .data.rename(columns={"year": "time"})
+        .replace(rename_mapping)
+    )
+    assert validate(_df) == status
+
+
+def test_validate_unit_entry():
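+    # test that validation fails on an invalid unit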
+    assert not validate(df.rename(unit={"EJ/yr": "MWh"}))
diff --git a/workflow.py b/workflow.py
index 4e0c0027..f975e31c 100755
--- a/workflow.py
+++ b/workflow.py
@@ -5,22 +5,49 @@
 
 here = Path(__file__).absolute().parent
 logger = logging.getLogger(__name__)
+from datetime import datetime, timedelta
+
+
+# datetime must be in Central European Time (CET)
+EXP_TZ = "UTC+01:00"
+EXP_TIME_OFFSET = timedelta(seconds=3600)
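+# e.g. "01-01 00:00+01:00" is valid; "01-01 00:00+02:00" or a missing timezone is rejected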
 
 
 def main(df: pyam.IamDataFrame) -> pyam.IamDataFrame:
     """Main function for validation and processing"""
     logger.info("Starting openENTRANCE timeseries-upload processing workflow...")
 
-    if "subannual" in df.dimensions:
+    if "subannual" in df.dimensions or df.time_col == "time":
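+        # datetime-format data is recast to "subannual" below, so that codelist is needed too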
         dimensions = ["region", "variable", "subannual"]
     else:
         dimensions = ["region", "variable"]
 
     definition = DataStructureDefinition(here / "definitions", dimensions=dimensions)
-    definition.validate(df)
 
+    definition.validate(df, dimensions=["region", "variable"])
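+    # datetime-like "subannual" entries are not part of the codelist; they are checked separately below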
+
+    # convert to subannual format if data provided in datetime format
     if df.time_col == "time":
         logger.info('Re-casting from "time" column to categorical "subannual" format')
-        df.swap_time_for_year(inplace=True)
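+        # this keeps the sub-year component (month-day, time and offset) in a "subannual" column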
+        df = df.swap_time_for_year(subannual=True)
+
+    # check that any datetime-like items in "subannual" are valid datetime and UTC+01:00
+    if "subannual" in df.dimensions:
+        _datetime = [s for s in df.subannual if s not in definition.subannual]
+
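+        # items not defined as representative timeslices must parse as month-day
+        # time with a timezone; the second parse without "%z" only distinguishes
+        # a missing timezone from an otherwise invalid format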
+        for d in _datetime:
+            try:
+                _dt = datetime.strptime(f"2020-{d}", "%Y-%m-%d %H:%M%z")
+            except ValueError:
+                try:
+                    datetime.strptime(f"2020-{d}", "%Y-%m-%d %H:%M")
+                except ValueError:
+                    raise ValueError(f"Invalid subannual timeslice: {d}")
+
+                raise ValueError(f"Missing timezone: {d}")
+
+            # casting to datetime with timezone was successful
+            if not (_dt.tzname() == EXP_TZ or _dt.utcoffset() == EXP_TIME_OFFSET):
+                raise ValueError(f"Invalid timezone: {d}")
 
     return df