Add support for CSVs in SVObs format. (#290)
keyurva authored Mar 11, 2024
1 parent e8b0ada commit fc90a1d
Showing 20 changed files with 336 additions and 0 deletions.
14 changes: 14 additions & 0 deletions simple/stats/config.py
@@ -17,6 +17,7 @@
from stats.data import AggregationConfig
from stats.data import EventType
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
@@ -42,6 +43,8 @@
_AGGREGATION_FIELD = "aggregation"
_PROPERTIES_FIELD = "properties"
_DATA_DOWNLOAD_URL_FIELD = "dataDownloadUrl"
_FORMAT_FIELD = "format"
_COLUMN_MAPPINGS_FIELD = "columnMappings"


class Config:
@@ -83,6 +86,17 @@ def import_type(self, input_file_name: str) -> ImportType:
f"Unsupported import type: {import_type_str} ({input_file_name})")
return ImportType(import_type_str)

def format(self, input_file_name: str) -> InputFileFormat | None:
format_str = self._input_file(input_file_name).get(_FORMAT_FIELD)
if not format_str:
return None
if format_str not in iter(InputFileFormat):
raise ValueError(f"Unsupported format: {format_str} ({input_file_name})")
return InputFileFormat(format_str)

def column_mappings(self, input_file_name: str) -> dict[str, str]:
return self._input_file(input_file_name).get(_COLUMN_MAPPINGS_FIELD, {})

def computed_variables(self, input_file_name: str) -> list[str]:
return self._input_file(input_file_name).get(_COMPUTED_VARIABLES_FIELD, [])

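For illustration only (not part of the commit): a minimal sketch of how the new accessors behave, in the same style as config_test.py below. The file name foo.csv and the header names are placeholders.

from stats.config import Config
from stats.data import InputFileFormat

config = Config({
    "inputFiles": {
        "foo.csv": {
            "importType": "observations",
            "format": "variablePerRow",
            "columnMappings": {
                "variable": "Observed Variable",
                "entity": "Observed Location"
            }
        }
    }
})

# format() returns an InputFileFormat member, or None when the field is absent.
assert config.format("foo.csv") == InputFileFormat.VARIABLE_PER_ROW
# column_mappings() defaults to an empty dict when the field is absent.
assert config.column_mappings("foo.csv") == {
    "variable": "Observed Variable",
    "entity": "Observed Location"
}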
1 change: 1 addition & 0 deletions simple/stats/constants.py
@@ -71,6 +71,7 @@
COLUMN_DATE = "date"
COLUMN_VALUE = "value"
COLUMN_PROVENANCE = "provenance"
COLUMN_ENTITY = "entity"

# Debug CSV columns and values
DEBUG_COLUMN_INPUT = "input"
5 changes: 5 additions & 0 deletions simple/stats/data.py
@@ -296,6 +296,11 @@ class ImportType(StrEnum):
EVENTS = "events"


class InputFileFormat(StrEnum):
VARIABLE_PER_ROW = "variablePerRow"
VARIABLE_PER_COLUMN = "variablePerColumn"


class TimePeriod(StrEnum):
DAY = "day"
MONTH = "month"
8 changes: 8 additions & 0 deletions simple/stats/runner.py
@@ -19,6 +19,7 @@
from stats import constants
from stats.config import Config
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.db import create_db
from stats.db import create_main_dc_config
from stats.db import create_sqlite_config
@@ -31,6 +32,7 @@
from stats.nodes import Nodes
from stats.observations_importer import ObservationsImporter
from stats.reporter import ImportReporter
from stats.variable_per_row_importer import VariablePerRowImporter
from util.filehandler import create_file_handler
from util.filehandler import FileHandler

@@ -173,6 +175,12 @@ def _create_importer(self, input_fh: FileHandler) -> Importer:
reporter = self.reporter.import_file(input_file)

if import_type == ImportType.OBSERVATIONS:
input_file_format = self.config.format(input_file)
if input_file_format == InputFileFormat.VARIABLE_PER_ROW:
return VariablePerRowImporter(input_fh=input_fh,
db=self.db,
reporter=reporter,
nodes=self.nodes)
return ObservationsImporter(input_fh=input_fh,
db=self.db,
debug_resolve_fh=debug_resolve_fh,
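A hedged illustration of the dispatch above (a restatement of the branch, not the runner's API; the file names are placeholders): only entries whose format resolves to VARIABLE_PER_ROW are routed to the new importer, while files with no format, or with variablePerColumn, keep using ObservationsImporter.

from stats.config import Config
from stats.data import InputFileFormat

config = Config({
    "inputFiles": {
        "variable_per_row.csv": {
            "importType": "observations",
            "format": "variablePerRow"
        },
        "countries.csv": {
            "importType": "observations"
        }
    }
})

for name in ("variable_per_row.csv", "countries.csv"):
  if config.format(name) == InputFileFormat.VARIABLE_PER_ROW:
    print(name, "-> VariablePerRowImporter")
  else:
    # format is None or variablePerColumn: existing column-based path.
    print(name, "-> ObservationsImporter")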
96 changes: 96 additions & 0 deletions simple/stats/variable_per_row_importer.py
@@ -0,0 +1,96 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from csv import DictReader
import logging
import random

from stats import constants
from stats.data import Observation
from stats.db import Db
from stats.importer import Importer
from stats.nodes import Nodes
from stats.reporter import FileImportReporter
from util.filehandler import FileHandler

_COLUMNS = [
constants.COLUMN_ENTITY, constants.COLUMN_VARIABLE, constants.COLUMN_DATE,
constants.COLUMN_VALUE
]
_DEFAULT_COLUMN_MAPPINGS = {x: x for x in _COLUMNS}


class VariablePerRowImporter(Importer):
"""Imports a single observations input file where variables are specified in rows (aka "SVObs").
This is in contrast to the ObservationsImporter where variables are specified in columns.
Currently this importer only writes observations, not entities.
It also does not resolve entities and expects them to be pre-resolved.
"""

def __init__(self, input_fh: FileHandler, db: Db,
reporter: FileImportReporter, nodes: Nodes) -> None:
self.input_fh = input_fh
self.db = db
self.reporter = reporter
self.input_file_name = self.input_fh.basename()
self.nodes = nodes
self.config = nodes.config
# Set/updated after the CSV is read (see _read_csv and _map_columns).
self.column_mappings = dict(_DEFAULT_COLUMN_MAPPINGS)
self.reader: DictReader | None = None

def do_import(self) -> None:
self.reporter.report_started()
try:
self._read_csv()
self._map_columns()
self._write_observations()
self.reporter.report_success()
except Exception as e:
self.reporter.report_failure(str(e))
raise e

def _read_csv(self) -> None:
self.reader = DictReader(self.input_fh.read_string_io())

def _map_columns(self):
config_mappings = self.config.column_mappings(self.input_file_name)
for key in self.column_mappings.keys():
if key in config_mappings:
self.column_mappings[key] = config_mappings[key]

# Ensure that the expected columns exist.
expected_column_names = set(self.column_mappings.values())
logging.info("Expected column names: %s", expected_column_names)
actual_column_names = set(self.reader.fieldnames)
logging.info("Actual column names: %s", actual_column_names)
difference = expected_column_names - actual_column_names
if difference:
raise ValueError(
f"The following expected columns were not found: {difference}. You can specify column mappings using the columnMappings field."
)

def _write_observations(self) -> None:
provenance = self.nodes.provenance(self.input_file_name).id
observations: list[Observation] = []
for row in self.reader:
observation = Observation(
entity=row[self.column_mappings[constants.COLUMN_ENTITY]],
variable=row[self.column_mappings[constants.COLUMN_VARIABLE]],
date=row[self.column_mappings[constants.COLUMN_DATE]],
value=row[self.column_mappings[constants.COLUMN_VALUE]],
provenance=provenance)
observations.append(observation)
self.db.insert_observations(observations, self.input_file_name)
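A self-contained sketch (not part of the commit) of the column-mapping behavior implemented by _map_columns above. remap_headers is a hypothetical helper, and the CSV text mirrors the "Observed Variable,Observed Location" test file added in this commit.

from csv import DictReader
from io import StringIO

_DEFAULT_COLUMN_MAPPINGS = {
    "entity": "entity",
    "variable": "variable",
    "date": "date",
    "value": "value"
}


def remap_headers(config_mappings: dict[str, str]) -> dict[str, str]:
  # Start from the defaults and override only the keys the config provides,
  # exactly as _map_columns does.
  mappings = dict(_DEFAULT_COLUMN_MAPPINGS)
  for key in mappings:
    if key in config_mappings:
      mappings[key] = config_mappings[key]
  return mappings


csv_text = """Observed Variable,Observed Location,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
"""

mappings = remap_headers({
    "variable": "Observed Variable",
    "entity": "Observed Location"
})
reader = DictReader(StringIO(csv_text))

# Fail early if any expected (mapped) column is missing from the CSV header.
missing = set(mappings.values()) - set(reader.fieldnames)
assert not missing, f"The following expected columns were not found: {missing}"

for row in reader:
  # Each row yields one observation-like record: entity, variable, date, value.
  print(row[mappings["entity"]], row[mappings["variable"]],
        row[mappings["date"]], row[mappings["value"]])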
28 changes: 28 additions & 0 deletions simple/tests/stats/config_test.py
@@ -18,6 +18,7 @@
from stats.data import AggregationConfig
from stats.data import AggregationMethod
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
@@ -248,3 +249,30 @@ def test_input_file(self):
}
}
})._input_file("foo1.csv"), {}, "no wildcard match")

def test_input_file_format(self):
config = Config({})
self.assertEqual(config.format("foo.csv"), None, "empty")

config = Config({"inputFiles": {"foo.csv": {"format": "variablePerRow"}}})
self.assertEqual(config.format("foo.csv"), InputFileFormat.VARIABLE_PER_ROW)

config = Config(
{"inputFiles": {
"foo.csv": {
"format": "variablePerColumn"
}
}})
self.assertEqual(config.format("foo.csv"),
InputFileFormat.VARIABLE_PER_COLUMN)

config = Config({"inputFiles": {"foo.csv": {"format": "INVALID"}}})
with self.assertRaisesRegex(ValueError, "Unsupported format"):
config.format("foo.csv")

def test_column_mappings(self):
config = Config({})
self.assertDictEqual(config.column_mappings("foo.csv"), {}, "empty")

config = Config({"inputFiles": {"foo.csv": {"columnMappings": {"x": "y"}}}})
self.assertDictEqual(config.column_mappings("foo.csv"), {"x": "y"})
8 changes: 8 additions & 0 deletions simple/tests/stats/test_data/runner/config/config_driven.json
@@ -3,11 +3,19 @@
"inputFiles": {
"countries.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"wikidataids.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -3,8 +3,15 @@
"tests/stats/test_data/runner/input/config_with_wildcards"
],
"inputFiles": {
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"*.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -2,11 +2,19 @@
"inputFiles": {
"countries.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"wikidataids.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -0,0 +1,7 @@
entity,variable,date,value,provenance
country/BRA,var1,2023,0.19,c/p/default
country/BRA,var2,2023,6,c/p/default
country/JPN,var1,2023,0.21,c/p/default
country/JPN,var2,2023,56,c/p/default
country/USA,var2,2023,66,c/p/default
country/CHN,var1,2022,-123.456,c/p/default
@@ -0,0 +1,7 @@
entity,variable,date,value,provenance
country/BRA,var1,2023,0.19,c/p/default
country/BRA,var2,2023,6,c/p/default
country/JPN,var1,2023,0.21,c/p/default
country/JPN,var2,2023,56,c/p/default
country/USA,var2,2023,66,c/p/default
country/CHN,var1,2022,-123.456,c/p/default
@@ -0,0 +1,7 @@
Observed Variable,Observed Location,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
var1,country/JPN,2023,0.21
var2,country/JPN,2023,56
var2,country/USA,2023,66
var1,country/CHN,2022,-123.456
@@ -0,0 +1,7 @@
variable,entity,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
var1,country/JPN,2023,0.21
var2,country/JPN,2023,56
var2,country/USA,2023,66
var1,country/CHN,2022,-123.456
