Add support for CSVs in SVObs format. (#290)
keyurva authored Mar 11, 2024
1 parent e8b0ada commit fc90a1d
Showing 20 changed files with 336 additions and 0 deletions.
14 changes: 14 additions & 0 deletions simple/stats/config.py
@@ -17,6 +17,7 @@
from stats.data import AggregationConfig
from stats.data import EventType
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
@@ -42,6 +43,8 @@
_AGGREGATION_FIELD = "aggregation"
_PROPERTIES_FIELD = "properties"
_DATA_DOWNLOAD_URL_FIELD = "dataDownloadUrl"
_FORMAT_FIELD = "format"
_COLUMN_MAPPINGS_FIELD = "columnMappings"


class Config:
@@ -83,6 +86,17 @@ def import_type(self, input_file_name: str) -> ImportType:
f"Unsupported import type: {import_type_str} ({input_file_name})")
return ImportType(import_type_str)

def format(self, input_file_name: str) -> InputFileFormat | None:
format_str = self._input_file(input_file_name).get(_FORMAT_FIELD)
if not format_str:
return None
if format_str not in iter(InputFileFormat):
raise ValueError(f"Unsupported format: {format_str} ({input_file_name})")
return InputFileFormat(format_str)

def column_mappings(self, input_file_name: str) -> dict[str, str]:
return self._input_file(input_file_name).get(_COLUMN_MAPPINGS_FIELD, {})

def computed_variables(self, input_file_name: str) -> list[str]:
return self._input_file(input_file_name).get(_COMPUTED_VARIABLES_FIELD, [])

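For illustration only (not part of the commit): a minimal sketch of how the new accessors behave, in the same style as config_test.py below. The file name foo.csv and the header names are placeholders.

from stats.config import Config
from stats.data import InputFileFormat

config = Config({
    "inputFiles": {
        "foo.csv": {
            "importType": "observations",
            "format": "variablePerRow",
            "columnMappings": {
                "variable": "Observed Variable",
                "entity": "Observed Location"
            }
        }
    }
})

# format() returns an InputFileFormat member, or None when the field is absent.
assert config.format("foo.csv") == InputFileFormat.VARIABLE_PER_ROW
# column_mappings() defaults to an empty dict when the field is absent.
assert config.column_mappings("foo.csv") == {
    "variable": "Observed Variable",
    "entity": "Observed Location"
}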
1 change: 1 addition & 0 deletions simple/stats/constants.py
@@ -71,6 +71,7 @@
COLUMN_DATE = "date"
COLUMN_VALUE = "value"
COLUMN_PROVENANCE = "provenance"
COLUMN_ENTITY = "entity"

# Debug CSV columns and values
DEBUG_COLUMN_INPUT = "input"
5 changes: 5 additions & 0 deletions simple/stats/data.py
@@ -296,6 +296,11 @@ class ImportType(StrEnum):
EVENTS = "events"


class InputFileFormat(StrEnum):
VARIABLE_PER_ROW = "variablePerRow"
VARIABLE_PER_COLUMN = "variablePerColumn"


class TimePeriod(StrEnum):
DAY = "day"
MONTH = "month"
8 changes: 8 additions & 0 deletions simple/stats/runner.py
@@ -19,6 +19,7 @@
from stats import constants
from stats.config import Config
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.db import create_db
from stats.db import create_main_dc_config
from stats.db import create_sqlite_config
@@ -31,6 +32,7 @@
from stats.nodes import Nodes
from stats.observations_importer import ObservationsImporter
from stats.reporter import ImportReporter
from stats.variable_per_row_importer import VariablePerRowImporter
from util.filehandler import create_file_handler
from util.filehandler import FileHandler

@@ -173,6 +175,12 @@ def _create_importer(self, input_fh: FileHandler) -> Importer:
reporter = self.reporter.import_file(input_file)

if import_type == ImportType.OBSERVATIONS:
input_file_format = self.config.format(input_file)
if input_file_format == InputFileFormat.VARIABLE_PER_ROW:
return VariablePerRowImporter(input_fh=input_fh,
db=self.db,
reporter=reporter,
nodes=self.nodes)
return ObservationsImporter(input_fh=input_fh,
db=self.db,
debug_resolve_fh=debug_resolve_fh,
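A hedged illustration of the dispatch above (a restatement of the branch, not the runner's API; the file names are placeholders): only entries whose format resolves to VARIABLE_PER_ROW are routed to the new importer, while files with no format, or with variablePerColumn, keep using ObservationsImporter.

from stats.config import Config
from stats.data import InputFileFormat

config = Config({
    "inputFiles": {
        "variable_per_row.csv": {
            "importType": "observations",
            "format": "variablePerRow"
        },
        "countries.csv": {
            "importType": "observations"
        }
    }
})

for name in ("variable_per_row.csv", "countries.csv"):
  if config.format(name) == InputFileFormat.VARIABLE_PER_ROW:
    print(name, "-> VariablePerRowImporter")
  else:
    # format is None or variablePerColumn: existing column-based path.
    print(name, "-> ObservationsImporter")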
96 changes: 96 additions & 0 deletions simple/stats/variable_per_row_importer.py
@@ -0,0 +1,96 @@
# Copyright 2024 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from csv import DictReader
import logging
import random

from stats import constants
from stats.data import Observation
from stats.db import Db
from stats.importer import Importer
from stats.nodes import Nodes
from stats.reporter import FileImportReporter
from util.filehandler import FileHandler

_COLUMNS = [
constants.COLUMN_ENTITY, constants.COLUMN_VARIABLE, constants.COLUMN_DATE,
constants.COLUMN_VALUE
]
_DEFAULT_COLUMN_MAPPINGS = {x: x for x in _COLUMNS}


class VariablePerRowImporter(Importer):
"""Imports a single observations input file where variables are specified in rows (aka "SVObs").
This is in contrast to the ObservationsImporter where variables are specified in columns.
Currently this importer only writes observations, not entities.
It also does not resolve entities and expects them to be pre-resolved.
"""

def __init__(self, input_fh: FileHandler, db: Db,
reporter: FileImportReporter, nodes: Nodes) -> None:
self.input_fh = input_fh
self.db = db
self.reporter = reporter
self.input_file_name = self.input_fh.basename()
self.nodes = nodes
self.config = nodes.config
# Set/updated after the CSV is read (see _read_csv and _map_columns).
self.column_mappings = dict(_DEFAULT_COLUMN_MAPPINGS)
self.reader: DictReader | None = None

def do_import(self) -> None:
self.reporter.report_started()
try:
self._read_csv()
self._map_columns()
self._write_observations()
self.reporter.report_success()
except Exception as e:
self.reporter.report_failure(str(e))
raise e

def _read_csv(self) -> None:
self.reader = DictReader(self.input_fh.read_string_io())

def _map_columns(self):
config_mappings = self.config.column_mappings(self.input_file_name)
for key in self.column_mappings.keys():
if key in config_mappings:
self.column_mappings[key] = config_mappings[key]

# Ensure that the expected columns exist.
expected_column_names = set(self.column_mappings.values())
logging.info("Expected column names: %s", expected_column_names)
actual_column_names = set(self.reader.fieldnames)
logging.info("Actual column names: %s", actual_column_names)
difference = expected_column_names - actual_column_names
if difference:
raise ValueError(
f"The following expected columns were not found: {difference}. You can specify column mappings using the columnMappings field."
)

def _write_observations(self) -> None:
provenance = self.nodes.provenance(self.input_file_name).id
observations: list[Observation] = []
for row in self.reader:
observation = Observation(
entity=row[self.column_mappings[constants.COLUMN_ENTITY]],
variable=row[self.column_mappings[constants.COLUMN_VARIABLE]],
date=row[self.column_mappings[constants.COLUMN_DATE]],
value=row[self.column_mappings[constants.COLUMN_VALUE]],
provenance=provenance)
observations.append(observation)
self.db.insert_observations(observations, self.input_file_name)
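A self-contained sketch (not part of the commit) of the column-mapping behavior implemented by _map_columns above. remap_headers is a hypothetical helper, and the CSV text mirrors the "Observed Variable,Observed Location" test file added in this commit.

from csv import DictReader
from io import StringIO

_DEFAULT_COLUMN_MAPPINGS = {
    "entity": "entity",
    "variable": "variable",
    "date": "date",
    "value": "value"
}


def remap_headers(config_mappings: dict[str, str]) -> dict[str, str]:
  # Start from the defaults and override only the keys the config provides,
  # exactly as _map_columns does.
  mappings = dict(_DEFAULT_COLUMN_MAPPINGS)
  for key in mappings:
    if key in config_mappings:
      mappings[key] = config_mappings[key]
  return mappings


csv_text = """Observed Variable,Observed Location,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
"""

mappings = remap_headers({
    "variable": "Observed Variable",
    "entity": "Observed Location"
})
reader = DictReader(StringIO(csv_text))

# Fail early if any expected (mapped) column is missing from the CSV header.
missing = set(mappings.values()) - set(reader.fieldnames)
assert not missing, f"The following expected columns were not found: {missing}"

for row in reader:
  # Each row yields one observation-like record: entity, variable, date, value.
  print(row[mappings["entity"]], row[mappings["variable"]],
        row[mappings["date"]], row[mappings["value"]])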
28 changes: 28 additions & 0 deletions simple/tests/stats/config_test.py
@@ -18,6 +18,7 @@
from stats.data import AggregationConfig
from stats.data import AggregationMethod
from stats.data import ImportType
from stats.data import InputFileFormat
from stats.data import Provenance
from stats.data import Source
from stats.data import StatVar
@@ -248,3 +249,30 @@ def test_input_file(self):
}
}
})._input_file("foo1.csv"), {}, "no wildcard match")

def test_input_file_format(self):
config = Config({})
self.assertEqual(config.format("foo.csv"), None, "empty")

config = Config({"inputFiles": {"foo.csv": {"format": "variablePerRow"}}})
self.assertEqual(config.format("foo.csv"), InputFileFormat.VARIABLE_PER_ROW)

config = Config(
{"inputFiles": {
"foo.csv": {
"format": "variablePerColumn"
}
}})
self.assertEqual(config.format("foo.csv"),
InputFileFormat.VARIABLE_PER_COLUMN)

config = Config({"inputFiles": {"foo.csv": {"format": "INVALID"}}})
with self.assertRaisesRegex(ValueError, "Unsupported format"):
config.format("foo.csv")

def test_column_mappings(self):
config = Config({})
self.assertDictEqual(config.column_mappings("foo.csv"), {}, "empty")

config = Config({"inputFiles": {"foo.csv": {"columnMappings": {"x": "y"}}}})
self.assertDictEqual(config.column_mappings("foo.csv"), {"x": "y"})
8 changes: 8 additions & 0 deletions simple/tests/stats/test_data/runner/config/config_driven.json
@@ -3,11 +3,19 @@
"inputFiles": {
"countries.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"wikidataids.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -3,8 +3,15 @@
"tests/stats/test_data/runner/input/config_with_wildcards"
],
"inputFiles": {
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"*.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -21,6 +21,10 @@ country/ASM,var2,2023,34,c/p/1
country/AIA,var2,2023,42,c/p/1
country/WLF,var2,2023,75,c/p/1
country/ESH,var2,2023,65,c/p/1
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
country/USA,var1,2021,555,c/p/1
country/IND,var1,2022,321,c/p/1
country/USA,var2,2021,666,c/p/1
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -2,11 +2,19 @@
"inputFiles": {
"countries.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"wikidataids.csv": {
"importType": "observations",
"format": "variablePerColumn",
"entityType": "Country",
"provenance": "Provenance1 Name"
},
"variable_per_row.csv": {
"importType": "observations",
"format": "variablePerRow",
"entityType": "Country",
"provenance": "Provenance1 Name"
}
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
@@ -0,0 +1,7 @@
entity,variable,date,value,provenance
country/BRA,var1,2023,0.19,c/p/default
country/BRA,var2,2023,6,c/p/default
country/JPN,var1,2023,0.21,c/p/default
country/JPN,var2,2023,56,c/p/default
country/USA,var2,2023,66,c/p/default
country/CHN,var1,2022,-123.456,c/p/default
@@ -0,0 +1,7 @@
entity,variable,date,value,provenance
country/BRA,var1,2023,0.19,c/p/default
country/BRA,var2,2023,6,c/p/default
country/JPN,var1,2023,0.21,c/p/default
country/JPN,var2,2023,56,c/p/default
country/USA,var2,2023,66,c/p/default
country/CHN,var1,2022,-123.456,c/p/default
@@ -0,0 +1,7 @@
Observed Variable,Observed Location,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
var1,country/JPN,2023,0.21
var2,country/JPN,2023,56
var2,country/USA,2023,66
var1,country/CHN,2022,-123.456
@@ -0,0 +1,7 @@
variable,entity,date,value
var1,country/BRA,2023,0.19
var2,country/BRA,2023,6
var1,country/JPN,2023,0.21
var2,country/JPN,2023,56
var2,country/USA,2023,66
var1,country/CHN,2022,-123.456
