From d38b9a2fbdcbb6e041cd98aeed73119593d50998 Mon Sep 17 00:00:00 2001 From: Keyur Shah Date: Fri, 12 Apr 2024 18:45:07 -0700 Subject: [PATCH] Insert generated SVG hierarchy triples into DB. (#301) --- simple/sample/input/config.json | 3 +- simple/sample/output/tables/triples.csv | 17 ++++++++ simple/stats/config.md | 6 +++ simple/stats/config.py | 4 ++ simple/stats/db.py | 35 +++++++++++++++- simple/stats/runner.py | 24 +++++++++++ simple/tests/stats/db_test.py | 7 ++++ simple/tests/stats/runner_test.py | 3 ++ .../observations.db.csv | 5 +++ .../generate_svg_hierarchy/triples.db.csv | 41 +++++++++++++++++++ .../input/generate_svg_hierarchy/config.json | 18 ++++++++ .../generate_svg_hierarchy/observations.csv | 5 +++ .../generate_svg_hierarchy/variables.mcf | 16 ++++++++ 13 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/observations.db.csv create mode 100644 simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/triples.db.csv create mode 100644 simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/config.json create mode 100644 simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/observations.csv create mode 100644 simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/variables.mcf diff --git a/simple/sample/input/config.json b/simple/sample/input/config.json index 09b30f3d..8d767607 100644 --- a/simple/sample/input/config.json +++ b/simple/sample/input/config.json @@ -79,5 +79,6 @@ "Provenance2 Name": "http://source1.com/provenance2" } } - } + }, + "groupStatVarsByProperty": true } diff --git a/simple/sample/output/tables/triples.csv b/simple/sample/output/tables/triples.csv index ee2705b4..b10698a1 100644 --- a/simple/sample/output/tables/triples.csv +++ b/simple/sample/output/tables/triples.csv @@ -421,3 +421,20 @@ dc/02b53twnh3fx,typeOf,PowerPlant,"" dc/dk2p9l3l8x1b6,typeOf,PowerPlant,"" 
s2CellId/0x80982b0000000000,typeOf,S2CellLevel10,"" s2CellId/0x3be7c90000000000,typeOf,S2CellLevel10,"" +c/g/Person,typeOf,StatVarGroup,"" +c/g/Person,name,"",Person +c/g/Person,specializationOf,dc/g/Root,"" +c/g/Person_Gender,typeOf,StatVarGroup,"" +c/g/Person_Gender,name,"","Person With Gender" +c/g/Person_Gender,specializationOf,c/g/Person,"" +c/g/Person_Gender-Female,typeOf,StatVarGroup,"" +c/g/Person_Gender-Female,name,"","Person With Gender = Female" +c/g/Person_Gender-Female,specializationOf,c/g/Person_Gender,"" +var1,memberOf,c/g/Person_Gender-Female,"" +c/g/Thing,typeOf,StatVarGroup,"" +c/g/Thing,name,"",Thing +c/g/Thing,specializationOf,dc/g/Root,"" +var2,memberOf,c/g/Thing,"" +Variable_1,memberOf,c/g/Thing,"" +Variable_2,memberOf,c/g/Thing,"" +Crime_Count,memberOf,c/g/Thing,"" diff --git a/simple/stats/config.md b/simple/stats/config.md index 18fbb159..872b5afc 100644 --- a/simple/stats/config.md +++ b/simple/stats/config.md @@ -182,3 +182,9 @@ Local directory: "dataDownloadUrl": ["//local/path/to/dir"] } ``` + +## `groupStatVarsByProperty` + +If `true`, auto generates a hierarchy of groups based on properties of variables in the dataset. Default is `false`. + +> TODO: Add more details. diff --git a/simple/stats/config.py b/simple/stats/config.py index f4322bd9..0ba021eb 100644 --- a/simple/stats/config.py +++ b/simple/stats/config.py @@ -49,6 +49,7 @@ _ROW_ENTITY_TYPE_FIELD = "rowEntityType" _ENTITY_COLUMNS = "entityColumns" _ENTITIES_FIELD = "entities" +_GROUP_STAT_VARS_BY_PROPERTY = "groupStatVarsByProperty" class Config: @@ -159,6 +160,9 @@ def entity_columns(self, input_file_name: str) -> list[str]: def database(self) -> dict: return self.data.get(_DATABASE_FIELD) + def generate_hierarchy(self) -> bool: + return self.data.get(_GROUP_STAT_VARS_BY_PROPERTY) or False + def _input_file(self, input_file_name: str) -> dict: # Exact match. 
input_file_config = self._input_files_config.get(input_file_name, {}) diff --git a/simple/stats/db.py b/simple/stats/db.py index affe2dec..752540d1 100644 --- a/simple/stats/db.py +++ b/simple/stats/db.py @@ -20,6 +20,7 @@ import os import sqlite3 import tempfile +from typing import Any from google.cloud.sql.connector.connector import Connector import pandas as pd @@ -92,6 +93,8 @@ _INSERT_IMPORTS_STATEMENT = "insert into imports values(?, ?, ?)" +_SELECT_TRIPLES_BY_SUBJECT_TYPE = "select * from triples where subject_id in (select subject_id from triples where predicate = 'typeOf' and object_id = ?)" + _INIT_STATEMENTS = [ _CREATE_TRIPLES_TABLE, _CREATE_OBSERVATIONS_TABLE, @@ -139,6 +142,10 @@ def insert_import_info(self, status: ImportStatus): def commit_and_close(self): pass + # Returns all triples of nodes with the specified "typeOf" predicate. + def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]: + pass + class MainDcDb(Db): """Generates output for main DC. @@ -182,6 +189,10 @@ def commit_and_close(self): self.output_dir_fh.make_file(OBSERVATIONS_TMCF_FILE_NAME).write_string( OBSERVATIONS_TMCF) + # Not supported for main DC at this time. 
+ def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]: + return [] + def _add_triple(self, triple: Triple): node = self.nodes.get(triple.subject_id) if not node: @@ -226,6 +237,11 @@ def insert_import_info(self, status: ImportStatus): def commit_and_close(self): self.engine.commit_and_close() + def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]: + tuples = self.engine.fetch_all(_SELECT_TRIPLES_BY_SUBJECT_TYPE, + (subject_type,)) + return [from_triple_tuple(row) for row in tuples] + def _import_metadata(self) -> dict: return { "numVars": len(self.variables), @@ -233,6 +249,10 @@ def _import_metadata(self) -> dict: } +def from_triple_tuple(tuple: tuple) -> Triple: + return Triple(*tuple) + + def to_triple_tuple(triple: Triple): return (_strip_namespace(triple.subject_id), triple.predicate, _strip_namespace(triple.object_id), triple.object_value) @@ -256,6 +276,9 @@ def execute(self, sql: str, parameters=None): def executemany(self, sql: str, parameters=None): pass + def fetch_all(self, sql: str, parameters=None) -> list[Any]: + pass + def commit_and_close(self): pass @@ -296,6 +319,12 @@ def executemany(self, sql: str, parameters=None): else: self.cursor.executemany(sql, parameters) + def fetch_all(self, sql: str, parameters=None) -> list[Any]: + if not parameters: + return self.cursor.execute(sql).fetchall() + else: + return self.cursor.execute(sql, parameters).fetchall() + def commit_and_close(self): self.connection.commit() self.connection.close() @@ -314,7 +343,7 @@ def commit_and_close(self): _CLOUD_MY_SQL_PARAMS = [CLOUD_MY_SQL_INSTANCE] + _CLOUD_MY_SQL_CONNECT_PARAMS -class CloudSqlDbEngine: +class CloudSqlDbEngine(DbEngine): def __init__(self, db_params: dict[str, str]) -> None: for param in _CLOUD_MY_SQL_PARAMS: @@ -341,6 +370,10 @@ def execute(self, sql: str, parameters=None): def executemany(self, sql: str, parameters=None): self.cursor.executemany(_pymysql(sql), parameters) + def fetch_all(self, sql: 
str, parameters=None): + self.cursor.execute(_pymysql(sql), parameters) + return self.cursor.fetchall() + def commit_and_close(self): self.cursor.close() self.connection.commit() diff --git a/simple/stats/runner.py b/simple/stats/runner.py index cb8f32c3..4eb5c3a6 100644 --- a/simple/stats/runner.py +++ b/simple/stats/runner.py @@ -17,6 +17,7 @@ import logging from stats import constants +from stats import stat_var_hierarchy_generator from stats.config import Config from stats.data import ImportType from stats.data import InputFileFormat @@ -34,6 +35,7 @@ from stats.nodes import Nodes from stats.observations_importer import ObservationsImporter from stats.reporter import ImportReporter +import stats.schema_constants as sc from stats.variable_per_row_importer import VariablePerRowImporter from util.filehandler import create_file_handler from util.filehandler import FileHandler @@ -132,6 +134,9 @@ def run(self): # Write triples to DB. self.db.insert_triples(triples) + # Generate SVG hierarchy. + self._generate_svg_hierarchy() + # Generate SV sentences. 
nl.generate_sv_sentences( list(self.nodes.variables.values()), @@ -149,6 +154,25 @@ def run(self): logging.exception("Error running import") self.reporter.report_failure(error=str(e)) + def _generate_svg_hierarchy(self): + if self.mode == RunMode.MAIN_DC: + logging.info("Hierarchy generation not supported for main dc, skipping.") + return + if not self.config.generate_hierarchy(): + logging.info("Hierarchy generation not enabled, skipping.") + return + + logging.info("Generating SVG hierarchy.") + sv_triples = self.db.select_triples_by_subject_type( + sc.TYPE_STATISTICAL_VARIABLE) + if not sv_triples: + logging.info("No SV triples found, skipping SVG hierarchy generation.") + return + logging.info("Generating SVG hierarchy for %s SV triples.", len(sv_triples)) + svg_triples = stat_var_hierarchy_generator.generate(sv_triples) + logging.info("Inserting %s SVG triples into DB.", len(svg_triples)) + self.db.insert_triples(svg_triples) + def _run_imports(self): input_fhs: list[FileHandler] = [] input_mcf_fhs: list[FileHandler] = [] diff --git a/simple/tests/stats/db_test.py b/simple/tests/stats/db_test.py index fa0d800b..8c3eef24 100644 --- a/simple/tests/stats/db_test.py +++ b/simple/tests/stats/db_test.py @@ -66,6 +66,13 @@ def test_sql_db(self): db.insert_triples(_TRIPLES) db.insert_observations(_OBSERVATIONS, "foo.csv") db.insert_import_info(status=ImportStatus.SUCCESS) + + sv_triples = db.select_triples_by_subject_type("StatisticalVariable") + self.assertListEqual(sv_triples, _TRIPLES) + + svg_triples = db.select_triples_by_subject_type("StatVarGroup") + self.assertListEqual(svg_triples, []) + db.commit_and_close() sqldb = sqlite3.connect(db_file_path) diff --git a/simple/tests/stats/runner_test.py b/simple/tests/stats/runner_test.py index 71bae6dd..8de6af26 100644 --- a/simple/tests/stats/runner_test.py +++ b/simple/tests/stats/runner_test.py @@ -111,3 +111,6 @@ def test_config_with_wildcards(self): def test_input_dir_driven(self): _test_runner(self, "input_dir_driven", 
is_config_driven=False) + + def test_generate_svg_hierarchy(self): + _test_runner(self, "generate_svg_hierarchy", is_config_driven=False) diff --git a/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/observations.db.csv b/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/observations.db.csv new file mode 100644 index 00000000..f952e02d --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/observations.db.csv @@ -0,0 +1,5 @@ +entity,variable,date,value,provenance +country/IND,var1,2020,0.16,c/p/1 +country/IND,var2,2020,53,c/p/1 +country/CHN,var1,2020,0.23,c/p/1 +country/CHN,var2,2020,67,c/p/1 diff --git a/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/triples.db.csv b/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/triples.db.csv new file mode 100644 index 00000000..522fd1e5 --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/generate_svg_hierarchy/triples.db.csv @@ -0,0 +1,41 @@ +subject_id,predicate,object_id,object_value +some_var1,typeOf,StatisticalVariable, +some_var1,measuredProperty,value, +some_var1,name,,Some Variable 1 Name +some_var1,description,,Some Variable 1 Description +some_var1,populationType,Person, +some_var1,gender,Female, +some_var2,typeOf,StatisticalVariable, +some_var2,measuredProperty,value, +some_var2,name,,Some Variable 2 Name +some_var2,description,,Some Variable 2 Description +some_var2,populationType,Person, +some_var2,gender,Male, +c/s/default,typeOf,Source, +c/s/default,name,,Custom Data Commons +c/s/1,typeOf,Source, +c/s/1,name,,Source1 +c/s/1,url,,http://source1.com +c/s/1,domain,,source1.com +c/p/default,typeOf,Provenance, +c/p/default,name,,Custom Import +c/p/default,source,c/s/default, +c/p/default,url,,custom-import +c/p/1,typeOf,Provenance, +c/p/1,name,,Provenance1 +c/p/1,source,c/s/1, +c/p/1,url,,http://source1.com/provenance1 +c/g/Person,typeOf,StatVarGroup, +c/g/Person,name,,Person 
+c/g/Person,specializationOf,dc/g/Root, +c/g/Person_Gender,typeOf,StatVarGroup, +c/g/Person_Gender,name,,Person With Gender +c/g/Person_Gender,specializationOf,c/g/Person, +c/g/Person_Gender-Female,typeOf,StatVarGroup, +c/g/Person_Gender-Female,name,,Person With Gender = Female +c/g/Person_Gender-Female,specializationOf,c/g/Person_Gender, +some_var1,memberOf,c/g/Person_Gender-Female, +c/g/Person_Gender-Male,typeOf,StatVarGroup, +c/g/Person_Gender-Male,name,,Person With Gender = Male +c/g/Person_Gender-Male,specializationOf,c/g/Person_Gender, +some_var2,memberOf,c/g/Person_Gender-Male, diff --git a/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/config.json b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/config.json new file mode 100644 index 00000000..457722d8 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/config.json @@ -0,0 +1,18 @@ +{ + "inputFiles": { + "*.csv": { + "importType": "observations", + "format": "variablePerRow", + "provenance": "Provenance1" + } + }, + "sources": { + "Source1": { + "url": "http://source1.com", + "provenances": { + "Provenance1": "http://source1.com/provenance1" + } + } + }, + "groupStatVarsByProperty": true +} diff --git a/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/observations.csv b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/observations.csv new file mode 100644 index 00000000..9c18cffa --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/observations.csv @@ -0,0 +1,5 @@ +entity,variable,date,value +country/IND,var1,2020,0.16 +country/IND,var2,2020,53 +country/CHN,var1,2020,0.23 +country/CHN,var2,2020,67 \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/variables.mcf b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/variables.mcf new file mode 100644 index 00000000..7370dee8 --- /dev/null +++ 
b/simple/tests/stats/test_data/runner/input/generate_svg_hierarchy/variables.mcf @@ -0,0 +1,16 @@ +Node: v1 +dcid:"some_var1" +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 1 Name" +description: "Some Variable 1 Description" +populationType: schema:Person +gender: dcs:Female + +Node: dcid:some_var2 +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 2 Name" +description: "Some Variable 2 Description" +populationType: schema:Person +gender: dcs:Male