Skip to content

Commit

Permalink
Insert generated SVG hierarchy triples into DB. (#301)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Apr 13, 2024
1 parent 7a9ff92 commit d38b9a2
Show file tree
Hide file tree
Showing 13 changed files with 182 additions and 2 deletions.
3 changes: 2 additions & 1 deletion simple/sample/input/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,5 +79,6 @@
"Provenance2 Name": "http://source1.com/provenance2"
}
}
}
},
"groupStatVarsByProperty": true
}
17 changes: 17 additions & 0 deletions simple/sample/output/tables/triples.csv
Original file line number Diff line number Diff line change
Expand Up @@ -421,3 +421,20 @@ dc/02b53twnh3fx,typeOf,PowerPlant,""
dc/dk2p9l3l8x1b6,typeOf,PowerPlant,""
s2CellId/0x80982b0000000000,typeOf,S2CellLevel10,""
s2CellId/0x3be7c90000000000,typeOf,S2CellLevel10,""
c/g/Person,typeOf,StatVarGroup,""
c/g/Person,name,"",Person
c/g/Person,specializationOf,dc/g/Root,""
c/g/Person_Gender,typeOf,StatVarGroup,""
c/g/Person_Gender,name,"","Person With Gender"
c/g/Person_Gender,specializationOf,c/g/Person,""
c/g/Person_Gender-Female,typeOf,StatVarGroup,""
c/g/Person_Gender-Female,name,"","Person With Gender = Female"
c/g/Person_Gender-Female,specializationOf,c/g/Person_Gender,""
var1,memberOf,c/g/Person_Gender-Female,""
c/g/Thing,typeOf,StatVarGroup,""
c/g/Thing,name,"",Thing
c/g/Thing,specializationOf,dc/g/Root,""
var2,memberOf,c/g/Thing,""
Variable_1,memberOf,c/g/Thing,""
Variable_2,memberOf,c/g/Thing,""
Crime_Count,memberOf,c/g/Thing,""
6 changes: 6 additions & 0 deletions simple/stats/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,9 @@ Local directory:
"dataDownloadUrl": ["//local/path/to/dir"]
}
```

## `groupStatVarsByProperty`

If `true`, automatically generates a hierarchy of variable groups based on the properties of the variables in the dataset. Defaults to `false`.

> TODO: Add more details.
4 changes: 4 additions & 0 deletions simple/stats/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
_ROW_ENTITY_TYPE_FIELD = "rowEntityType"
_ENTITY_COLUMNS = "entityColumns"
_ENTITIES_FIELD = "entities"
_GROUP_STAT_VARS_BY_PROPERTY = "groupStatVarsByProperty"


class Config:
Expand Down Expand Up @@ -159,6 +160,9 @@ def entity_columns(self, input_file_name: str) -> list[str]:
def database(self) -> dict:
  """Looks up the database field of the config data via `get` and returns it."""
  database_config = self.data.get(_DATABASE_FIELD)
  return database_config

def generate_hierarchy(self) -> bool:
  """Returns whether the config enables stat var hierarchy generation.

  Reads the field named by _GROUP_STAT_VARS_BY_PROPERTY from the config data.
  A missing or falsy value yields False.

  Returns:
    True if the configured value is truthy, False otherwise.
  """
  # `value or False` would hand back the raw truthy value (e.g. a string or a
  # number) despite the `-> bool` annotation; bool() guarantees a real boolean.
  return bool(self.data.get(_GROUP_STAT_VARS_BY_PROPERTY))

def _input_file(self, input_file_name: str) -> dict:
# Exact match.
input_file_config = self._input_files_config.get(input_file_name, {})
Expand Down
35 changes: 34 additions & 1 deletion simple/stats/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import os
import sqlite3
import tempfile
from typing import Any

from google.cloud.sql.connector.connector import Connector
import pandas as pd
Expand Down Expand Up @@ -92,6 +93,8 @@

_INSERT_IMPORTS_STATEMENT = "insert into imports values(?, ?, ?)"

# Selects every triple (all predicates, not only 'typeOf') whose subject_id has
# a 'typeOf' triple with the object_id bound to the single "?" parameter.
_SELECT_TRIPLES_BY_SUBJECT_TYPE = "select * from triples where subject_id in (select subject_id from triples where predicate = 'typeOf' and object_id = ?)"

_INIT_STATEMENTS = [
_CREATE_TRIPLES_TABLE,
_CREATE_OBSERVATIONS_TABLE,
Expand Down Expand Up @@ -139,6 +142,10 @@ def insert_import_info(self, status: ImportStatus):
def commit_and_close(self):
  """Placeholder with an empty body; does nothing and returns None.

  NOTE(review): presumably meant to be overridden by concrete DB classes -
  confirm against the enclosing class, which is not visible here.
  """

def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]:
  """Returns all triples of nodes with the specified "typeOf" predicate.

  This particular implementation is a placeholder with an empty body, so it
  returns None rather than a list.
  """


class MainDcDb(Db):
"""Generates output for main DC.
Expand Down Expand Up @@ -182,6 +189,10 @@ def commit_and_close(self):
self.output_dir_fh.make_file(OBSERVATIONS_TMCF_FILE_NAME).write_string(
OBSERVATIONS_TMCF)

# Not supported for main DC at this time.
def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]:
  """Returns an empty list: triple selection is not supported for main DC.

  Named and parameterized like the `select_triples_by_subject_type` methods
  elsewhere in this file so that this class actually overrides that interface
  instead of defining an unrelated method.

  Args:
    subject_type: The "typeOf" value to select by (ignored here).

  Returns:
    Always an empty list.
  """
  return []

# Kept so that any existing caller of the earlier, mismatched name keeps working.
def select_triples_with_type_of(self, type_of: str) -> list[Triple]:
  """Backward-compatible alias for `select_triples_by_subject_type`."""
  return self.select_triples_by_subject_type(type_of)

def _add_triple(self, triple: Triple):
node = self.nodes.get(triple.subject_id)
if not node:
Expand Down Expand Up @@ -226,13 +237,22 @@ def insert_import_info(self, status: ImportStatus):
def commit_and_close(self):
  """Delegates to the underlying engine's commit_and_close."""
  engine = self.engine
  engine.commit_and_close()

def select_triples_by_subject_type(self, subject_type: str) -> list[Triple]:
  """Fetches the triples of all subjects that have the given typeOf value.

  Runs _SELECT_TRIPLES_BY_SUBJECT_TYPE on the engine with `subject_type` as the
  only query parameter and converts each returned row with from_triple_tuple.
  """
  rows = self.engine.fetch_all(_SELECT_TRIPLES_BY_SUBJECT_TYPE, (subject_type,))
  return [from_triple_tuple(row) for row in rows]

def _import_metadata(self) -> dict:
return {
"numVars": len(self.variables),
"numObs": self.num_observations,
}


def from_triple_tuple(tuple: tuple) -> Triple:
  """Builds a Triple by passing the tuple's items as positional arguments.

  NOTE(review): presumably the tuple is a row of the triples table in column
  order (subject_id, predicate, object_id, object_value) - confirm against
  the Triple definition and the table schema.
  """
  return Triple(*tuple)


def to_triple_tuple(triple: Triple):
  """Converts a Triple into a 4-tuple of (subject, predicate, object id, value).

  The subject and object ids are passed through _strip_namespace; the predicate
  and object value are used as-is.
  """
  subject = _strip_namespace(triple.subject_id)
  object_id = _strip_namespace(triple.object_id)
  return (subject, triple.predicate, object_id, triple.object_value)
Expand All @@ -256,6 +276,9 @@ def execute(self, sql: str, parameters=None):
def executemany(self, sql: str, parameters=None):
  """Placeholder with an empty body; does nothing and returns None.

  NOTE(review): presumably overridden by concrete engines - confirm against
  the enclosing class, which is not visible here.
  """

def fetch_all(self, sql: str, parameters=None) -> list[Any]:
  """Placeholder with an empty body; returns None rather than a list.

  NOTE(review): presumably overridden by concrete engines - confirm against
  the enclosing class, which is not visible here.
  """

def commit_and_close(self):
  """Placeholder with an empty body; does nothing and returns None.

  NOTE(review): presumably overridden by concrete engines - confirm against
  the enclosing class, which is not visible here.
  """

Expand Down Expand Up @@ -296,6 +319,12 @@ def executemany(self, sql: str, parameters=None):
else:
self.cursor.executemany(sql, parameters)

def fetch_all(self, sql: str, parameters=None) -> list[Any]:
  """Executes the query on the cursor and returns all resulting rows.

  Args:
    sql: The query to execute.
    parameters: Optional query parameters; when falsy, the query is executed
      without any parameters argument.

  Returns:
    The rows produced by the cursor's fetchall().
  """
  if parameters:
    executed = self.cursor.execute(sql, parameters)
  else:
    executed = self.cursor.execute(sql)
  return executed.fetchall()

def commit_and_close(self):
self.connection.commit()
self.connection.close()
Expand All @@ -314,7 +343,7 @@ def commit_and_close(self):
_CLOUD_MY_SQL_PARAMS = [CLOUD_MY_SQL_INSTANCE] + _CLOUD_MY_SQL_CONNECT_PARAMS


class CloudSqlDbEngine:
class CloudSqlDbEngine(DbEngine):

def __init__(self, db_params: dict[str, str]) -> None:
for param in _CLOUD_MY_SQL_PARAMS:
Expand All @@ -341,6 +370,10 @@ def execute(self, sql: str, parameters=None):
def executemany(self, sql: str, parameters=None):
  """Passes the _pymysql-converted statement and parameters to cursor.executemany."""
  statement = _pymysql(sql)
  self.cursor.executemany(statement, parameters)

def fetch_all(self, sql: str, parameters=None):
  """Executes the _pymysql-converted query and returns the cursor's fetchall()."""
  cursor = self.cursor
  statement = _pymysql(sql)
  cursor.execute(statement, parameters)
  return cursor.fetchall()

def commit_and_close(self):
self.cursor.close()
self.connection.commit()
Expand Down
24 changes: 24 additions & 0 deletions simple/stats/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import logging

from stats import constants
from stats import stat_var_hierarchy_generator
from stats.config import Config
from stats.data import ImportType
from stats.data import InputFileFormat
Expand All @@ -34,6 +35,7 @@
from stats.nodes import Nodes
from stats.observations_importer import ObservationsImporter
from stats.reporter import ImportReporter
import stats.schema_constants as sc
from stats.variable_per_row_importer import VariablePerRowImporter
from util.filehandler import create_file_handler
from util.filehandler import FileHandler
Expand Down Expand Up @@ -132,6 +134,9 @@ def run(self):
# Write triples to DB.
self.db.insert_triples(triples)

# Generate SVG hierarchy.
self._generate_svg_hierarchy()

# Generate SV sentences.
nl.generate_sv_sentences(
list(self.nodes.variables.values()),
Expand All @@ -149,6 +154,25 @@ def run(self):
logging.exception("Error running import")
self.reporter.report_failure(error=str(e))

def _generate_svg_hierarchy(self):
  """Generates StatVarGroup (SVG) hierarchy triples and inserts them into the DB.

  Does nothing beyond logging when the run mode is main DC, when the config
  does not enable hierarchy generation, or when the DB returns no
  StatisticalVariable triples.
  """
  if self.mode == RunMode.MAIN_DC:
    logging.info("Hierarchy generation not supported for main dc, skipping.")
    return
  if not self.config.generate_hierarchy():
    logging.info("Hierarchy generation not enabled, skipping.")
    return

  logging.info("Generating SVG hierarchy.")
  sv_triples = self.db.select_triples_by_subject_type(
      sc.TYPE_STATISTICAL_VARIABLE)
  if not sv_triples:
    logging.info("No SV triples found, skipping SVG generating hierarchy.")
    # Actually skip, as the message says. Without this return, execution fell
    # through to len(sv_triples) (a TypeError if the DB returned None) and to
    # a pointless generate/insert on an empty input.
    return
  logging.info("Generating SVG hierarchy for %s SV triples.", len(sv_triples))

  svg_triples = stat_var_hierarchy_generator.generate(sv_triples)
  logging.info("Inserting %s SVG triples into DB.", len(svg_triples))
  self.db.insert_triples(svg_triples)

def _run_imports(self):
input_fhs: list[FileHandler] = []
input_mcf_fhs: list[FileHandler] = []
Expand Down
7 changes: 7 additions & 0 deletions simple/tests/stats/db_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,13 @@ def test_sql_db(self):
db.insert_triples(_TRIPLES)
db.insert_observations(_OBSERVATIONS, "foo.csv")
db.insert_import_info(status=ImportStatus.SUCCESS)

sv_triples = db.select_triples_by_subject_type("StatisticalVariable")
self.assertListEqual(sv_triples, _TRIPLES)

svg_triples = db.select_triples_by_subject_type("StatVarGroup")
self.assertListEqual(svg_triples, [])

db.commit_and_close()

sqldb = sqlite3.connect(db_file_path)
Expand Down
3 changes: 3 additions & 0 deletions simple/tests/stats/runner_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,6 @@ def test_config_with_wildcards(self):

def test_input_dir_driven(self):
  # Runs the shared _test_runner helper on the "input_dir_driven" test case
  # with is_config_driven=False.
  _test_runner(self, "input_dir_driven", is_config_driven=False)

def test_generate_svg_hierarchy(self):
  # Runs the shared _test_runner helper on the "generate_svg_hierarchy" test
  # case with is_config_driven=False.
  _test_runner(self, "generate_svg_hierarchy", is_config_driven=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
entity,variable,date,value,provenance
country/IND,var1,2020,0.16,c/p/1
country/IND,var2,2020,53,c/p/1
country/CHN,var1,2020,0.23,c/p/1
country/CHN,var2,2020,67,c/p/1
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
subject_id,predicate,object_id,object_value
some_var1,typeOf,StatisticalVariable,
some_var1,measuredProperty,value,
some_var1,name,,Some Variable 1 Name
some_var1,description,,Some Variable 1 Description
some_var1,populationType,Person,
some_var1,gender,Female,
some_var2,typeOf,StatisticalVariable,
some_var2,measuredProperty,value,
some_var2,name,,Some Variable 2 Name
some_var2,description,,Some Variable 2 Description
some_var2,populationType,Person,
some_var2,gender,Male,
c/s/default,typeOf,Source,
c/s/default,name,,Custom Data Commons
c/s/1,typeOf,Source,
c/s/1,name,,Source1
c/s/1,url,,http://source1.com
c/s/1,domain,,source1.com
c/p/default,typeOf,Provenance,
c/p/default,name,,Custom Import
c/p/default,source,c/s/default,
c/p/default,url,,custom-import
c/p/1,typeOf,Provenance,
c/p/1,name,,Provenance1
c/p/1,source,c/s/1,
c/p/1,url,,http://source1.com/provenance1
c/g/Person,typeOf,StatVarGroup,
c/g/Person,name,,Person
c/g/Person,specializationOf,dc/g/Root,
c/g/Person_Gender,typeOf,StatVarGroup,
c/g/Person_Gender,name,,Person With Gender
c/g/Person_Gender,specializationOf,c/g/Person,
c/g/Person_Gender-Female,typeOf,StatVarGroup,
c/g/Person_Gender-Female,name,,Person With Gender = Female
c/g/Person_Gender-Female,specializationOf,c/g/Person_Gender,
some_var1,memberOf,c/g/Person_Gender-Female,
c/g/Person_Gender-Male,typeOf,StatVarGroup,
c/g/Person_Gender-Male,name,,Person With Gender = Male
c/g/Person_Gender-Male,specializationOf,c/g/Person_Gender,
some_var2,memberOf,c/g/Person_Gender-Male,
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"inputFiles": {
"*.csv": {
"importType": "observations",
"format": "variablePerRow",
"provenance": "Provenance1"
}
},
"sources": {
"Source1": {
"url": "http://source1.com",
"provenances": {
"Provenance1": "http://source1.com/provenance1"
}
}
},
"groupStatVarsByProperty": true
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
entity,variable,date,value
country/IND,var1,2020,0.16
country/IND,var2,2020,53
country/CHN,var1,2020,0.23
country/CHN,var2,2020,67
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Node: v1
dcid:"some_var1"
typeOf: dcs:StatisticalVariable
measuredProperty: dcs:value
name: "Some Variable 1 Name"
description: "Some Variable 1 Description"
populationType: schema:Person
gender: dcs:Female

Node: dcid:some_var2
typeOf: dcs:StatisticalVariable
measuredProperty: dcs:value
name: "Some Variable 2 Name"
description: "Some Variable 2 Description"
populationType: schema:Person
gender: dcs:Male

0 comments on commit d38b9a2

Please sign in to comment.