diff --git a/run_test.sh b/run_test.sh index 8580b920..3ae6812a 100755 --- a/run_test.sh +++ b/run_test.sh @@ -21,7 +21,7 @@ function run_lint_fix { echo -e "#### Fixing Python code" python3 -m venv .env source .env/bin/activate - pip3 install yapf==0.33.0 -q + pip3 install yapf==0.40.2 -q if ! command -v isort &> /dev/null then pip3 install isort -q @@ -35,12 +35,12 @@ function run_lint_fix { function run_lint_test { python3 -m venv .env source .env/bin/activate - pip3 install yapf==0.33.0 -q + pip3 install yapf==0.40.2 -q if ! command -v isort &> /dev/null then pip3 install isort -q fi - + echo -e "#### Checking Python style" if ! yapf --recursive --diff --style='{based_on_style: google, indent_width: 2}' -p simple/ -e=*pb2.py -e=.env/*; then echo "Fix Python lint errors by running ./run_test.sh -f" @@ -74,9 +74,9 @@ function py_test { python3 -m venv .env source .env/bin/activate - + cd simple - pip3 install -r requirements.txt + pip3 install -r requirements.txt -q echo -e "#### Running stats tests" python3 -m pytest tests/stats/ -s diff --git a/simple/run_stats.sh b/simple/run_stats.sh index 161a5827..d48d0b94 100755 --- a/simple/run_stats.sh +++ b/simple/run_stats.sh @@ -9,7 +9,7 @@ Options: -c Json config file for stats importer -i Input directory to process -o Output folder for stats importer. Default: $OUTPUT_DIR - -m Mode of operation for simple importer. Default: $MODE + -m Mode of operation for simple importer. Default: $MODE -k DataCommons API Key -j DC Import java jar file. Download latest from https://github.com/datacommonsorg/import/releases/ diff --git a/simple/stats/db.py b/simple/stats/db.py index d231aafb..fc40b4e0 100644 --- a/simple/stats/db.py +++ b/simple/stats/db.py @@ -118,11 +118,14 @@ _SELECT_ENTITY_NAMES = "select subject_id, object_value from triples where subject_id in (%s) and predicate = 'name' and object_value <> ''" -_INIT_STATEMENTS = [ +_INIT_TABLE_STATEMENTS = [ _CREATE_TRIPLES_TABLE, _CREATE_OBSERVATIONS_TABLE, _CREATE_KEY_VALUE_STORE_TABLE, _CREATE_IMPORTS_TABLE, +] + +_CLEAR_TABLE_FOR_IMPORT_STATEMENTS = [ # Clearing tables for now (not the import tables though since we want to maintain its history). _DELETE_TRIPLES_STATEMENT, _DELETE_OBSERVATIONS_STATEMENT, @@ -195,6 +198,9 @@ class Db: The "DB" could be a traditional sql db or a file system with the output being files. """ + def maybe_clear_before_import(self): + pass + def insert_triples(self, triples: list[Triple]): pass @@ -285,8 +291,13 @@ class SqlDb(Db): def __init__(self, config: dict) -> None: self.engine = create_db_engine(config) + self.engine.init_or_update_tables() self.num_observations = 0 self.variables: set[str] = set() + self.indexes_cleared = False + + def maybe_clear_before_import(self): + self.engine.clear_tables_and_indexes() def insert_triples(self, triples: list[Triple]): logging.info("Writing %s triples to [%s]", len(triples), self.engine) @@ -345,6 +356,12 @@ def from_triple_tuple(tuple: tuple) -> Triple: class DbEngine: + def init_or_update_tables(self): + pass + + def clear_tables_and_indexes(self): + pass + def execute(self, sql: str, parameters=None): pass @@ -379,14 +396,8 @@ def __init__(self, db_params: dict) -> None: logging.info("Connected to SQLite: %s", self.local_db_file_path) self.cursor = self.connection.cursor() - # Drop indexes first so inserts are faster. - self._drop_indexes() - for statement in _INIT_STATEMENTS: - self.cursor.execute(statement) - # Apply schema updates. - self._schema_updates() - def _schema_updates(self) -> None: + def _maybe_update_schema(self) -> None: """ Add any sqlite schema updates here. Ensure that all schema updates always check if the update is necessary before applying it. @@ -415,6 +426,15 @@ def _create_indexes(self) -> None: def __str__(self) -> str: return f"{TYPE_SQLITE}: {self.db_file_path}" + def init_or_update_tables(self): + for statement in _INIT_TABLE_STATEMENTS: + self.cursor.execute(statement) + self._maybe_update_schema() + + def clear_tables_and_indexes(self): + for statement in _CLEAR_TABLE_FOR_IMPORT_STATEMENTS: + self.cursor.execute(statement) + def execute(self, sql: str, parameters=None): if not parameters: self.cursor.execute(sql) @@ -461,8 +481,8 @@ def commit_and_close(self): _CLOUD_MY_SQL_PARAMS = [CLOUD_MY_SQL_INSTANCE] + _CLOUD_MY_SQL_DB_CONNECT_PARAMS _CLOUD_MYSQL_PROPERTIES_COLUMN_EXISTS_STATEMENT = """ - SELECT 1 - FROM INFORMATION_SCHEMA.COLUMNS + SELECT 1 + FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'observations' AND COLUMN_NAME = 'properties'; """ @@ -486,14 +506,8 @@ def __init__(self, db_params: dict[str, str]) -> None: db_params[CLOUD_MY_SQL_INSTANCE], db_params[CLOUD_MY_SQL_DB]) self.description = f"{TYPE_CLOUD_SQL}: {db_params[CLOUD_MY_SQL_INSTANCE]} ({db_params[CLOUD_MY_SQL_DB]})" self.cursor: Cursor = self.connection.cursor() - # Drop indexes first so inserts are faster. - self._drop_indexes() - for statement in _INIT_STATEMENTS: - self.cursor.execute(statement) - # Apply schema updates. - self._schema_updates() - def _schema_updates(self) -> None: + def _maybe_update_schema(self) -> None: """ Add any cloud sql schema updates here. Ensure that all schema updates always check if the update is necessary before applying it. @@ -555,6 +569,16 @@ def _db_exists(cursor) -> bool: def __str__(self) -> str: return self.description + def init_or_update_tables(self): + for statement in _INIT_TABLE_STATEMENTS: + self.cursor.execute(statement) + self._maybe_update_schema() + + def clear_tables_and_indexes(self): + for statement in _CLEAR_TABLE_FOR_IMPORT_STATEMENTS: + self.cursor.execute(statement) + self._drop_indexes() + def execute(self, sql: str, parameters=None): self.cursor.execute(_pymysql(sql), parameters) @@ -599,7 +623,10 @@ def create_db_engine(config: dict) -> DbEngine: assert False -def create_db(config: dict) -> Db: +def create_and_update_db(config: dict) -> Db: + """ Creates and initializes a Db, performing any setup and updates + (e.g. table creation, table schema changes) that are needed. + """ db_type = config[FIELD_DB_TYPE] if db_type and db_type == TYPE_MAIN_DC: return MainDcDb(config) diff --git a/simple/stats/runner.py b/simple/stats/runner.py index be042c75..fa2384b1 100644 --- a/simple/stats/runner.py +++ b/simple/stats/runner.py @@ -26,7 +26,7 @@ from stats.data import ParentSVG2ChildSpecializedNames from stats.data import Triple from stats.data import VerticalSpec -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_main_dc_config from stats.db import create_sqlite_config from stats.db import get_cloud_sql_config_from_env @@ -48,6 +48,7 @@ class RunMode(StrEnum): CUSTOM_DC = "customdc" + SCHEMA_UPDATE = "schemaupdate" MAIN_DC = "maindc" @@ -113,49 +114,22 @@ def __init__(self, self.reporter = ImportReporter(report_fh=self.process_dir_fh.make_file( constants.REPORT_JSON_FILE_NAME)) - # DB setup. - def _get_db_config() -> dict: - if self.mode == RunMode.MAIN_DC: - logging.info("Using Main DC config.") - return create_main_dc_config(self.output_dir_fh.path) - # Attempt to get from env (cloud sql, then sqlite), - # then config file, then default. - db_cfg = get_cloud_sql_config_from_env() - if db_cfg: - logging.info("Using Cloud SQL settings from env.") - return db_cfg - db_cfg = get_sqlite_config_from_env() - if db_cfg: - logging.info("Using SQLite settings from env.") - return db_cfg - logging.info("Using default DB settings.") - return create_sqlite_config( - self.output_dir_fh.make_file(constants.DB_FILE_NAME).path) - - self.db = create_db(_get_db_config()) self.nodes = Nodes(self.config) + self.db = None def run(self): try: - # Run all data imports. - self._run_imports() + if (self.db is None): + self.db = create_and_update_db(self._get_db_config()) - # Generate triples. - triples = self.nodes.triples() - # Write triples to DB. - self.db.insert_triples(triples) + if self.mode == RunMode.SCHEMA_UPDATE: + logging.info("Skipping imports because run mode is schema update.") - # Generate SVG hierarchy. - self._generate_svg_hierarchy() + elif self.mode == RunMode.CUSTOM_DC or self.mode == RunMode.MAIN_DC: + self._run_imports_and_do_post_import_work() - # Generate SVG cache. - self._generate_svg_cache() - - # Generate NL sentences for creating embeddings. - self._generate_nl_sentences() - - # Write import info to DB. - self.db.insert_import_info(status=ImportStatus.SUCCESS) + else: + raise ValueError(f"Unsupported mode: {self.mode}") # Commit and close DB. self.db.commit_and_close() @@ -163,9 +137,52 @@ def run(self): # Report done. self.reporter.report_done() except Exception as e: - logging.exception("Error running import") + logging.exception("Error updating stats") self.reporter.report_failure(error=str(e)) + def _get_db_config(self) -> dict: + if self.mode == RunMode.MAIN_DC: + logging.info("Using Main DC config.") + return create_main_dc_config(self.output_dir_fh.path) + # Attempt to get from env (cloud sql, then sqlite), + # then config file, then default. + db_cfg = get_cloud_sql_config_from_env() + if db_cfg: + logging.info("Using Cloud SQL settings from env.") + return db_cfg + db_cfg = get_sqlite_config_from_env() + if db_cfg: + logging.info("Using SQLite settings from env.") + return db_cfg + logging.info("Using default DB settings.") + return create_sqlite_config( + self.output_dir_fh.make_file(constants.DB_FILE_NAME).path) + + def _run_imports_and_do_post_import_work(self): + # (SQL only) Drop data in existing tables (except import metadata). + # Also drop indexes for faster writes. + self.db.maybe_clear_before_import() + + # Import data from all input files. + self._run_all_data_imports() + + # Generate triples. + triples = self.nodes.triples() + # Write triples to DB. + self.db.insert_triples(triples) + + # Generate SVG hierarchy. + self._generate_svg_hierarchy() + + # Generate SVG cache. + self._generate_svg_cache() + + # Generate NL sentences for creating embeddings. + self._generate_nl_sentences() + + # Write import info to DB. + self.db.insert_import_info(status=ImportStatus.SUCCESS) + def _generate_nl_sentences(self): triples: list[Triple] = [] # Get topic triples if generating topics else get SV triples. @@ -247,7 +264,7 @@ def _maybe_set_special_fh(self, fh: FileHandler) -> bool: return True return False - def _run_imports(self): + def _run_all_data_imports(self): input_fhs: list[FileHandler] = [] input_mcf_fhs: list[FileHandler] = [] for input_handler in self.input_handlers: diff --git a/simple/tests/stats/db_test.py b/simple/tests/stats/db_test.py index b8546f8c..b44e2ceb 100644 --- a/simple/tests/stats/db_test.py +++ b/simple/tests/stats/db_test.py @@ -23,7 +23,7 @@ from stats.data import Observation from stats.data import ObservationProperties from stats.data import Triple -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_main_dc_config from stats.db import create_sqlite_config from stats.db import get_cloud_sql_config_from_env @@ -31,9 +31,11 @@ from stats.db import ImportStatus from tests.stats.test_util import compare_files from tests.stats.test_util import is_write_mode +from tests.stats.test_util import read_full_db_from_file _TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_data", "db") +_INPUT_DIR = os.path.join(_TEST_DATA_DIR, "input") _EXPECTED_DIR = os.path.join(_TEST_DATA_DIR, "expected") _TRIPLES = [ @@ -58,16 +60,66 @@ })) ] +_OLD_OBSERVATION_TUPLES_AFTER_UPDATE = [ + ("e1", "v1", "2023", "123", "p1", None, None, None, None, None), + ("e2", "v1", "2023", "456", "p1", None, None, None, None, None), + ("e3", "v1", "2023", "789", "p1", None, None, None, None, None) +] + +# The import previously recorded in sqlite_old_schema_populated.sql +_OLD_IMPORT_TUPLE = ("2022-02-02 00:00:00", "SUCCESS", + '{"numVars": 1, "numObs": 3}') + +# The import previously recorded in sqlite_current_schema_populated.sql +_DIFFERENT_IMPORT_TUPLE = ("2021-03-03 00:00:00", "SUCCESS", + '{"numVars": 2, "numObs": 2}') + +# The import performed during tests in this file. +_CURRENT_IMPORT_TUPLE = ("2023-01-01 00:00:00", "SUCCESS", + '{"numVars": 1, "numObs": 3}') + _KEY_VALUE = ("k1", "v1") class TestDb(unittest.TestCase): + def _seed_db_from_input(self, db_file_path: str, input_db_file_name: str): + input_db_file = os.path.join(_INPUT_DIR, input_db_file_name) + read_full_db_from_file(db_file_path, input_db_file) + + def _verify_db_contents(self, db_file_path: str, triples: list[tuple], + observations: list[tuple], key_values: list[tuple], + imports: list[tuple], indexes: list[tuple]): + db = sqlite3.connect(db_file_path) + + actual_triples = db.execute("select * from triples").fetchall() + self.assertListEqual(actual_triples, triples) + + actual_observations = db.execute("select * from observations").fetchall() + self.assertListEqual(actual_observations, observations) + + actual_key_values = db.execute("select * from key_value_store").fetchall() + self.assertListEqual(actual_key_values, key_values) + + actual_imports = db.execute("select * from imports").fetchall() + self.assertListEqual(actual_imports, imports) + + actual_indexes = db.execute( + "select name, tbl_name from sqlite_master where type = 'index'" + ).fetchall() + self.assertListEqual(actual_indexes, indexes) + + db.close() + @freeze_time("2023-01-01") def test_sql_db(self): + """ Tests database creation and insertion of triples, observations, and + import info in SQL mode. + Compares resulting DB contents with expected values. + """ with tempfile.TemporaryDirectory() as temp_dir: db_file_path = os.path.join(temp_dir, "datacommons.db") - db = create_db(create_sqlite_config(db_file_path)) + db = create_and_update_db(create_sqlite_config(db_file_path)) db.insert_triples(_TRIPLES) db.insert_observations(_OBSERVATIONS, "foo.csv") db.insert_key_value(_KEY_VALUE[0], _KEY_VALUE[1]) @@ -84,33 +136,102 @@ def test_sql_db(self): db.commit_and_close() - sqldb = sqlite3.connect(db_file_path) + self._verify_db_contents( + db_file_path, + triples=list(map(lambda x: x.db_tuple(), _TRIPLES)), + observations=list(map(lambda x: x.db_tuple(), _OBSERVATIONS)), + key_values=[_KEY_VALUE], + imports=[_CURRENT_IMPORT_TUPLE], + indexes=[('observations_entity_variable', 'observations'), + ('triples_subject_id', 'triples')]) - triples = sqldb.execute("select * from triples").fetchall() - self.assertListEqual(triples, list(map(lambda x: x.db_tuple(), _TRIPLES))) + @freeze_time("2023-01-01") + def test_sql_db_schema_update(self): + """ Tests that db.create() updates the schema of an existing SQL database + without modifying existing data. + """ + with tempfile.TemporaryDirectory() as temp_dir: + db_file_path = os.path.join(temp_dir, "datacommons.db") + self._seed_db_from_input(db_file_path, "sqlite_old_schema_populated.sql") - observations = sqldb.execute("select * from observations").fetchall() - self.assertListEqual(observations, - list(map(lambda x: x.db_tuple(), _OBSERVATIONS))) + db = create_and_update_db(create_sqlite_config(db_file_path)) + db.commit_and_close() + + self._verify_db_contents( + db_file_path, + triples=list(map(lambda x: x.db_tuple(), _TRIPLES)), + observations=_OLD_OBSERVATION_TUPLES_AFTER_UPDATE, + key_values=[_KEY_VALUE], + imports=[_OLD_IMPORT_TUPLE], + indexes=[('observations_entity_variable', 'observations'), + ('triples_subject_id', 'triples')]) + + @freeze_time("2023-01-01") + def test_sql_db_reimport_with_schema_update(self): + """ Tests that reimporting data to a SQL database succeeds when the existing + database has an old schema. + """ + with tempfile.TemporaryDirectory() as temp_dir: + db_file_path = os.path.join(temp_dir, "datacommons.db") + self._seed_db_from_input(db_file_path, "sqlite_old_schema_populated.sql") - key_value_tuple = sqldb.execute( - "select * from key_value_store").fetchone() - self.assertTupleEqual(key_value_tuple, (_KEY_VALUE)) + db = create_and_update_db(create_sqlite_config(db_file_path)) - import_tuple = sqldb.execute("select * from imports").fetchone() - self.assertTupleEqual( - import_tuple, - ("2023-01-01 00:00:00", "SUCCESS", '{"numVars": 1, "numObs": 3}')) + db.maybe_clear_before_import() + + db.insert_triples(_TRIPLES) + db.insert_observations(_OBSERVATIONS, "foo.csv") + db.insert_key_value(_KEY_VALUE[0], _KEY_VALUE[1]) + db.insert_import_info(status=ImportStatus.SUCCESS) + + db.commit_and_close() + + self._verify_db_contents( + db_file_path, + triples=list(map(lambda x: x.db_tuple(), _TRIPLES)), + observations=list(map(lambda x: x.db_tuple(), _OBSERVATIONS)), + key_values=[_KEY_VALUE], + imports=[_OLD_IMPORT_TUPLE, _CURRENT_IMPORT_TUPLE], + indexes=[('observations_entity_variable', 'observations'), + ('triples_subject_id', 'triples')]) + + @freeze_time("2023-01-01") + def test_sql_db_reimport_without_schema_update(self): + """ Tests that importing new data to a SQL database replaces the contents of + all tables except the imports table. + """ + with tempfile.TemporaryDirectory() as temp_dir: + db_file_path = os.path.join(temp_dir, "datacommons.db") + self._seed_db_from_input(db_file_path, + "sqlite_current_schema_populated.sql") + + db = create_and_update_db(create_sqlite_config(db_file_path)) + + db.maybe_clear_before_import() + + db.insert_triples(_TRIPLES) + db.insert_observations(_OBSERVATIONS, "foo.csv") + db.insert_key_value(_KEY_VALUE[0], _KEY_VALUE[1]) + db.insert_import_info(status=ImportStatus.SUCCESS) + + db.commit_and_close() - index_tuples = sqldb.execute( - "select name, tbl_name from sqlite_master where type = 'index'" - ).fetchall() - self.assertListEqual(index_tuples, - [('observations_entity_variable', 'observations'), - ('triples_subject_id', 'triples')]) + self._verify_db_contents( + db_file_path, + triples=list(map(lambda x: x.db_tuple(), _TRIPLES)), + observations=list(map(lambda x: x.db_tuple(), _OBSERVATIONS)), + key_values=[_KEY_VALUE], + imports=[_DIFFERENT_IMPORT_TUPLE, _CURRENT_IMPORT_TUPLE], + indexes=[('observations_entity_variable', 'observations'), + ('triples_subject_id', 'triples')]) @freeze_time("2023-01-01") def test_main_dc_db(self): + """ Tests database creation and insertion of triples, observations, and + import info in main DC mode. + Compares output observation CSV, observation TMCF, and schema MCF with goldens. + In write mode, replaces the goldens instead. + """ with tempfile.TemporaryDirectory() as temp_dir: observations_file = os.path.join(temp_dir, "observations.csv") expected_observations_file = os.path.join(_EXPECTED_DIR, @@ -120,7 +241,7 @@ def test_main_dc_db(self): mcf_file = os.path.join(temp_dir, "schema.mcf") expected_mcf_file = os.path.join(_EXPECTED_DIR, "schema.mcf") - db = create_db(create_main_dc_config(temp_dir)) + db = create_and_update_db(create_main_dc_config(temp_dir)) db.insert_triples(_TRIPLES) db.insert_observations(_OBSERVATIONS, "observations.csv") db.insert_import_info(status=ImportStatus.SUCCESS) diff --git a/simple/tests/stats/entities_importer_test.py b/simple/tests/stats/entities_importer_test.py index 024e6bf6..454a9b42 100644 --- a/simple/tests/stats/entities_importer_test.py +++ b/simple/tests/stats/entities_importer_test.py @@ -22,7 +22,7 @@ import pandas as pd from stats.config import Config from stats.data import Triple -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config from stats.entities_importer import EntitiesImporter from stats.nodes import Nodes @@ -61,7 +61,7 @@ def _test_import(test: unittest.TestCase, test_name: str): config = Config(data=json.loads(input_config_fh.read_string())) nodes = Nodes(config) - db = create_db(create_sqlite_config(db_path)) + db = create_and_update_db(create_sqlite_config(db_path)) report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json")) reporter = FileImportReporter(input_path, ImportReporter(report_fh)) diff --git a/simple/tests/stats/events_importer_test.py b/simple/tests/stats/events_importer_test.py index 4c0f7e68..2451df3e 100644 --- a/simple/tests/stats/events_importer_test.py +++ b/simple/tests/stats/events_importer_test.py @@ -23,7 +23,7 @@ from stats.config import Config from stats.data import Observation from stats.data import Triple -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config from stats.events_importer import EventsImporter from stats.nodes import Nodes @@ -64,7 +64,7 @@ def _test_import(test: unittest.TestCase, test_name: str): config = Config(data=json.loads(input_config_fh.read_string())) nodes = Nodes(config) - db = create_db(create_sqlite_config(db_path)) + db = create_and_update_db(create_sqlite_config(db_path)) debug_resolve_fh = LocalFileHandler(os.path.join(temp_dir, "debug.csv")) report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json")) reporter = FileImportReporter(input_path, ImportReporter(report_fh)) diff --git a/simple/tests/stats/mcf_importer_test.py b/simple/tests/stats/mcf_importer_test.py index 4b9ba26b..309f1fd8 100644 --- a/simple/tests/stats/mcf_importer_test.py +++ b/simple/tests/stats/mcf_importer_test.py @@ -22,7 +22,7 @@ import pandas as pd from stats.config import Config from stats.data import Triple -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config from stats.mcf_importer import McfImporter from stats.nodes import Nodes @@ -59,7 +59,7 @@ def _test_import(test: unittest.TestCase, input_fh = LocalFileHandler(input_mcf_path) output_fh = LocalFileHandler(output_mcf_path) - db = create_db(create_sqlite_config(db_path)) + db = create_and_update_db(create_sqlite_config(db_path)) report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json")) reporter = FileImportReporter(input_mcf_path, ImportReporter(report_fh)) diff --git a/simple/tests/stats/observations_importer_test.py b/simple/tests/stats/observations_importer_test.py index ff9d94a1..c40207c6 100644 --- a/simple/tests/stats/observations_importer_test.py +++ b/simple/tests/stats/observations_importer_test.py @@ -21,7 +21,7 @@ import pandas as pd from stats.config import Config -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config from stats.nodes import Nodes from stats.observations_importer import ObservationsImporter @@ -64,7 +64,7 @@ def _test_import(test: unittest.TestCase, test_name: str): with open(config_path) as config_file: config = Config(json.load(config_file)) - db = create_db(create_sqlite_config(db_path)) + db = create_and_update_db(create_sqlite_config(db_path)) debug_resolve_fh = LocalFileHandler(os.path.join(temp_dir, "debug.csv")) report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json")) reporter = FileImportReporter(input_path, ImportReporter(report_fh)) diff --git a/simple/tests/stats/runner_test.py b/simple/tests/stats/runner_test.py index 07ff064e..482b7a81 100644 --- a/simple/tests/stats/runner_test.py +++ b/simple/tests/stats/runner_test.py @@ -25,9 +25,11 @@ from stats import constants from stats.data import Observation from stats.data import Triple +from stats.runner import RunMode from stats.runner import Runner from tests.stats.test_util import compare_files from tests.stats.test_util import is_write_mode +from tests.stats.test_util import read_full_db_from_file from tests.stats.test_util import use_fake_gzip_time from tests.stats.test_util import write_key_values from tests.stats.test_util import write_observations @@ -44,7 +46,9 @@ def _test_runner(test: unittest.TestCase, test_name: str, - is_config_driven: bool = True): + is_config_driven: bool = True, + run_mode: RunMode = RunMode.CUSTOM_DC, + input_db_file_name: str = None): test.maxDiff = None with tempfile.TemporaryDirectory() as temp_dir: @@ -59,6 +63,9 @@ def _test_runner(test: unittest.TestCase, "remote_entity_types.json") db_path = os.path.join(temp_dir, "datacommons.db") + if (input_db_file_name): + input_db_file = os.path.join(input_dir, input_db_file_name) + read_full_db_from_file(db_path, input_db_file) expected_dir = os.path.join(_EXPECTED_DIR, test_name) expected_nl_dir = os.path.join(expected_dir, constants.NL_DIR_NAME) @@ -85,8 +92,10 @@ def _test_runner(test: unittest.TestCase, dc_client.get_property_of_entities = MagicMock( return_value=json.load(f)) - Runner(config_file=config_path, input_dir=input_dir, - output_dir=temp_dir).run() + Runner(config_file=config_path, + input_dir=input_dir, + output_dir=temp_dir, + mode=run_mode).run() write_triples(db_path, output_triples_path) write_observations(db_path, output_observations_path) @@ -125,6 +134,12 @@ def test_config_with_wildcards(self): def test_input_dir_driven(self): _test_runner(self, "input_dir_driven", is_config_driven=False) + def test_input_dir_driven_with_existing_old_schema_data(self): + _test_runner(self, + "input_dir_driven_with_existing_old_schema_data", + is_config_driven=False, + input_db_file_name="sqlite_old_schema_populated.sql") + def test_generate_svg_hierarchy(self): _test_runner(self, "generate_svg_hierarchy", is_config_driven=False) @@ -136,3 +151,10 @@ def test_topic_nl_sentences(self): def test_remote_entity_types(self): _test_runner(self, "remote_entity_types", is_config_driven=False) + + def test_schema_update_only(self): + _test_runner(self, + "schema_update_only", + is_config_driven=False, + run_mode=RunMode.SCHEMA_UPDATE, + input_db_file_name="sqlite_old_schema_populated.sql") diff --git a/simple/tests/stats/schema_test.py b/simple/tests/stats/schema_test.py index cca92b3b..45d3060f 100644 --- a/simple/tests/stats/schema_test.py +++ b/simple/tests/stats/schema_test.py @@ -21,7 +21,7 @@ from stats import schema from stats import schema_constants as sc from stats.data import Triple -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config @@ -92,7 +92,7 @@ def test_get_schema_names(self, desc: str, db_names: dict[str, str], output_names: dict[str, str], mock_dc_client): with tempfile.TemporaryDirectory() as temp_dir: db_file_path = os.path.join(temp_dir, "datacommons.db") - db = create_db(create_sqlite_config(db_file_path)) + db = create_and_update_db(create_sqlite_config(db_file_path)) db.insert_triples(_to_triples(db_names)) mock_dc_client.return_value = remote_names diff --git a/simple/tests/stats/test_data/db/input/sqlite_current_schema_populated.sql b/simple/tests/stats/test_data/db/input/sqlite_current_schema_populated.sql new file mode 100644 index 00000000..23124800 --- /dev/null +++ b/simple/tests/stats/test_data/db/input/sqlite_current_schema_populated.sql @@ -0,0 +1,40 @@ +BEGIN TRANSACTION; +CREATE TABLE imports ( + imported_at datetime, + status varchar(16), + metadata text +); +INSERT INTO "imports" VALUES('2021-03-03 00:00:00','SUCCESS','{"numVars": 2, "numObs": 2}'); +CREATE TABLE key_value_store ( + lookup_key varchar(255), + value longtext +); +INSERT INTO "key_value_store" VALUES('k0','v0'); +CREATE TABLE observations ( + entity varchar(255), + variable varchar(255), + date varchar(255), + value varchar(255), + provenance varchar(255), + unit varchar(255), + scaling_factor varchar(255), + measurement_method varchar(255), + observation_period varchar(255), + properties TEXT +); +INSERT INTO "observations" VALUES('e4','v4','2022','654','p4','','','','',''); +INSERT INTO "observations" VALUES('e3','v1','2023','789','p1','USD','','','','{"prop1": "val1"}'); +CREATE TABLE triples ( + subject_id varchar(255), + predicate varchar(255), + object_id varchar(255), + object_value TEXT +); +INSERT INTO "triples" VALUES('sub3','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub3','pred3','','objval3'); +INSERT INTO "triples" VALUES('sub3','name','','name3'); +INSERT INTO "triples" VALUES('sub2','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub2','name','','name2'); +CREATE INDEX observations_entity_variable on observations (entity, variable); +CREATE INDEX triples_subject_id on triples (subject_id); +COMMIT; diff --git a/simple/tests/stats/test_data/db/input/sqlite_old_schema_populated.sql b/simple/tests/stats/test_data/db/input/sqlite_old_schema_populated.sql new file mode 100644 index 00000000..429ccd52 --- /dev/null +++ b/simple/tests/stats/test_data/db/input/sqlite_old_schema_populated.sql @@ -0,0 +1,36 @@ +BEGIN TRANSACTION; +CREATE TABLE imports ( + imported_at datetime, + status varchar(16), + metadata text +); +INSERT INTO "imports" VALUES('2022-02-02 00:00:00','SUCCESS','{"numVars": 1, "numObs": 3}'); +CREATE TABLE key_value_store ( + lookup_key varchar(255), + value longtext +); +INSERT INTO "key_value_store" VALUES('k1','v1'); +CREATE TABLE observations ( + entity varchar(255), + variable varchar(255), + date varchar(255), + value varchar(255), + provenance varchar(255) +); +INSERT INTO "observations" VALUES('e1','v1','2023','123','p1'); +INSERT INTO "observations" VALUES('e2','v1','2023','456','p1'); +INSERT INTO "observations" VALUES('e3','v1','2023','789','p1'); +CREATE TABLE triples ( + subject_id varchar(255), + predicate varchar(255), + object_id varchar(255), + object_value TEXT +); +INSERT INTO "triples" VALUES('sub1','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub1','pred1','','objval1'); +INSERT INTO "triples" VALUES('sub1','name','','name1'); +INSERT INTO "triples" VALUES('sub2','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub2','name','','name2'); +CREATE INDEX observations_entity_variable on observations (entity, variable); +CREATE INDEX triples_subject_id on triples (subject_id); +COMMIT; diff --git a/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/key_value_store.db.csv b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/key_value_store.db.csv new file mode 100644 index 00000000..23bb062c --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/key_value_store.db.csv @@ -0,0 +1,2 @@ +lookup_key,value +StatVarGroups,H4sIAAAAAAAC/+OS5OJMSdZP1w/Kzy8R4pHi4uKA8bjcEGwhKy4B59LikvxchbDEoszEpJzUYiEhLpayxCJDKTCpBCahYkZgMSOwmBEAlEss8mMAAAA= diff --git a/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/nl/sentences.csv b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/nl/sentences.csv new file mode 100644 index 00000000..fbf51f1d --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/nl/sentences.csv @@ -0,0 +1,5 @@ +dcid,sentence +some_var1,Some Variable 1 Name +some_var2,Some Variable 2 Name +var1,var1 +var2,var2 diff --git a/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/observations.db.csv b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/observations.db.csv new file mode 100644 index 00000000..bea81291 --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/observations.db.csv @@ -0,0 +1,31 @@ +entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties +country/AFG,var1,2023,0.19,c/p/1,,,,, +country/YEM,var1,2023,0.21,c/p/1,,,,, +country/AGO,var1,2023,0.29,c/p/1,,,,, +country/ZMB,var1,2023,0.31,c/p/1,,,,, +country/ZWE,var1,2023,0.37,c/p/1,,,,, +country/ALB,var1,2023,0.5,c/p/1,,,,, +wikidataId/Q22062741,var1,2023,0.5,c/p/1,,,,, +country/DZA,var1,2023,0.52,c/p/1,,,,, +country/AND,var1,2023,0.76,c/p/1,,,,, +country/AFG,var2,2023,6,c/p/1,,,,, +country/YEM,var2,2023,56,c/p/1,,,,, +country/AGO,var2,2023,6,c/p/1,,,,, +country/ZMB,var2,2023,34,c/p/1,,,,, +country/ZWE,var2,2023,76,c/p/1,,,,, +country/ALB,var2,2023,34,c/p/1,,,,, +wikidataId/Q22062741,var2,2023,97,c/p/1,,,,, +country/DZA,var2,2023,92,c/p/1,,,,, +country/AND,var2,2023,9,c/p/1,,,,, +country/ASM,var2,2023,34,c/p/1,,,,, +country/AIA,var2,2023,42,c/p/1,,,,, +country/WLF,var2,2023,75,c/p/1,,,,, +country/ESH,var2,2023,65,c/p/1,,,,, +country/IND,var1,2020,0.16,c/p/1,,,,, +country/IND,var2,2020,53,c/p/1,,,,, +country/CHN,var1,2020,0.23,c/p/1,,,,, +country/CHN,var2,2020,67,c/p/1,,,,, +country/USA,var1,2021,555,c/p/1,,,,, +country/IND,var1,2022,321,c/p/1,,,,, +country/USA,var2,2021,666,c/p/1,,,,, +country/IND,var2,2022,123,c/p/1,,,,, diff --git a/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/triples.db.csv b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/triples.db.csv new file mode 100644 index 00000000..88c72cd8 --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/input_dir_driven_with_existing_old_schema_data/triples.db.csv @@ -0,0 +1,109 @@ +subject_id,predicate,object_id,object_value +article1,typeOf,Article, +article1,includedIn,c/p/1, +article1,article_id,,article1 +article1,article_title,,Article 1 +article1,article_author,author1, +article2,typeOf,Article, +article2,includedIn,c/p/1, +article2,article_id,,article2 +article2,article_title,,Article 2 +article2,article_author,author1, +article2,article_author,author2, +article3,typeOf,Article, +article3,includedIn,c/p/1, +article3,article_id,,article3 +article3,article_title,,Article 3 +article3,article_author,author2, +article3,article_author,author3, +author1,typeOf,Author, +author1,includedIn,c/p/1, +author1,author_id,,author1 +author1,author_name,,Jane Doe +author1,author_country,country/USA, +author2,typeOf,Author, +author2,includedIn,c/p/1, +author2,author_id,,author2 +author2,author_name,,Joe Smith +author2,author_country,country/CAN, +author3,typeOf,Author, +author3,includedIn,c/p/1, +author3,author_id,,author3 +author3,author_name,,Jane Smith +author3,author_country,country/USA, +some_var1,typeOf,StatisticalVariable, +some_var1,measuredProperty,value, +some_var1,name,,Some Variable 1 Name +some_var1,description,,Some Variable 1 Description +some_var2,typeOf,StatisticalVariable, +some_var2,measuredProperty,value, +some_var2,name,,Some Variable 2 Name +some_var2,description,,Some Variable 2 Description +c/s/default,typeOf,Source, +c/s/default,name,,Custom Data Commons +c/s/1,typeOf,Source, +c/s/1,name,,Source1 Name +c/s/1,url,,http://source1.com +c/s/1,domain,,source1.com +c/p/default,typeOf,Provenance, +c/p/default,name,,Custom Import +c/p/default,source,c/s/default, +c/p/default,url,,custom-import +c/p/1,typeOf,Provenance, +c/p/1,name,,Provenance1 Name +c/p/1,source,c/s/1, +c/p/1,url,,http://source1.com/provenance1 +c/g/Root,typeOf,StatVarGroup, +c/g/Root,name,,Custom Variables +c/g/Root,specializationOf,dc/g/Root, +var1,typeOf,StatisticalVariable, +var1,name,,var1 +var1,memberOf,c/g/Root, +var1,includedIn,c/p/1, +var1,includedIn,c/s/1, +var1,populationType,Thing, +var1,statType,measuredValue, +var1,measuredProperty,var1, +var2,typeOf,StatisticalVariable, +var2,name,,var2 +var2,memberOf,c/g/Root, +var2,includedIn,c/p/1, +var2,includedIn,c/s/1, +var2,populationType,Thing, +var2,statType,measuredValue, +var2,measuredProperty,var2, +Article,typeOf,Class, +Article,name,,Article +Article,includedIn,c/p/1, +Article,includedIn,c/s/1, +Author,typeOf,Class, +Author,name,,Author +Author,includedIn,c/p/1, +Author,includedIn,c/s/1, +article_id,typeOf,Property, +article_id,name,,article_id +article_title,typeOf,Property, +article_title,name,,article_title +article_author,typeOf,Property, +article_author,name,,article_author +author_id,typeOf,Property, +author_id,name,,author_id +author_name,typeOf,Property, +author_name,name,,author_name +author_country,typeOf,Property, +author_country,name,,author_country +country/AFG,typeOf,Country, +country/YEM,typeOf,Country, +country/AGO,typeOf,Country, +country/ZMB,typeOf,Country, +country/ZWE,typeOf,Country, +country/ALB,typeOf,Country, +wikidataId/Q22062741,typeOf,Country, +country/DZA,typeOf,Country, +country/AND,typeOf,Country, +country/ASM,typeOf,Country, +country/AIA,typeOf,Country, +country/WLF,typeOf,Country, +country/ESH,typeOf,Country, +country/USA,typeOf,Country, +country/IND,typeOf,Country, diff --git a/simple/tests/stats/test_data/runner/expected/schema_update_only/key_value_store.db.csv b/simple/tests/stats/test_data/runner/expected/schema_update_only/key_value_store.db.csv new file mode 100644 index 00000000..5f95a416 --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/schema_update_only/key_value_store.db.csv @@ -0,0 +1,2 @@ +lookup_key,value +k1,v1 diff --git a/simple/tests/stats/test_data/runner/expected/schema_update_only/observations.db.csv b/simple/tests/stats/test_data/runner/expected/schema_update_only/observations.db.csv new file mode 100644 index 00000000..fe007331 --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/schema_update_only/observations.db.csv @@ -0,0 +1,4 @@ +entity,variable,date,value,provenance,unit,scaling_factor,measurement_method,observation_period,properties +e1,v1,2023,123,p1,,,,, +e2,v1,2023,456,p1,,,,, +e3,v1,2023,789,p1,,,,, diff --git a/simple/tests/stats/test_data/runner/expected/schema_update_only/triples.db.csv b/simple/tests/stats/test_data/runner/expected/schema_update_only/triples.db.csv new file mode 100644 index 00000000..546aacbe --- /dev/null +++ b/simple/tests/stats/test_data/runner/expected/schema_update_only/triples.db.csv @@ -0,0 +1,6 @@ +subject_id,predicate,object_id,object_value +sub1,typeOf,StatisticalVariable, +sub1,pred1,,objval1 +sub1,name,,name1 +sub2,typeOf,StatisticalVariable, +sub2,name,,name2 diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/article_entities.csv b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/article_entities.csv new file mode 100644 index 00000000..61cbba4a --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/article_entities.csv @@ -0,0 +1,4 @@ +article_id,article_title,article_author +article1,"Article 1",author1 +article2,"Article 2","author1,author2" +article3,"Article 3","author2,author3" \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/author_entities.csv b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/author_entities.csv new file mode 100644 index 00000000..42f0b477 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/author_entities.csv @@ -0,0 +1,4 @@ +author_id,author_name,author_country +author1,"Jane Doe",country/USA +author2,"Joe Smith",country/CAN +author3,"Jane Smith",country/USA \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/config.json b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/config.json new file mode 100644 index 00000000..83199fee --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/config.json @@ -0,0 +1,44 @@ +{ + "inputFiles": { + "countries.csv": { + "importType": "observations", + "format": "variablePerColumn", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "wikidataids.csv": { + "importType": "observations", + "format": "variablePerColumn", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "variable_per_row.csv": { + "importType": "observations", + "format": "variablePerRow", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "author_entities.csv": { + "importType": "entities", + "rowEntityType": "Author", + "idColumn": "author_id", + "entityColumns": ["author_country"], + "provenance": "Provenance1 Name" + }, + "article_entities.csv": { + "importType": "entities", + "rowEntityType": "Article", + "idColumn": "article_id", + "entityColumns": ["article_author"], + "provenance": "Provenance1 Name" + } + }, + "sources": { + "Source1 Name": { + "url": "http://source1.com", + "provenances": { + "Provenance1 Name": "http://source1.com/provenance1" + } + } + } +} diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/countries.csv b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/countries.csv new file mode 100644 index 00000000..e62275b3 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/countries.csv @@ -0,0 +1,15 @@ +place,year,var1,var2 +Afghanistan,2023,0.19,6 +Yemen,2023,0.21,56 +Angola,2023,0.29,6 +Zambia,2023,0.31,34 +Zimbabwe,2023,0.37,76 +Albania,2023,0.50,34 +dcid: wikidataId/Q22062741,2023,0.50,97 +Algeria,2023,0.52,92 +West Bank and Gaza,2023,0.53,64 +Andorra,2023,0.76,9 +American Samoa,2023,#N/A,34 +Anguilla,2023,#N/A,42 +Wallis and Futuna Islands,2023,#N/A,75 +Western Sahara,2023,#N/A,65 diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/sqlite_old_schema_populated.sql b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/sqlite_old_schema_populated.sql new file mode 100644 index 00000000..429ccd52 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/sqlite_old_schema_populated.sql @@ -0,0 +1,36 @@ +BEGIN TRANSACTION; +CREATE TABLE imports ( + imported_at datetime, + status varchar(16), + metadata text +); +INSERT INTO "imports" VALUES('2022-02-02 00:00:00','SUCCESS','{"numVars": 1, "numObs": 3}'); +CREATE TABLE key_value_store ( + lookup_key varchar(255), + value longtext +); +INSERT INTO "key_value_store" VALUES('k1','v1'); +CREATE TABLE observations ( + entity varchar(255), + variable varchar(255), + date varchar(255), + value varchar(255), + provenance varchar(255) +); +INSERT INTO "observations" VALUES('e1','v1','2023','123','p1'); +INSERT INTO "observations" VALUES('e2','v1','2023','456','p1'); +INSERT INTO "observations" VALUES('e3','v1','2023','789','p1'); +CREATE TABLE triples ( + subject_id varchar(255), + predicate varchar(255), + object_id varchar(255), + object_value TEXT +); +INSERT INTO "triples" VALUES('sub1','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub1','pred1','','objval1'); +INSERT INTO "triples" VALUES('sub1','name','','name1'); +INSERT INTO "triples" VALUES('sub2','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub2','name','','name2'); +CREATE INDEX observations_entity_variable on observations (entity, variable); +CREATE INDEX triples_subject_id on triples (subject_id); +COMMIT; diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variable_per_row.csv b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variable_per_row.csv new file mode 100644 index 00000000..d5b70ebc --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variable_per_row.csv @@ -0,0 +1,5 @@ +entity,variable,date,value +country/IND,var1,2020,0.16 +country/IND,var2,2020,53 +country/CHN,var1,2020,0.23 +country/CHN,var2,2020,67 \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variables.mcf b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variables.mcf new file mode 100644 index 00000000..7b8c0b0d --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/variables.mcf @@ -0,0 +1,12 @@ +Node: v1 +dcid:"some_var1" +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 1 Name" +description: "Some Variable 1 Description" + +Node: dcid:some_var2 +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 2 Name" +description: "Some Variable 2 Description" diff --git a/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/wikidataids.csv b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/wikidataids.csv new file mode 100644 index 00000000..a9870983 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/input_dir_driven_with_existing_old_schema_data/wikidataids.csv @@ -0,0 +1,3 @@ +wikidataid,year,var1,var2 +Q30,2021,555,666 +Q668,2022,321,123 diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/article_entities.csv b/simple/tests/stats/test_data/runner/input/schema_update_only/article_entities.csv new file mode 100644 index 00000000..61cbba4a --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/article_entities.csv @@ -0,0 +1,4 @@ +article_id,article_title,article_author +article1,"Article 1",author1 +article2,"Article 2","author1,author2" +article3,"Article 3","author2,author3" \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/author_entities.csv b/simple/tests/stats/test_data/runner/input/schema_update_only/author_entities.csv new file mode 100644 index 00000000..42f0b477 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/author_entities.csv @@ -0,0 +1,4 @@ +author_id,author_name,author_country +author1,"Jane Doe",country/USA +author2,"Joe Smith",country/CAN +author3,"Jane Smith",country/USA \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/config.json b/simple/tests/stats/test_data/runner/input/schema_update_only/config.json new file mode 100644 index 00000000..83199fee --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/config.json @@ -0,0 +1,44 @@ +{ + "inputFiles": { + "countries.csv": { + "importType": "observations", + "format": "variablePerColumn", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "wikidataids.csv": { + "importType": "observations", + "format": "variablePerColumn", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "variable_per_row.csv": { + "importType": "observations", + "format": "variablePerRow", + "entityType": "Country", + "provenance": "Provenance1 Name" + }, + "author_entities.csv": { + "importType": "entities", + "rowEntityType": "Author", + "idColumn": "author_id", + "entityColumns": ["author_country"], + "provenance": "Provenance1 Name" + }, + "article_entities.csv": { + "importType": "entities", + "rowEntityType": "Article", + "idColumn": "article_id", + "entityColumns": ["article_author"], + "provenance": "Provenance1 Name" + } + }, + "sources": { + "Source1 Name": { + "url": "http://source1.com", + "provenances": { + "Provenance1 Name": "http://source1.com/provenance1" + } + } + } +} diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/countries.csv b/simple/tests/stats/test_data/runner/input/schema_update_only/countries.csv new file mode 100644 index 00000000..e62275b3 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/countries.csv @@ -0,0 +1,15 @@ +place,year,var1,var2 +Afghanistan,2023,0.19,6 +Yemen,2023,0.21,56 +Angola,2023,0.29,6 +Zambia,2023,0.31,34 +Zimbabwe,2023,0.37,76 +Albania,2023,0.50,34 +dcid: wikidataId/Q22062741,2023,0.50,97 +Algeria,2023,0.52,92 +West Bank and Gaza,2023,0.53,64 +Andorra,2023,0.76,9 +American Samoa,2023,#N/A,34 +Anguilla,2023,#N/A,42 +Wallis and Futuna Islands,2023,#N/A,75 +Western Sahara,2023,#N/A,65 diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/sqlite_old_schema_populated.sql b/simple/tests/stats/test_data/runner/input/schema_update_only/sqlite_old_schema_populated.sql new file mode 100644 index 00000000..429ccd52 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/sqlite_old_schema_populated.sql @@ -0,0 +1,36 @@ +BEGIN TRANSACTION; +CREATE TABLE imports ( + imported_at datetime, + status varchar(16), + metadata text +); +INSERT INTO "imports" VALUES('2022-02-02 00:00:00','SUCCESS','{"numVars": 1, "numObs": 3}'); +CREATE TABLE key_value_store ( + lookup_key varchar(255), + value longtext +); +INSERT INTO "key_value_store" VALUES('k1','v1'); +CREATE TABLE observations ( + entity varchar(255), + variable varchar(255), + date varchar(255), + value varchar(255), + provenance varchar(255) +); +INSERT INTO "observations" VALUES('e1','v1','2023','123','p1'); +INSERT INTO "observations" VALUES('e2','v1','2023','456','p1'); +INSERT INTO "observations" VALUES('e3','v1','2023','789','p1'); +CREATE TABLE triples ( + subject_id varchar(255), + predicate varchar(255), + object_id varchar(255), + object_value TEXT +); +INSERT INTO "triples" VALUES('sub1','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub1','pred1','','objval1'); +INSERT INTO "triples" VALUES('sub1','name','','name1'); +INSERT INTO "triples" VALUES('sub2','typeOf','StatisticalVariable',''); +INSERT INTO "triples" VALUES('sub2','name','','name2'); +CREATE INDEX observations_entity_variable on observations (entity, variable); +CREATE INDEX triples_subject_id on triples (subject_id); +COMMIT; diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/variable_per_row.csv b/simple/tests/stats/test_data/runner/input/schema_update_only/variable_per_row.csv new file mode 100644 index 00000000..d5b70ebc --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/variable_per_row.csv @@ -0,0 +1,5 @@ +entity,variable,date,value +country/IND,var1,2020,0.16 +country/IND,var2,2020,53 +country/CHN,var1,2020,0.23 +country/CHN,var2,2020,67 \ No newline at end of file diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/variables.mcf b/simple/tests/stats/test_data/runner/input/schema_update_only/variables.mcf new file mode 100644 index 00000000..7b8c0b0d --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/variables.mcf @@ -0,0 +1,12 @@ +Node: v1 +dcid:"some_var1" +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 1 Name" +description: "Some Variable 1 Description" + +Node: dcid:some_var2 +typeOf: dcs:StatisticalVariable +measuredProperty: dcs:value +name: "Some Variable 2 Name" +description: "Some Variable 2 Description" diff --git a/simple/tests/stats/test_data/runner/input/schema_update_only/wikidataids.csv b/simple/tests/stats/test_data/runner/input/schema_update_only/wikidataids.csv new file mode 100644 index 00000000..a9870983 --- /dev/null +++ b/simple/tests/stats/test_data/runner/input/schema_update_only/wikidataids.csv @@ -0,0 +1,3 @@ +wikidataid,year,var1,var2 +Q30,2021,555,666 +Q668,2022,321,123 diff --git a/simple/tests/stats/test_util.py b/simple/tests/stats/test_util.py index 486b3e8a..f7fbefc5 100644 --- a/simple/tests/stats/test_util.py +++ b/simple/tests/stats/test_util.py @@ -38,6 +38,17 @@ def compare_files(test: unittest.TestCase, """ Compares the content of the actual and expected files and asserts their equality. """ + # Pass if neither actual nor existing file exists. + # Fail if only one exists. + actual_file_exists = os.path.exists(actual_path) + expected_file_exists = os.path.exists(expected_path) + test.assertEqual( + actual_file_exists, expected_file_exists, + f"Actual file existence does not match expected file existence: {message}" + ) + if (expected_file_exists == False): + return + with open(actual_path) as gotf: got = gotf.read() with open(expected_path) as wantf: @@ -93,6 +104,27 @@ def write_key_values(db_path: str, output_path: str): index=False) +def write_full_db_to_file(db_path: str, output_path: str): + """ + Writes a file with SQL statements that can be used to reconstruct the full + database schema and contents. + """ + with sqlite3.connect(db_path) as db: + with open(output_path, 'w') as f: + for line in db.iterdump(): + f.write('%s\n' % line) + + +def read_full_db_from_file(db_path: str, input_path: str): + """ + Reconstructs a database's schema and contents from a file with a series of + SQL commands. + """ + with sqlite3.connect(db_path) as db: + with open(input_path, 'r') as f: + db.cursor().executescript(f.read()) + + class FakeGzipTime: def __init__(self, timestamp=0) -> None: diff --git a/simple/tests/stats/variable_per_row_importer_test.py b/simple/tests/stats/variable_per_row_importer_test.py index 16dd6654..6cc7b98e 100644 --- a/simple/tests/stats/variable_per_row_importer_test.py +++ b/simple/tests/stats/variable_per_row_importer_test.py @@ -22,7 +22,7 @@ import pandas as pd from stats.config import Config from stats.data import Observation -from stats.db import create_db +from stats.db import create_and_update_db from stats.db import create_sqlite_config from stats.nodes import Nodes from stats.reporter import FileImportReporter @@ -65,7 +65,7 @@ def _test_import(test: unittest.TestCase, test_name: str): with open(config_path) as config_file: config = Config(json.load(config_file)) - db = create_db(create_sqlite_config(db_path)) + db = create_and_update_db(create_sqlite_config(db_path)) report_fh = LocalFileHandler(os.path.join(temp_dir, "report.json")) reporter = FileImportReporter(input_path, ImportReporter(report_fh)) nodes = Nodes(config)