diff --git a/CHANGES.md b/CHANGES.md
index 489e36a7..bbdfba87 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,3 +1,9 @@
+# 2023-12-01 (5.19.0)
+
+* Improve support for git commit hashes when creating SQL migrations
+  from amsterdam schema table definitions. Now also supports schemas
+  with table definitions in separate files.
+
 # 2023-12-01 (5.18.0)
 
 * Add possibility to use git commit hashes when creating SQL migrations
diff --git a/setup.cfg b/setup.cfg
index 9ead6c0a..0ae02d90 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = amsterdam-schema-tools
-version = 5.18.0
+version = 5.19.0
 url = https://github.com/amsterdam/schema-tools
 license = Mozilla Public 2.0
 author = Team Data Diensten, van het Dataplatform onder de Directie Digitale Voorzieningen (Gemeente Amsterdam)
@@ -76,7 +76,7 @@ tests =
     flake8-logging-format  # Validate logging format strings
     pytest
     pytest-cov
-    pytest-django
+    pytest-django >= 4.7.0
     pytest-sqlalchemy
 django =
     django >= 3.0, < 4.2
diff --git a/src/schematools/contrib/django/management/commands/sqlmigrate_schema.py b/src/schematools/contrib/django/management/commands/sqlmigrate_schema.py
index 6a1f5341..ee8f8cf2 100644
--- a/src/schematools/contrib/django/management/commands/sqlmigrate_schema.py
+++ b/src/schematools/contrib/django/management/commands/sqlmigrate_schema.py
@@ -20,7 +20,7 @@
 from schematools.contrib.django.factories import model_factory, schema_models_factory
 from schematools.contrib.django.models import Dataset
 from schematools.exceptions import DatasetNotFound, DatasetTableNotFound
-from schematools.loaders import get_schema_loader
+from schematools.loaders import SchemaLoader, get_schema_loader
 from schematools.naming import to_snake_case
 from schematools.types import DatasetSchema, DatasetTableSchema
 
@@ -32,11 +32,11 @@ class Command(BaseCommand):
         ./manage.py sqlmigrate_schema -v3 meetbouten meetbouten v1.0.0 v1.1.0
 
     or, using the schemas from local filesystem and getting the
-    older version of a schema from a git commit hash:
+    older version of a schema from a git reference (can be a branch/tag/hash):
 
         ./manage.py sqlmigrate_schema -v3 meetbouten meetbouten \
-            7d986c96:../amsterdam-schema/datasets/meetbouten/dataset.json \
-            ../amsterdam-schema/datasets/meetbouten/dataset.json \
+            7d986c96 \
+            master \
             --from-files
     The command is sped up by pointing ``SCHEMA_URL`` or ``--schema-url``
     to a local filesystem repository of the schema files. Otherwise it downloads
@@ -60,7 +60,7 @@ def add_arguments(self, parser: CommandParser) -> None:
         parser.add_argument(
             "--from-files",
             action="store_true",
-            help="Get the tables from a file. NB. the SCHEMA_URL also needs to be file-based!",
+            help="Get the tables from the filesystem. NB. the SCHEMA_URL also needs to be file-based!",
         )
         parser.add_argument("schema", help="Schema name")
         parser.add_argument("table", help="Table name")
@@ -69,12 +69,12 @@ def add_arguments(self, parser: CommandParser) -> None:
         parser.add_argument(
             "version1",
             metavar="OLDVERSION",
-            help="Old table version, e.g. v1.0.0, or `path-to-dataset-json` with --from-files",
+            help="Old table version, e.g. v1.0.0, or a git ref (branch, tag or commit hash) with --from-files",
         )
         parser.add_argument(
             "version2",
             metavar="NEWVERSION",
-            help="New table version, e.g. v1.1.0, , or `path-to-dataset-json` with --from-files",
+            help="New table version, e.g. v1.1.0, or a git ref (branch, tag or commit hash) with --from-files",
         )
 
     def handle(self, *args, **options) -> None:
@@ -87,16 +87,27 @@ def handle(self, *args, **options) -> None:
 
         # Load the data from the schema repository
         dataset = self._load_dataset(options["schema"])
+
+        # For the from_files option, we check out the schemas repo
+        # in a temporary directory.
+        # By checking out 2 different git references, we can
+        # obtain the tables for those specific references
+        # for comparison and SQL generation.
         if options["from_files"]:
             assert not options["schema_url"].startswith(
                 "http"
             ), "The --from-files can only work with a SCHEMA_URL on the local filesystem."
-            table1 = self._load_table_version_from_file(
-                dataset.id, options["table"], self._checkout_file_if_needed(options["version1"])
-            )
-            table2 = self._load_table_version_from_file(
-                dataset.id, options["table"], self._checkout_file_if_needed(options["version2"])
-            )
+            with tempfile.TemporaryDirectory() as tmpdir:
+                schemas_root = Path(options["schema_url"]).parent
+                subprocess.run(  # nosec
+                    ["git", "clone", schemas_root, tmpdir],
+                )
+                table1 = self._load_table_from_checkout(
+                    dataset.id, options["table"], tmpdir, options["version1"]
+                )
+                table2 = self._load_table_from_checkout(
+                    dataset.id, options["table"], tmpdir, options["version2"]
+                )
         else:
             table1 = self._load_table_version(dataset, options["table"], options["version1"])
             table2 = self._load_table_version(dataset, options["table"], options["version2"])
@@ -128,6 +139,17 @@ def handle(self, *args, **options) -> None:
         for migration in app_migrations:
             start_state = self._print_sql(connection, start_state, migration)
 
+    def _load_table_from_checkout(
+        self, dataset_id: str, table_id: str, tmpdir: str, version_ref: str
+    ) -> DatasetTableSchema:
+        """Load a DatasetTableSchema for the specified git reference."""
+        subprocess.run(["git", "checkout", version_ref], cwd=tmpdir, stdout=subprocess.DEVNULL)
+        tmp_schema_path = Path(tmpdir) / "datasets"
+        # We create a dedicated schema loader, because it has to read the data
+        # associated with this specific git checkout.
+        loader = get_schema_loader(str(tmp_schema_path), loaded_callback=self._loaded_callback)
+        return self._load_table_version_from_file(loader, dataset_id, table_id)
+
     def _loaded_callback(self, schema: DatasetSchema):
         """Track which schema's get loaded. This is also used for dependency tracking."""
         if self.verbosity >= 1:
@@ -138,7 +160,7 @@ def _loaded_callback(self, schema: DatasetSchema):
     def _load_dataset(self, dataset_id: str) -> DatasetSchema:
         """Load a dataset, bail out with a proper CLI message."""
        try:
-            return self.loader.get_dataset(dataset_id)
+            return self.loader.get_dataset(dataset_id, prefetch_related=True)
         except DatasetNotFound as e:
             raise CommandError(str(e)) from e
 
@@ -160,35 +182,10 @@ def _load_table_version(
             raise CommandError(f"Table version '{table_id}/{version}' does not exist.") from e
 
-    def _checkout_file_if_needed(self, file_path):
-        """Git check out the file if needed.
-
-        If the file_path points to a git hash,
-        get the content of the file and put this in a temp file.
-        So e.g. file_path can be `7d986c96:../amsterdam-schema/datasets/bag/dataset.json`
-        Assumption is that the `git` binary is available on the system.
- """ - if ":" in file_path: - git_hash, bare_file_path = file_path.split(":") - pl_path = Path(bare_file_path) - result = subprocess.run( # nosec - ["git", "show", f"{git_hash}:./{pl_path.name}"], - cwd=pl_path.parent, - capture_output=True, - ) - handle, tmp_path = tempfile.mkstemp() - with os.fdopen(handle, "wb") as fp: - fp.write(result.stdout) - fp.close() - return tmp_path - - return file_path - def _load_table_version_from_file( - self, dataset_id: str, table_id: str, file_path: str + self, loader: SchemaLoader, dataset_id: str, table_id: str ) -> DatasetTableSchema: - dataset = self.loader.get_dataset_from_file(file_path, allow_external_files=True) - assert dataset.id == dataset_id, f"The id in '{file_path}' does not match '{dataset_id}'" + dataset = loader.get_dataset(dataset_id, prefetch_related=True) return dataset.get_table_by_id(table_id) def _load_dependencies(self, dataset: DatasetSchema) -> list[str]: @@ -202,7 +199,7 @@ def _load_dependencies(self, dataset: DatasetSchema) -> list[str]: # Load first, and this fills the cache. for dataset_id in related_ids - {dataset.id}: - self.loader.get_dataset(dataset_id) + self.loader.get_dataset(dataset_id, prefetch_related=True) # Turn any loaded schema into a model. # And when a call to model_factory() triggers loading of more schemas, diff --git a/src/schematools/loaders.py b/src/schematools/loaders.py index 04e6f8d2..86b9eb3d 100644 --- a/src/schematools/loaders.py +++ b/src/schematools/loaders.py @@ -219,7 +219,7 @@ def get_dataset(self, dataset_id: str, prefetch_related: bool = False) -> Datase """Gets a dataset from the filesystem for dataset_id.""" schema_json = self._read_dataset(dataset_id) view_sql = self._read_view(dataset_id) - return self._as_dataset(schema_json, view_sql) + return self._as_dataset(schema_json, view_sql, prefetch_related=prefetch_related) def _as_dataset( self, schema_json: dict, view_sql: str = None, prefetch_related: bool = False