SQL migrate improved #528

Merged: 2 commits, Dec 5, 2023
6 changes: 6 additions & 0 deletions CHANGES.md
@@ -1,3 +1,9 @@
+# 2023-12-01 (5.19.0)
+
+* Improve support for using git commit hashes when creating SQL migrations
+  from amsterdam schema table definitions. Now also supports schemas
+  with table definitions in separate files.
+
# 2023-12-01 (5.18.0)

* Add possibility to use git commit hashes when creating SQL migrations
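By way of illustration, the improved flow can also be driven from Django's standard call_command API. This is a minimal sketch; the dataset name, git refs and verbosity below are placeholders, not values taken from this PR:

from django.core.management import call_command

# SCHEMA_URL (or --schema-url) must point at a local filesystem checkout
# of the schema repository for --from-files to work.
call_command(
    "sqlmigrate_schema",
    "meetbouten",  # schema name (placeholder)
    "meetbouten",  # table name (placeholder)
    "7d986c96",    # OLDVERSION: any git ref (branch, tag or commit hash)
    "master",      # NEWVERSION
    from_files=True,
    verbosity=3,
)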
4 changes: 2 additions & 2 deletions setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = amsterdam-schema-tools
-version = 5.18.0
+version = 5.19.0
url = https://github.com/amsterdam/schema-tools
license = Mozilla Public 2.0
author = Team Data Diensten, van het Dataplatform onder de Directie Digitale Voorzieningen (Gemeente Amsterdam)
@@ -76,7 +76,7 @@ tests =
flake8-logging-format # Validate logging format strings
pytest
pytest-cov
-pytest-django
+pytest-django >= 4.7.0
pytest-sqlalchemy
django =
django >= 3.0, < 4.2
src/schematools/contrib/django/management/commands/sqlmigrate_schema.py
@@ -20,7 +20,7 @@
from schematools.contrib.django.factories import model_factory, schema_models_factory
from schematools.contrib.django.models import Dataset
from schematools.exceptions import DatasetNotFound, DatasetTableNotFound
-from schematools.loaders import get_schema_loader
+from schematools.loaders import SchemaLoader, get_schema_loader
from schematools.naming import to_snake_case
from schematools.types import DatasetSchema, DatasetTableSchema

@@ -32,11 +32,11 @@ class Command(BaseCommand):
./manage.py sqlmigrate_schema -v3 meetbouten meetbouten v1.0.0 v1.1.0

or, using the schemas from local filesystem and getting the
-older version of a schema from a git commit hash:
+older version of a schema from a git reference (can be a branch/tag/hash):

./manage.py sqlmigrate_schema -v3 meetbouten meetbouten \
-7d986c96:../amsterdam-schema/datasets/meetbouten/dataset.json \
-../amsterdam-schema/datasets/meetbouten/dataset.json \
+7d986c96 \
+master \
--from-files
The command is sped up by pointing ``SCHEMA_URL`` or ``--schema-url``
to a local filesystem repository of the schema files. Otherwise it downloads
@@ -60,7 +60,7 @@ def add_arguments(self, parser: CommandParser) -> None:
parser.add_argument(
"--from-files",
action="store_true",
help="Get the tables from a file. NB. the SCHEMA_URL also needs to be file-based!",
help="Get the tables from the filesystem. NB. the SCHEMA_URL also needs to be file-based!",
)
parser.add_argument("schema", help="Schema name")
parser.add_argument("table", help="Table name")
@@ -69,12 +69,12 @@ def add_arguments(self, parser: CommandParser) -> None:
parser.add_argument(
"version1",
metavar="OLDVERSION",
help="Old table version, e.g. v1.0.0, or `path-to-dataset-json` with --from-files",
help="Old table version, e.g. v1.0.0, or a git ref like `master`, `tag`, `branch` or `hash` with --from-files",
)
parser.add_argument(
"version2",
metavar="NEWVERSION",
help="New table version, e.g. v1.1.0, , or `path-to-dataset-json` with --from-files",
help="New table version, e.g. v1.1.0, , or a git ref like `master`, `tag`, `branch` or `hash` with --from-files",
)

def handle(self, *args, **options) -> None:
@@ -87,16 +87,27 @@ def handle(self, *args, **options) -> None:

# Load the data from the schema repository
dataset = self._load_dataset(options["schema"])
+
+# For the from_files option, we check out the schemas repo
+# in a temporary directory.
+# By checking out 2 different git references, we can
+# obtain the tables for these specific references
+# for comparison and SQL generation.
if options["from_files"]:
assert not options["schema_url"].startswith(
"http"
), "The --from-files can only work with a SCHEMA_URL on the local filesystem."
-table1 = self._load_table_version_from_file(
-dataset.id, options["table"], self._checkout_file_if_needed(options["version1"])
-)
-table2 = self._load_table_version_from_file(
-dataset.id, options["table"], self._checkout_file_if_needed(options["version2"])
-)
+with tempfile.TemporaryDirectory() as tmpdir:
+schemas_root = Path(options["schema_url"]).parent
+subprocess.run( # nosec
+["git", "clone", schemas_root, tmpdir],
+)
+table1 = self._load_table_from_checkout(
+dataset.id, options["table"], tmpdir, options["version1"]
+)
+table2 = self._load_table_from_checkout(
+dataset.id, options["table"], tmpdir, options["version2"]
+)
else:
table1 = self._load_table_version(dataset, options["table"], options["version1"])
table2 = self._load_table_version(dataset, options["table"], options["version2"])
@@ -128,6 +139,17 @@ def handle(self, *args, **options) -> None:
for migration in app_migrations:
start_state = self._print_sql(connection, start_state, migration)

+def _load_table_from_checkout(
+self, dataset_id: str, table_id: str, tmpdir: str, version_ref: str
+) -> DatasetTableSchema:
+"""Load a DatasetTableSchema for the specified git reference."""
+subprocess.run(["git", "checkout", version_ref], cwd=tmpdir, stdout=subprocess.DEVNULL)
+tmp_schema_path = Path(tmpdir) / "datasets"
+# We create a specific schema loader, because it has to read in the data
+# associated with a specific git checkout.
+loader = get_schema_loader(str(tmp_schema_path), loaded_callback=self._loaded_callback)
+return self._load_table_version_from_file(loader, dataset_id, table_id)
+
def _loaded_callback(self, schema: DatasetSchema):
"""Track which schema's get loaded. This is also used for dependency tracking."""
if self.verbosity >= 1:
@@ -138,7 +160,7 @@ def _loaded_callback(self, schema: DatasetSchema):
def _load_dataset(self, dataset_id: str) -> DatasetSchema:
"""Load a dataset, bail out with a proper CLI message."""
try:
-return self.loader.get_dataset(dataset_id)
+return self.loader.get_dataset(dataset_id, prefetch_related=True)
except DatasetNotFound as e:
raise CommandError(str(e)) from e

@@ -160,35 +182,10 @@ def _load_table_version(

raise CommandError(f"Table version '{table_id}/{version}' does not exist.") from e

-def _checkout_file_if_needed(self, file_path):
-"""Git check out the file if needed.
-
-If the file_path points to a git hash,
-get the content of the file and put this in a temp file.
-So e.g. file_path can be `7d986c96:../amsterdam-schema/datasets/bag/dataset.json`
-Assumption is that the `git` binary is available on the system.
-"""
-if ":" in file_path:
-git_hash, bare_file_path = file_path.split(":")
-pl_path = Path(bare_file_path)
-result = subprocess.run( # nosec
-["git", "show", f"{git_hash}:./{pl_path.name}"],
-cwd=pl_path.parent,
-capture_output=True,
-)
-handle, tmp_path = tempfile.mkstemp()
-with os.fdopen(handle, "wb") as fp:
-fp.write(result.stdout)
-fp.close()
-return tmp_path
-
-return file_path
-
def _load_table_version_from_file(
-self, dataset_id: str, table_id: str, file_path: str
+self, loader: SchemaLoader, dataset_id: str, table_id: str
) -> DatasetTableSchema:
-dataset = self.loader.get_dataset_from_file(file_path, allow_external_files=True)
-assert dataset.id == dataset_id, f"The id in '{file_path}' does not match '{dataset_id}'"
+dataset = loader.get_dataset(dataset_id, prefetch_related=True)
return dataset.get_table_by_id(table_id)

def _load_dependencies(self, dataset: DatasetSchema) -> list[str]:
@@ -202,7 +199,7 @@ def _load_dependencies(self, dataset: DatasetSchema) -> list[str]:

# Load first, and this fills the cache.
for dataset_id in related_ids - {dataset.id}:
-self.loader.get_dataset(dataset_id)
+self.loader.get_dataset(dataset_id, prefetch_related=True)

# Turn any loaded schema into a model.
# And when a call to model_factory() triggers loading of more schemas,
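The clone-and-checkout approach above can also be sketched outside the management command. A minimal sketch, assuming the `git` binary is available (as the removed helper also assumed) and that schemas_root points at a local clone of the schema repository; load_table_at_ref is a hypothetical helper, not part of this PR:

import subprocess
import tempfile
from pathlib import Path

from schematools.loaders import get_schema_loader


def load_table_at_ref(schemas_root: str, ref: str, dataset_id: str, table_id: str):
    """Load one table definition as it existed at the given git ref."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # A scratch clone keeps checkouts away from the caller's working tree.
        subprocess.run(["git", "clone", schemas_root, tmpdir], check=True)
        subprocess.run(["git", "checkout", ref], cwd=tmpdir, check=True)
        loader = get_schema_loader(str(Path(tmpdir) / "datasets"))
        # The schema data is read into memory here, before the temp dir is removed.
        dataset = loader.get_dataset(dataset_id, prefetch_related=True)
        return dataset.get_table_by_id(table_id)

Note that the PR itself clones once and then checks out each ref in turn inside a single temporary directory, which avoids a second full clone.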
2 changes: 1 addition & 1 deletion src/schematools/loaders.py
@@ -219,7 +219,7 @@ def get_dataset(self, dataset_id: str, prefetch_related: bool = False) -> DatasetSchema:
"""Gets a dataset from the filesystem for dataset_id."""
schema_json = self._read_dataset(dataset_id)
view_sql = self._read_view(dataset_id)
-return self._as_dataset(schema_json, view_sql)
+return self._as_dataset(schema_json, view_sql, prefetch_related=prefetch_related)

def _as_dataset(
self, schema_json: dict, view_sql: str = None, prefetch_related: bool = False
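A short usage sketch of the changed loader path; the dataset id and filesystem path below are placeholders:

from schematools.loaders import get_schema_loader

loader = get_schema_loader("/path/to/amsterdam-schema/datasets")

# prefetch_related=True is now forwarded to _as_dataset(), so related
# schemas are loaded eagerly along with the requested dataset.
dataset = loader.get_dataset("meetbouten", prefetch_related=True)
table = dataset.get_table_by_id("meetbouten")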