From 10006db4ceb45fe2374326d2f713ecc1a847bbb8 Mon Sep 17 00:00:00 2001 From: Themis Valtinos <73662635+themisvaltinos@users.noreply.github.com> Date: Mon, 23 Dec 2024 16:01:43 +0200 Subject: [PATCH] Revise VirtualUpdateStatement to contain expressions; update docs --- docs/concepts/macros/macro_variables.md | 2 +- docs/concepts/models/python_models.md | 35 +++++++++++++++++++++++ docs/concepts/models/seed_models.md | 29 +++++++++++++++++++ docs/concepts/models/sql_models.md | 38 +++++++++++++++++++++++-- sqlmesh/core/dialect.py | 29 ++++++++++--------- sqlmesh/core/model/definition.py | 2 +- sqlmesh/core/snapshot/evaluator.py | 2 +- tests/core/test_model.py | 34 ++++++++++++++++++++++ 8 files changed, 153 insertions(+), 18 deletions(-) diff --git a/docs/concepts/macros/macro_variables.md b/docs/concepts/macros/macro_variables.md index 0774dd3fa..3fab383b5 100644 --- a/docs/concepts/macros/macro_variables.md +++ b/docs/concepts/macros/macro_variables.md @@ -128,6 +128,6 @@ SQLMesh provides two other predefined variables used to modify model behavior ba * 'evaluating' - The model query logic is being evaluated. * 'testing' - The model query logic is being evaluated in the context of a unit test. * @gateway - A string value containing the name of the current [gateway](../../guides/connections.md). -* @this_model - A string value containing the name of the physical table the model view selects from. Typically used to create [generic audits](../audits.md#generic-audits). +* @this_model - A string value containing the name of the physical table the model view selects from. Typically used to create [generic audits](../audits.md#generic-audits). In the case of [on_virtual_update statements](../models/sql_models.md#optional-on-virtual-update-statements) it contains the qualified view name instead. * Can be used in model definitions when SQLGlot cannot fully parse a statement and you need to reference the model's underlying physical table directly. 
* Can be passed as an argument to macros that access or interact with the underlying physical table. diff --git a/docs/concepts/models/python_models.md b/docs/concepts/models/python_models.md index 3b0faade3..a66a5649f 100644 --- a/docs/concepts/models/python_models.md +++ b/docs/concepts/models/python_models.md @@ -164,6 +164,41 @@ def execute( context.fetchdf("CREATE INDEX idx ON example.pre_post_statements (id);") ``` +## Optional on-virtual-update statements + +The optional on-virtual-update statements allow you to execute SQL commands after the completion of the [Virtual Update](#virtual-update). + +These can be used, for example, to grant privileges on views of the virtual layer. + +Similar to pre/post-statements you can set the `on_virtual_update` argument in the `@model` decorator to a list of SQL strings, SQLGlot expressions, or macro calls. + +``` python linenums="1" hl_lines="8" +@model( + "db.test_model", + kind="full", + columns={ + "id": "int", + "name": "text", + }, + on_virtual_update=["GRANT SELECT ON VIEW @this_model TO ROLE dev_role"], +) +def execute( + context: ExecutionContext, + start: datetime, + end: datetime, + execution_time: datetime, + **kwargs: t.Any, +) -> pd.DataFrame: + + return pd.DataFrame([ + {"id": 1, "name": "name"} + ]) +``` + +!!! note + + Table resolution for these statements occurs at the virtual layer. This means that table names, including `@this_model` macro, are resolved to their qualified view names. For instance, when running the plan in an environment named `dev`, `db.test_model` and `@this_model` would resolve to `db__dev.test_model` and not to the physical table name. + ## Dependencies In order to fetch data from an upstream model, you first get the table name using `context`'s `resolve_table` method. 
This returns the appropriate table name for the current runtime [environment](../environments.md): diff --git a/docs/concepts/models/seed_models.md b/docs/concepts/models/seed_models.md index d1970f958..ec3e6a128 100644 --- a/docs/concepts/models/seed_models.md +++ b/docs/concepts/models/seed_models.md @@ -194,3 +194,32 @@ ALTER SESSION SET TIMEZONE = 'UTC'; -- These are post-statements ALTER SESSION SET TIMEZONE = 'PST'; ``` + +## On-virtual-update statements + +Seed models also support on-virtual-update statements, which are executed after the completion of the [Virtual Update](#virtual-update). + +These must be enclosed within an `ON_VIRTUAL_UPDATE_BEGIN;` ...; `ON_VIRTUAL_UPDATE_END;` block: + +```sql linenums="1" hl_lines="8-13" +MODEL ( + name test_db.national_holidays, + kind SEED ( + path 'national_holidays.csv' + ) +); + +ON_VIRTUAL_UPDATE_BEGIN; +GRANT SELECT ON VIEW @this_model TO ROLE dev_role; +JINJA_STATEMENT_BEGIN; +GRANT SELECT ON VIEW {{ this_model }} TO ROLE admin_role; +JINJA_END; +ON_VIRTUAL_UPDATE_END; +``` + + +[Jinja expressions](../macros/jinja_macros.md) can also be used within them, as demonstrated in the example above. These expressions must be properly nested within a `JINJA_STATEMENT_BEGIN;` and `JINJA_END;` block. + +!!! note + + Table resolution for these statements occurs at the virtual layer. This means that table names, including the `@this_model` macro, are resolved to their qualified view names. For instance, when running the plan in an environment named `dev`, `test_db.national_holidays` and `@this_model` would resolve to `test_db__dev.national_holidays` and not to the physical table name. 
\ No newline at end of file diff --git a/docs/concepts/models/sql_models.md b/docs/concepts/models/sql_models.md index d5f6d910f..56e2a5955 100644 --- a/docs/concepts/models/sql_models.md +++ b/docs/concepts/models/sql_models.md @@ -10,6 +10,7 @@ The SQL-based definition of SQL models is the most common one, and consists of t * Optional pre-statements * A single query * Optional post-statements +* Optional on-virtual-update-statements These models are designed to look and feel like you're simply using SQL, but they can be customized for advanced use cases. @@ -90,6 +91,38 @@ MODEL ( Note that the SQL command `UNCACHE TABLE countries` inside the `@IF()` macro does **not** end with a semi-colon. Instead, the semi-colon comes after the `@IF()` macro's closing parenthesis. +### Optional on-virtual-update statements + +The optional on-virtual-update statements allow you to execute SQL commands after the completion of the [Virtual Update](#virtual-update). + +These can be used, for example, to grant privileges on views of the virtual layer. + +These SQL statements must be enclosed within an `ON_VIRTUAL_UPDATE_BEGIN;` ...; `ON_VIRTUAL_UPDATE_END;` block like this: + +```sql linenums="1" hl_lines="10-15" +MODEL ( + name db.customers, + kind FULL +); + +SELECT + r.id::INT +FROM raw.restaurants AS r; + +ON_VIRTUAL_UPDATE_BEGIN; +GRANT SELECT ON VIEW @this_model TO ROLE role_name; +JINJA_STATEMENT_BEGIN; +GRANT SELECT ON VIEW {{ this_model }} TO ROLE admin; +JINJA_END; +ON_VIRTUAL_UPDATE_END; +``` + +[Jinja expressions](../macros/jinja_macros.md) can also be used within them, as demonstrated in the example above. These expressions must be properly nested within a `JINJA_STATEMENT_BEGIN;` and `JINJA_END;` block. + +!!! note + + Table resolution for these statements occurs at the virtual layer. This means that table names, including `@this_model` macro, are resolved to their qualified view names. 
For instance, when running the plan in an environment named `dev`, `db.customers` and `@this_model` would resolve to `db__dev.customers` and not to the physical table name. + ### The model query The model must contain a standalone query, which can be a single `SELECT` expression, or multiple `SELECT` expressions combined with the `UNION`, `INTERSECT`, or `EXCEPT` operators. The result of this query will be used to populate the model's table or view. @@ -98,7 +131,7 @@ The model must contain a standalone query, which can be a single `SELECT` expres The Python-based definition of SQL models consists of a single python function, decorated with SQLMesh's `@model` [decorator](https://wiki.python.org/moin/PythonDecorators). The decorator is required to have the `is_sql` keyword argument set to `True` to distinguish it from [Python models](./python_models.md) that return DataFrame instances. -This function's return value serves as the model's query, and it must be either a SQL string or a [SQLGlot expression](https://github.com/tobymao/sqlglot/blob/main/sqlglot/expressions.py). The `@model` decorator is used to define the model's [metadata](#MODEL-DDL) and, optionally its pre/post-statements that are also in the form of SQL strings or SQLGlot expressions. +This function's return value serves as the model's query, and it must be either a SQL string or a [SQLGlot expression](https://github.com/tobymao/sqlglot/blob/main/sqlglot/expressions.py). The `@model` decorator is used to define the model's [metadata](#MODEL-DDL) and, optionally its pre/post-statements or on-virtual-update-statements that are also in the form of SQL strings or SQLGlot expressions. Defining a SQL model using Python can be beneficial in cases where its query is too complex to express cleanly in SQL, for example due to having many dynamic components that would require heavy use of [macros](../macros/overview/). 
Since Python-based models generate SQL, they support the same features as regular SQL models, such as column-level [lineage](../glossary/#lineage). @@ -120,6 +153,7 @@ from sqlmesh.core.macros import MacroEvaluator kind="FULL", pre_statements=["CACHE TABLE countries AS SELECT * FROM raw.countries"], post_statements=["UNCACHE TABLE countries"], + on_virtual_update=["GRANT SELECT ON VIEW @this_model TO ROLE dev_role"], ) def entrypoint(evaluator: MacroEvaluator) -> str | exp.Expression: return ( @@ -139,7 +173,7 @@ One could also define this model by simply returning a string that contained the The `@model` decorator is the Python equivalent of the `MODEL` DDL. -In addition to model metadata and configuration information, one can also set the keyword arguments `pre_statements` and `post_statements` to a list of SQL strings and/or SQLGlot expressions to define the pre/post-statements of the model, respectively. +In addition to model metadata and configuration information, one can also set the keyword arguments `pre_statements`, `post_statements` and `on_virtual_update` to a list of SQL strings and/or SQLGlot expressions to define the pre/post-statements and on-virtual-update-statements of the model, respectively. !!! 
note diff --git a/sqlmesh/core/dialect.py b/sqlmesh/core/dialect.py index eda75a521..2b0ab8dd1 100644 --- a/sqlmesh/core/dialect.py +++ b/sqlmesh/core/dialect.py @@ -62,7 +62,7 @@ class JinjaStatement(Jinja): class VirtualUpdateStatement(exp.Expression): - pass + arg_types = {"expressions": True} class ModelKind(exp.Expression): @@ -789,8 +789,8 @@ def _is_virtual_statement_end(tokens: t.List[Token], pos: int) -> bool: return _is_command_statement(ON_VIRTUAL_UPDATE_END, tokens, pos) -def virtual_statement(statement: t.List[exp.Expression]) -> VirtualUpdateStatement: - return VirtualUpdateStatement(this=statement) +def virtual_statement(statements: t.List[exp.Expression]) -> VirtualUpdateStatement: + return VirtualUpdateStatement(expressions=statements) class ChunkType(Enum): @@ -882,24 +882,25 @@ def parse( parser = dialect.parser() expressions: t.List[exp.Expression] = [] - def parse_sql_chunk(chunk: t.List[Token]) -> t.List[exp.Expression]: + def parse_sql_chunk(chunk: t.List[Token], meta_sql: bool = True) -> t.List[exp.Expression]: parsed_expressions: t.List[t.Optional[exp.Expression]] = ( parser.parse(chunk, sql) if into is None else parser.parse_into(into, chunk, sql) ) expressions = [] for expression in parsed_expressions: if expression: - expression.meta["sql"] = parser._find_sql(chunk[0], chunk[-1]) + if meta_sql: + expression.meta["sql"] = parser._find_sql(chunk[0], chunk[-1]) expressions.append(expression) return expressions - def parse_jinja_chunk(chunk: t.List[Token]) -> exp.Expression: + def parse_jinja_chunk(chunk: t.List[Token], meta_sql: bool = True) -> exp.Expression: start, *_, end = chunk segment = sql[start.end + 2 : end.start - 1] factory = jinja_query if chunk_type == ChunkType.JINJA_QUERY else jinja_statement expression = factory(segment.strip()) - meta_sql = sql[start.start : end.end + 1] - expression.meta["sql"] = meta_sql + if meta_sql: + expression.meta["sql"] = sql[start.start : end.end + 1] return expression def parse_virtual_statement( 
@@ -912,18 +913,20 @@ def parse_virtual_statement( while chunks[pos - 1][0] == [] or chunks[pos - 1][0][-1].text != ON_VIRTUAL_UPDATE_END: chunk, chunk_type = chunks[pos] if chunk_type == ChunkType.JINJA_STATEMENT: - virtual_update_statements.append(parse_jinja_chunk(chunk)) + virtual_update_statements.append(parse_jinja_chunk(chunk, False)) else: virtual_update_statements.extend( - parse_sql_chunk(chunk[int(chunk[0].text == "ON_VIRTUAL_UPDATE_BEGIN") : -1]) + parse_sql_chunk( + chunk[int(chunk[0].text == "ON_VIRTUAL_UPDATE_BEGIN") : -1], False + ), ) pos += 1 if virtual_update_statements: - statement = virtual_statement(virtual_update_statements) + statements = virtual_statement(virtual_update_statements) end = chunk[-1].end + 1 - statement.meta["sql"] = sql[start:end] - return [statement], pos + statements.meta["sql"] = sql[start:end] + return [statements], pos return [], pos diff --git a/sqlmesh/core/model/definition.py b/sqlmesh/core/model/definition.py index beec2dc22..56b8989e8 100644 --- a/sqlmesh/core/model/definition.py +++ b/sqlmesh/core/model/definition.py @@ -2115,7 +2115,7 @@ def _split_sql_model_statements( inline_audits[loaded_audit.name] = loaded_audit idx += 2 elif isinstance(expr, d.VirtualUpdateStatement): - for statement in expr.this: + for statement in expr.expressions: on_virtual_update.append(statement) idx += 1 else: diff --git a/sqlmesh/core/snapshot/evaluator.py b/sqlmesh/core/snapshot/evaluator.py index a1469b224..555cfffdc 100644 --- a/sqlmesh/core/snapshot/evaluator.py +++ b/sqlmesh/core/snapshot/evaluator.py @@ -1008,7 +1008,7 @@ def _execute_virtual_statements( start=start, end=end, execution_time=execution_time, - snapshots=parent_snapshots_by_name(snapshot, snapshots), + snapshots=snapshots, deployability_index=deployability_index, engine_adapter=adapter, table_mapping=table_mapping, diff --git a/tests/core/test_model.py b/tests/core/test_model.py index ca479e542..012f51704 100644 --- a/tests/core/test_model.py +++ 
b/tests/core/test_model.py @@ -984,6 +984,40 @@ def test_seed_pre_statements_only(): assert not model.post_statements +def test_seed_on_virtual_update_statements(): + expressions = d.parse( + """ + MODEL ( + name db.seed, + kind SEED ( + path '../seeds/waiter_names.csv', + batch_size 100, + ) + ); + + JINJA_STATEMENT_BEGIN; + CREATE TABLE x{{ 1 + 1 }}; + JINJA_END; + + ON_VIRTUAL_UPDATE_BEGIN; + JINJA_STATEMENT_BEGIN; + GRANT SELECT ON VIEW {{ this_model }} TO ROLE dev_role; + JINJA_END; + DROP TABLE x2; + ON_VIRTUAL_UPDATE_END; + + """ + ) + + model = load_sql_based_model(expressions, path=Path("./examples/sushi/models/test_model.sql")) + + assert model.pre_statements == [d.jinja_statement("CREATE TABLE x{{ 1 + 1 }};")] + assert model.on_virtual_update == [ + d.jinja_statement("GRANT SELECT ON VIEW {{ this_model }} TO ROLE dev_role;"), + *d.parse("DROP TABLE x2;"), + ] + + def test_seed_model_custom_types(tmp_path): model_csv_path = (tmp_path / "model.csv").absolute()