From e7cb7dab87f66ebba1f24e1b7da338314ebcae72 Mon Sep 17 00:00:00 2001 From: "Michael J. Sullivan" Date: Fri, 6 Sep 2024 16:25:22 -0700 Subject: [PATCH] [Upgrades] Support rolling back a prepared upgrade This is basically a straightforward application of our existing code to delete schemas from *old* versions, applied to the new version instead. --- .github/workflows.src/tests-inplace.tpl.yml | 14 ++- .github/workflows/tests-inplace.yml | 14 ++- edb/server/args.py | 10 +- edb/server/inplace_upgrade.py | 111 ++++++++++++++------ edb/server/main.py | 1 + tests/inplace-testing/test.sh | 40 +++++++ 6 files changed, 155 insertions(+), 35 deletions(-) diff --git a/.github/workflows.src/tests-inplace.tpl.yml b/.github/workflows.src/tests-inplace.tpl.yml index 8cd1225dafd..66760848950 100644 --- a/.github/workflows.src/tests-inplace.tpl.yml +++ b/.github/workflows.src/tests-inplace.tpl.yml @@ -25,6 +25,18 @@ jobs: test: runs-on: ubuntu-latest needs: build + strategy: + fail-fast: false + matrix: + include: + - flags: + tests: + - flags: --rollback-and-test + tests: + # Do the reapply test on a smaller selection of tests, since + # it is slower. + - flags: --rollback-and-reapply + tests: -k test_link_on_target_delete -k test_edgeql_select -k test_dump steps: <<- restore_cache() >> @@ -33,7 +45,7 @@ jobs: # TODO: Would it be better to split this up into multiple jobs? - name: Test performing in-place upgrades run: | - ./tests/inplace-testing/test.sh vt + ./tests/inplace-testing/test.sh ${{ matrix.flags }} vt ${{ matrix.tests }} workflow-notifications: if: failure() && github.event_name != 'pull_request' diff --git a/.github/workflows/tests-inplace.yml b/.github/workflows/tests-inplace.yml index 5e1de19e637..de02e35e85c 100644 --- a/.github/workflows/tests-inplace.yml +++ b/.github/workflows/tests-inplace.yml @@ -330,6 +330,18 @@ jobs: test: runs-on: ubuntu-latest needs: build + strategy: + fail-fast: false + matrix: + include: + - flags: + tests: + - flags: --rollback-and-test + tests: + # Do the reapply test on a smaller selection of tests, since + # it is slower. + - flags: --rollback-and-reapply + tests: -k test_link_on_target_delete -k test_edgeql_select -k test_dump steps: - uses: actions/checkout@v4 @@ -474,7 +486,7 @@ jobs: # TODO: Would it be better to split this up into multiple jobs? - name: Test performing in-place upgrades run: | - ./tests/inplace-testing/test.sh vt + ./tests/inplace-testing/test.sh ${{ matrix.flags }} vt ${{ matrix.tests }} workflow-notifications: if: failure() && github.event_name != 'pull_request' diff --git a/edb/server/args.py b/edb/server/args.py index dea268be435..1d1c91874a5 100644 --- a/edb/server/args.py +++ b/edb/server/args.py @@ -229,6 +229,7 @@ class ServerConfig(NamedTuple): bootstrap_only: bool inplace_upgrade_prepare: Optional[pathlib.Path] inplace_upgrade_finalize: bool + inplace_upgrade_rollback: bool bootstrap_command: str bootstrap_command_file: pathlib.Path default_branch: Optional[str] @@ -679,12 +680,17 @@ def resolve_envvar_value(self, ctx: click.Context): click.option( '--inplace-upgrade-prepare', type=PathPath(), envvar="EDGEDB_SERVER_INPLACE_UPGRADE_PREPARE", - cls=EnvvarResolver, # XXX? 
+        cls=EnvvarResolver,
         help='try to do an in-place upgrade with the specified dump file'),
+    click.option(
+        '--inplace-upgrade-rollback', type=bool, is_flag=True,
+        envvar="EDGEDB_SERVER_INPLACE_UPGRADE_ROLLBACK",
+        cls=EnvvarResolver,
+        help='roll back a prepared upgrade'),
     click.option(
         '--inplace-upgrade-finalize', type=bool, is_flag=True,
         envvar="EDGEDB_SERVER_INPLACE_UPGRADE_FINALIZE",
-        cls=EnvvarResolver,  # XXX?
+        cls=EnvvarResolver,
         help='finalize an in-place upgrade'),
     click.option(
         '--default-branch', type=str,
diff --git a/edb/server/inplace_upgrade.py b/edb/server/inplace_upgrade.py
index 065372b0c40..08a8e55d5ac 100644
--- a/edb/server/inplace_upgrade.py
+++ b/edb/server/inplace_upgrade.py
@@ -21,6 +21,7 @@
 from typing import (
     Any,
     Optional,
+    Sequence,
 )
 
 import dataclasses
@@ -51,6 +52,7 @@
 from edb.server import defines as edbdef
 from edb.server import instdata
 from edb.server import pgcluster
+from edb.server import pgcon
 
 from edb.pgsql import common as pg_common
 from edb.pgsql import dbops
@@ -59,6 +61,8 @@
 
 logger = logging.getLogger('edb.server')
 
+PGCon = bootstrap.PGConnectionProxy | pgcon.PGConnection
+
 
 async def _load_schema(
     ctx: bootstrap.BootstrapContext,
     state: edbcompiler.CompilerState
@@ -343,35 +347,10 @@ async def _upgrade_one(
     '''
 
 
-async def _cleanup_one(
-    ctx: bootstrap.BootstrapContext,
+async def _delete_schemas(
+    conn: PGCon,
+    to_delete: Sequence[str],
 ) -> None:
-    conn = ctx.conn
-
-    # If the upgrade is already finalized, skip it. This lets us be
-    # resilient to crashes during the finalization process, which may
-    # leave some databases upgraded but not all.
-    if (await instdata.get_instdata(conn, 'upgrade_finalized', 'text')) == b'1':
-        logger.info(f"Database already pivoted")
-        return
-
-    trampoline_query = await instdata.get_instdata(
-        conn, 'trampoline_pivot_query', 'text')
-    fixup_query = await instdata.get_instdata(
-        conn, 'schema_fixup_query', 'text')
-
-    await conn.sql_execute(trampoline_query)
-    if fixup_query:
-        await conn.sql_execute(fixup_query)
-
-    namespaces = json.loads(await conn.sql_fetch_val("""
-        select json_agg(nspname) from pg_namespace
-        where nspname like 'edgedb%\\_v%'
-    """.encode('utf-8')))
-
-    cur_suffix = pg_common.versioned_schema("")
-    to_delete = [x for x in namespaces if not x.endswith(cur_suffix)]
-
     # To add a bit more safety, check whether there are any
     # dependencies on the modules we want to delete from outside those
     # modules since the only way to delete non-empty schemas in
@@ -404,6 +383,44 @@
         drop schema {', '.join(to_delete)} cascade
     """.encode('utf-8'))
 
+
+async def _get_namespaces(
+    conn: PGCon,
+) -> list[str]:
+    return json.loads(await conn.sql_fetch_val("""
+        select json_agg(nspname) from pg_namespace
+        where nspname like 'edgedb%\\_v%'
+    """.encode('utf-8')))
+
+
+async def _finalize_one(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    conn = ctx.conn
+
+    # If the upgrade is already finalized, skip it. This lets us be
+    # resilient to crashes during the finalization process, which may
+    # leave some databases upgraded but not all.
+    if (await instdata.get_instdata(conn, 'upgrade_finalized', 'text')) == b'1':
+        logger.info("Database upgrade already finalized")
+        return
+
+    trampoline_query = await instdata.get_instdata(
+        conn, 'trampoline_pivot_query', 'text')
+    fixup_query = await instdata.get_instdata(
+        conn, 'schema_fixup_query', 'text')
+
+    await conn.sql_execute(trampoline_query)
+    if fixup_query:
+        await conn.sql_execute(fixup_query)
+
+    namespaces = await _get_namespaces(conn)
+
+    cur_suffix = pg_common.versioned_schema("")
+    to_delete = [x for x in namespaces if not x.endswith(cur_suffix)]
+
+    await _delete_schemas(conn, to_delete)
+
     await bootstrap._store_static_text_cache(
         ctx,
         f'upgrade_finalized',
@@ -437,6 +454,35 @@ async def _get_databases(
     return databases
 
 
+async def _rollback_one(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    namespaces = await _get_namespaces(ctx.conn)
+
+    cur_suffix = pg_common.versioned_schema("")
+    to_delete = [x for x in namespaces if x.endswith(cur_suffix)]
+
+    await _delete_schemas(ctx.conn, to_delete)
+
+
+async def _rollback_all(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    cluster = ctx.cluster
+    databases = await _get_databases(ctx)
+
+    for database in databases:
+        conn = bootstrap.PGConnectionProxy(
+            cluster, cluster.get_db_name(database))
+        try:
+            subctx = dataclasses.replace(ctx, conn=conn)
+
+            logger.info(f"Rolling back preparation of database '{database}'")
+            await _rollback_one(ctx=subctx)
+        finally:
+            conn.terminate()
+
+
 async def _upgrade_all(
     ctx: bootstrap.BootstrapContext,
 ) -> None:
@@ -471,7 +517,7 @@
             conn.terminate()
 
 
-async def _finish_all(
+async def _finalize_all(
     ctx: bootstrap.BootstrapContext,
 ) -> None:
     cluster = ctx.cluster
@@ -493,7 +539,7 @@ async def go(
             if database == inject_failure_on:
                 raise AssertionError(f'failure injected on {database}')
 
-            await _cleanup_one(subctx)
+            await _finalize_one(subctx)
             await conn.sql_execute(final_command)
         finally:
             conn.terminate()
@@ -529,11 +575,14 @@ async def inplace_upgrade(
         mode = await bootstrap._get_cluster_mode(ctx)
         ctx = dataclasses.replace(ctx, mode=mode)
 
+        if args.inplace_upgrade_rollback:
+            await _rollback_all(ctx)
+
         if args.inplace_upgrade_prepare:
             await _upgrade_all(ctx)
 
         if args.inplace_upgrade_finalize:
-            await _finish_all(ctx)
+            await _finalize_all(ctx)
     finally:
         pgconn.terminate()
diff --git a/edb/server/main.py b/edb/server/main.py
index e2b6e75b0c3..9b9309e703b 100644
@@ -605,6 +605,7 @@ async def run_server(
--- a/edb/server/main.py
+++ b/edb/server/main.py
     if (
         args.inplace_upgrade_prepare
         or args.inplace_upgrade_finalize
+        or args.inplace_upgrade_rollback
     ):
        from .
import inplace_upgrade
        await inplace_upgrade.inplace_upgrade(cluster, args)
diff --git a/tests/inplace-testing/test.sh b/tests/inplace-testing/test.sh
index ecab0cfa3b2..66c280c7fde 100755
--- a/tests/inplace-testing/test.sh
+++ b/tests/inplace-testing/test.sh
@@ -1,5 +1,22 @@
 #!/bin/bash -ex
 
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --rollback-and-test)
+            ROLLBACK=1
+            shift
+            ;;
+        --rollback-and-reapply)
+            REAPPLY=1
+            shift
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+
 DIR="$1"
 shift
 
@@ -44,6 +61,29 @@
 edb server --inplace-upgrade-prepare "$DIR"/upgrade.json --backend-dsn="$DSN"
 
 # Check the server is still working
 edgedb -H localhost -P $PORT --tls-security insecure -b select query 'select count(User)' | grep 2
 
+if [ "$ROLLBACK" = 1 ]; then
+    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"
+
+    # Roll back and then run the tests on the old database
+    kill $SPID
+    wait $SPID
+    SPID=
+    patch -R -f -p1 < tests/inplace-testing/upgrade.patch
+    make parsers
+    edb test --data-dir "$DIR" --use-data-dir-dbs -v "$@"
+    exit 0
+fi
+
+if [ "$REAPPLY" = 1 ]; then
+    # Roll back and then reapply
+    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"
+
+    edb server --inplace-upgrade-prepare "$DIR"/upgrade.json --backend-dsn="$DSN"
+fi
+
+# Check the server is still working
+edgedb -H localhost -P $PORT --tls-security insecure -b select query 'select count(User)' | grep 2
+
 # Kill the old version so we can finalize the upgrade
 kill $SPID
 wait $SPID
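
For reviewers trying this out, a minimal sketch of the operator flow these
flags compose into, assuming a prepared-but-not-finalized upgrade and a
reachable backend; the dump path and DSN value below are placeholders:

    # Prepare the upgrade into new-version ("edgedb*_v<N>") schemas
    # alongside the live ones; the running old-version server is
    # unaffected.
    edb server --inplace-upgrade-prepare upgrade.json --backend-dsn="$DSN"

    # To abandon the prepared upgrade, drop only the new-version
    # schemas, leaving the old server fully operational...
    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"

    # ...or, after stopping the old server, pivot the trampolines and
    # drop the old-version schemas.
    edb server --inplace-upgrade-finalize --backend-dsn="$DSN"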