From e7cb7dab87f66ebba1f24e1b7da338314ebcae72 Mon Sep 17 00:00:00 2001 From: "Michael J. Sullivan" Date: Fri, 6 Sep 2024 16:25:22 -0700 Subject: [PATCH] [Upgrades] Support rolling back a prepared upgrade This is basically a straightforward application of our existing code to delete schemas from *old* versions, applied to the new version instead. --- .github/workflows.src/tests-inplace.tpl.yml | 14 ++- .github/workflows/tests-inplace.yml | 14 ++- edb/server/args.py | 10 +- edb/server/inplace_upgrade.py | 111 ++++++++++++++------ edb/server/main.py | 1 + tests/inplace-testing/test.sh | 40 +++++++ 6 files changed, 155 insertions(+), 35 deletions(-) diff --git a/.github/workflows.src/tests-inplace.tpl.yml b/.github/workflows.src/tests-inplace.tpl.yml index 8cd1225dafd..66760848950 100644 --- a/.github/workflows.src/tests-inplace.tpl.yml +++ b/.github/workflows.src/tests-inplace.tpl.yml @@ -25,6 +25,18 @@ jobs: test: runs-on: ubuntu-latest needs: build + strategy: + fail-fast: false + matrix: + include: + - flags: + tests: + - flags: --rollback-and-test + tests: + # Do the reapply test on a smaller selection of tests, since + # it is slower. + - flags: --rollback-and-reapply + tests: -k test_link_on_target_delete -k test_edgeql_select -k test_dump steps: <<- restore_cache() >> @@ -33,7 +45,7 @@ jobs: # TODO: Would it be better to split this up into multiple jobs? - name: Test performing in-place upgrades run: | - ./tests/inplace-testing/test.sh vt + ./tests/inplace-testing/test.sh ${{ matrix.flags }} vt ${{ matrix.tests }} workflow-notifications: if: failure() && github.event_name != 'pull_request' diff --git a/.github/workflows/tests-inplace.yml b/.github/workflows/tests-inplace.yml index 5e1de19e637..de02e35e85c 100644 --- a/.github/workflows/tests-inplace.yml +++ b/.github/workflows/tests-inplace.yml @@ -330,6 +330,18 @@ jobs: test: runs-on: ubuntu-latest needs: build + strategy: + fail-fast: false + matrix: + include: + - flags: + tests: + - flags: --rollback-and-test + tests: + # Do the reapply test on a smaller selection of tests, since + # it is slower. + - flags: --rollback-and-reapply + tests: -k test_link_on_target_delete -k test_edgeql_select -k test_dump steps: - uses: actions/checkout@v4 @@ -474,7 +486,7 @@ jobs: # TODO: Would it be better to split this up into multiple jobs? - name: Test performing in-place upgrades run: | - ./tests/inplace-testing/test.sh vt + ./tests/inplace-testing/test.sh ${{ matrix.flags }} vt ${{ matrix.tests }} workflow-notifications: if: failure() && github.event_name != 'pull_request' diff --git a/edb/server/args.py b/edb/server/args.py index dea268be435..1d1c91874a5 100644 --- a/edb/server/args.py +++ b/edb/server/args.py @@ -229,6 +229,7 @@ class ServerConfig(NamedTuple): bootstrap_only: bool inplace_upgrade_prepare: Optional[pathlib.Path] inplace_upgrade_finalize: bool + inplace_upgrade_rollback: bool bootstrap_command: str bootstrap_command_file: pathlib.Path default_branch: Optional[str] @@ -679,12 +680,17 @@ def resolve_envvar_value(self, ctx: click.Context): click.option( '--inplace-upgrade-prepare', type=PathPath(), envvar="EDGEDB_SERVER_INPLACE_UPGRADE_PREPARE", - cls=EnvvarResolver, # XXX? 
+        cls=EnvvarResolver,
         help='try to do an in-place upgrade with the specified dump file'),
+    click.option(
+        '--inplace-upgrade-rollback', type=bool, is_flag=True,
+        envvar="EDGEDB_SERVER_INPLACE_UPGRADE_ROLLBACK",
+        cls=EnvvarResolver,
+        help='roll back a prepared upgrade'),
     click.option(
         '--inplace-upgrade-finalize', type=bool, is_flag=True,
         envvar="EDGEDB_SERVER_INPLACE_UPGRADE_FINALIZE",
-        cls=EnvvarResolver,  # XXX?
+        cls=EnvvarResolver,
         help='finalize an in-place upgrade'),
     click.option(
         '--default-branch', type=str,
diff --git a/edb/server/inplace_upgrade.py b/edb/server/inplace_upgrade.py
index 065372b0c40..08a8e55d5ac 100644
--- a/edb/server/inplace_upgrade.py
+++ b/edb/server/inplace_upgrade.py
@@ -21,6 +21,7 @@
 from typing import (
     Any,
     Optional,
+    Sequence,
 )
 
 import dataclasses
@@ -51,6 +52,7 @@
 from edb.server import defines as edbdef
 from edb.server import instdata
 from edb.server import pgcluster
+from edb.server import pgcon
 
 from edb.pgsql import common as pg_common
 from edb.pgsql import dbops
@@ -59,6 +61,8 @@
 
 logger = logging.getLogger('edb.server')
 
+PGCon = bootstrap.PGConnectionProxy | pgcon.PGConnection
+
 
 async def _load_schema(
     ctx: bootstrap.BootstrapContext,
     state: edbcompiler.CompilerState
@@ -343,35 +347,10 @@ async def _upgrade_one(
     '''
 
 
-async def _cleanup_one(
-    ctx: bootstrap.BootstrapContext,
+async def _delete_schemas(
+    conn: PGCon,
+    to_delete: Sequence[str],
 ) -> None:
-    conn = ctx.conn
-
-    # If the upgrade is already finalized, skip it. This lets us be
-    # resilient to crashes during the finalization process, which may
-    # leave some databases upgraded but not all.
-    if (await instdata.get_instdata(conn, 'upgrade_finalized', 'text')) == b'1':
-        logger.info(f"Database already pivoted")
-        return
-
-    trampoline_query = await instdata.get_instdata(
-        conn, 'trampoline_pivot_query', 'text')
-    fixup_query = await instdata.get_instdata(
-        conn, 'schema_fixup_query', 'text')
-
-    await conn.sql_execute(trampoline_query)
-    if fixup_query:
-        await conn.sql_execute(fixup_query)
-
-    namespaces = json.loads(await conn.sql_fetch_val("""
-        select json_agg(nspname) from pg_namespace
-        where nspname like 'edgedb%\\_v%'
-    """.encode('utf-8')))
-
-    cur_suffix = pg_common.versioned_schema("")
-    to_delete = [x for x in namespaces if not x.endswith(cur_suffix)]
-
     # To add a bit more safety, check whether there are any
     # dependencies on the modules we want to delete from outside those
     # modules since the only way to delete non-empty schemas in
@@ -404,6 +383,44 @@
         drop schema {', '.join(to_delete)} cascade
     """.encode('utf-8'))
 
+
+async def _get_namespaces(
+    conn: PGCon,
+) -> list[str]:
+    return json.loads(await conn.sql_fetch_val("""
+        select json_agg(nspname) from pg_namespace
+        where nspname like 'edgedb%\\_v%'
+    """.encode('utf-8')))
+
+
+async def _finalize_one(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    conn = ctx.conn
+
+    # If the upgrade is already finalized, skip it. This lets us be
+    # resilient to crashes during the finalization process, which may
+    # leave some databases upgraded but not all.
+    if (await instdata.get_instdata(conn, 'upgrade_finalized', 'text')) == b'1':
+        logger.info("Database upgrade already finalized")
+        return
+
+    trampoline_query = await instdata.get_instdata(
+        conn, 'trampoline_pivot_query', 'text')
+    fixup_query = await instdata.get_instdata(
+        conn, 'schema_fixup_query', 'text')
+
+    await conn.sql_execute(trampoline_query)
+    if fixup_query:
+        await conn.sql_execute(fixup_query)
+
+    namespaces = await _get_namespaces(conn)
+
+    cur_suffix = pg_common.versioned_schema("")
+    to_delete = [x for x in namespaces if not x.endswith(cur_suffix)]
+
+    await _delete_schemas(conn, to_delete)
+
     await bootstrap._store_static_text_cache(
         ctx,
         f'upgrade_finalized',
@@ -437,6 +454,35 @@ async def _get_databases(
     return databases
 
 
+async def _rollback_one(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    namespaces = await _get_namespaces(ctx.conn)
+
+    cur_suffix = pg_common.versioned_schema("")
+    to_delete = [x for x in namespaces if x.endswith(cur_suffix)]
+
+    await _delete_schemas(ctx.conn, to_delete)
+
+
+async def _rollback_all(
+    ctx: bootstrap.BootstrapContext,
+) -> None:
+    cluster = ctx.cluster
+    databases = await _get_databases(ctx)
+
+    for database in databases:
+        conn = bootstrap.PGConnectionProxy(
+            cluster, cluster.get_db_name(database))
+        try:
+            subctx = dataclasses.replace(ctx, conn=conn)
+
+            logger.info(f"Rolling back preparation of database '{database}'")
+            await _rollback_one(ctx=subctx)
+        finally:
+            conn.terminate()
+
+
 async def _upgrade_all(
     ctx: bootstrap.BootstrapContext,
 ) -> None:
@@ -471,7 +517,7 @@
             conn.terminate()
 
 
-async def _finish_all(
+async def _finalize_all(
     ctx: bootstrap.BootstrapContext,
 ) -> None:
     cluster = ctx.cluster
@@ -493,7 +539,7 @@ async def go(
             if database == inject_failure_on:
                 raise AssertionError(f'failure injected on {database}')
 
-            await _cleanup_one(subctx)
+            await _finalize_one(subctx)
             await conn.sql_execute(final_command)
         finally:
             conn.terminate()
@@ -529,11 +575,14 @@ async def inplace_upgrade(
         mode = await bootstrap._get_cluster_mode(ctx)
         ctx = dataclasses.replace(ctx, mode=mode)
 
+        if args.inplace_upgrade_rollback:
+            await _rollback_all(ctx)
+
         if args.inplace_upgrade_prepare:
             await _upgrade_all(ctx)
 
         if args.inplace_upgrade_finalize:
-            await _finish_all(ctx)
+            await _finalize_all(ctx)
     finally:
         pgconn.terminate()
diff --git a/edb/server/main.py b/edb/server/main.py
index e2b6e75b0c3..9b9309e703b 100644
@@ -605,6 +605,7 @@ async def run_server(
--- a/edb/server/main.py
+++ b/edb/server/main.py
     if (
         args.inplace_upgrade_prepare
         or args.inplace_upgrade_finalize
+        or args.inplace_upgrade_rollback
     ):
        from .
import inplace_upgrade
        await inplace_upgrade.inplace_upgrade(cluster, args)
diff --git a/tests/inplace-testing/test.sh b/tests/inplace-testing/test.sh
index ecab0cfa3b2..66c280c7fde 100755
--- a/tests/inplace-testing/test.sh
+++ b/tests/inplace-testing/test.sh
@@ -1,5 +1,22 @@
 #!/bin/bash -ex
 
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --rollback-and-test)
+            ROLLBACK=1
+            shift
+            ;;
+        --rollback-and-reapply)
+            REAPPLY=1
+            shift
+            ;;
+        *)
+            break
+            ;;
+    esac
+done
+
+
 DIR="$1"
 shift
 
@@ -44,6 +61,29 @@
 edb server --inplace-upgrade-prepare "$DIR"/upgrade.json --backend-dsn="$DSN"
 
 # Check the server is still working
 edgedb -H localhost -P $PORT --tls-security insecure -b select query 'select count(User)' | grep 2
 
+if [ "$ROLLBACK" = 1 ]; then
+    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"
+
+    # Roll back and then run the tests on the old database
+    kill $SPID
+    wait $SPID
+    SPID=
+    patch -R -f -p1 < tests/inplace-testing/upgrade.patch
+    make parsers
+    edb test --data-dir "$DIR" --use-data-dir-dbs -v "$@"
+    exit 0
+fi
+
+if [ "$REAPPLY" = 1 ]; then
+    # Roll back and then reapply
+    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"
+
+    edb server --inplace-upgrade-prepare "$DIR"/upgrade.json --backend-dsn="$DSN"
+fi
+
+# Check the server is still working
+edgedb -H localhost -P $PORT --tls-security insecure -b select query 'select count(User)' | grep 2
+
 # Kill the old version so we can finalize the upgrade
 kill $SPID
 wait $SPID
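
For reviewers trying this out, a minimal sketch of the operator flow these
flags compose into, assuming a prepared-but-not-finalized upgrade and a
reachable backend; the dump path and DSN value below are placeholders:

    # Prepare the upgrade into new-version ("edgedb*_v<N>") schemas
    # alongside the live ones; the running old-version server is
    # unaffected.
    edb server --inplace-upgrade-prepare upgrade.json --backend-dsn="$DSN"

    # To abandon the prepared upgrade, drop only the new-version
    # schemas, leaving the old server fully operational...
    edb server --inplace-upgrade-rollback --backend-dsn="$DSN"

    # ...or, after stopping the old server, pivot the trampolines and
    # drop the old-version schemas.
    edb server --inplace-upgrade-finalize --backend-dsn="$DSN"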