Skip to content

Commit

Permalink
[Upgrades] Make upgrade finalization failure tolerant
Browse files Browse the repository at this point in the history
Two parts to this:

1. Test all of the pivots inside rolled-back transactions before
   applying. This ensures that if there is a bug in the pivot
   that prevents a database from being upgraded, we fail before
   any irreversible change is made to any database.
2. Track which databases have been finalized and skip them
   when finalizing. This lets us be resilent to crashes, since
   the finalization can be retried.

Progress on #6697.
  • Loading branch information
msullivan committed Sep 6, 2024
1 parent bed0b65 commit f6e350d
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 16 deletions.
66 changes: 50 additions & 16 deletions edb/server/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -2946,9 +2946,15 @@ async def _upgrade_one(
async def _cleanup_one(
ctx: BootstrapContext,
) -> None:
# TODO: Handle it already being committed
conn = ctx.conn

# If the upgrade is already finalized, skip it. This lets us be
# resilient to crashes during the finalization process, which may
# leave some databases upgraded but not all.
if (await instdata.get_instdata(conn, 'upgrade_finalized', 'text')) == b'1':
logger.info(f"Database already pivoted")
return

trampoline_query = await instdata.get_instdata(
conn, 'trampoline_pivot_query', 'text')
fixup_query = await instdata.get_instdata(
Expand Down Expand Up @@ -2998,6 +3004,12 @@ async def _cleanup_one(
drop schema {', '.join(to_delete)} cascade
""".encode('utf-8'))

await _store_static_text_cache(
ctx,
f'upgrade_finalized',
'1',
)


async def _get_databases(
ctx: BootstrapContext,
Expand Down Expand Up @@ -3062,23 +3074,45 @@ async def _finish_all(
ctx: BootstrapContext,
) -> None:
cluster = ctx.cluster

databases = await _get_databases(ctx)

for database in databases:
conn = await cluster.connect(database=cluster.get_db_name(database))
try:
subctx = dataclasses.replace(ctx, conn=conn)

logger.info(f"Pivoting database '{database}'")
# TODO: Try running each cleanup in a transaction to test
# that they all work, before applying them for real. (We
# would *like* to just run them all in open transactions
# and then commit them all, but we run into memory
# concerns.)
await _cleanup_one(subctx)
finally:
conn.terminate()
async def go(
message: str,
final_command: bytes,
inject_failure_on: Optional[str]=None,
) -> None:
for database in databases:
conn = await cluster.connect(database=cluster.get_db_name(database))
try:
subctx = dataclasses.replace(ctx, conn=conn)

logger.info(f"{message} database '{database}'")
await conn.sql_execute(b'START TRANSACTION')
# DEBUG HOOK: Inject a failure if specified
if database == inject_failure_on:
raise AssertionError(f'failure injected on {database}')

await _cleanup_one(subctx)
await conn.sql_execute(final_command)
finally:
conn.terminate()

inject_failure = os.environ.get('EDGEDB_UPGRADE_FINALIZE_ERROR_INJECTION')

# Test all of the pivots in transactions we rollback, to make sure
# that they work. This ensures that if there is a bug in the pivot
# scripts on some database, we fail before any irreversible
# changes are made to any database.
#
# *Then*, apply them all for real. They may fail
# when applying for real, but that should be due to a crash or
# some such, and so the user should be able to retry.
#
# We wanted to apply them all inside transactions and then commit
# the transactions, but that requires holding open potentially too
# many connections.
await go("Testing pivot of", b'ROLLBACK')
await go("Pivoting", b'COMMIT', inject_failure)


async def ensure_bootstrapped(
Expand Down
6 changes: 6 additions & 0 deletions tests/inplace-testing/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ SPID=

tar cf "$DIR"-prepped.tar "$DIR"

# Try to finalize the upgrade, but inject a failure
if EDGEDB_UPGRADE_FINALIZE_ERROR_INJECTION=main edb server --bootstrap-only --inplace-upgrade-finalize --data-dir "$DIR"; then
echo Unexpected upgrade success despite failure injection
exit 4
fi

# Finalize the upgrade
edb server --bootstrap-only --inplace-upgrade-finalize --data-dir "$DIR"
tar cf "$DIR"-cooked.tar "$DIR"
Expand Down

0 comments on commit f6e350d

Please sign in to comment.