From 505400a19cc6f54a34189726b44a00a02bccf83e Mon Sep 17 00:00:00 2001
From: "vitess-bot[bot]" <108069721+vitess-bot[bot]@users.noreply.github.com>
Date: Tue, 29 Oct 2024 13:05:55 -0600
Subject: [PATCH] Cherry-pick 690bb79db5fc12393d861c68d876fedeea745372 with
conflicts
---
changelog/21.0/21.0.0/release_notes.md | 263 ++++++++++++++++++++
changelog/21.0/21.0.0/summary.md | 254 +++++++++++++++++++
go/vt/mysqlctl/backup_blackbox_race_test.go | 152 +++++++++++
go/vt/mysqlctl/builtinbackupengine.go | 5 +-
4 files changed, 671 insertions(+), 3 deletions(-)
create mode 100644 changelog/21.0/21.0.0/release_notes.md
create mode 100644 changelog/21.0/21.0.0/summary.md
create mode 100644 go/vt/mysqlctl/backup_blackbox_race_test.go
diff --git a/changelog/21.0/21.0.0/release_notes.md b/changelog/21.0/21.0.0/release_notes.md
new file mode 100644
index 00000000000..6fe295fbb3b
--- /dev/null
+++ b/changelog/21.0/21.0.0/release_notes.md
@@ -0,0 +1,263 @@
+# Release of Vitess v21.0.0
+## Summary
+
+### Table of Contents
+
+- **[Known Issues](#known-issues)**
+ - **[Backup reports itself as successful despite failures](#backup-reports-as-successful)**
+- **[Major Changes](#major-changes)**
+ - **[Deprecations and Deletions](#deprecations-and-deletions)**
+ - [Deprecated VTTablet Flags](#vttablet-flags)
+ - [Deletion of deprecated metrics](#metric-deletion)
+ - [Deprecated Metrics](#deprecations-metrics)
+ - **[Traffic Mirroring](#traffic-mirroring)**
+ - **[Atomic Distributed Transaction Support](#atomic-transaction)**
+ - **[New VTGate Shutdown Behavior](#new-vtgate-shutdown-behavior)**
+ - **[Tablet Throttler: Multi-Metric support](#tablet-throttler)**
+ - **[Allow Cross Cell Promotion in PRS](#allow-cross-cell)**
+ - **[Support for recursive CTEs](#recursive-cte)**
+ - **[VTGate Tablet Balancer](#tablet-balancer)**
+ - **[Query Timeout Override](#query-timeout)**
+ - **[New Backup Engine](#new-backup-engine)**
+ - **[Dynamic VReplication Configuration](#dynamic-vreplication-configuration)**
+ - **[Reference Table Materialization](#reference-table-materialization)**
+ - **[New VEXPLAIN Modes: TRACE and KEYS](#new-vexplain-modes)**
+ - **[Automatically Replace MySQL auto_increment Clauses with Vitess Sequences](#auto-replace-mysql-autoinc-with-seq)**
+ - **[Experimental MySQL 8.4 support](#experimental-mysql-84)**
+ - **[Current Errant GTIDs Count Metric](#errant-gtid-metric)**
+ - **[vtctldclient ChangeTabletTags](#vtctldclient-changetablettags)**
+ - **[Support for specifying expected primary in reparents](#reparents-expectedprimary)**
+
+## Known Issues
+
+### Backup reports itself as successful despite failures
+
+In this release, we have identified an issue where a backup may succeed even if one of the underlying files fails to be backed up.
+The underlying errors are ignored and the backup action reports success.
+This issue exists only with the `builtin` backup engine, and it can occur only when the engine has already started backing up all files.
+Please refer to https://github.com/vitessio/vitess/issues/17063 for more details.
+
+## Major Changes
+
+### Deprecations and Deletions
+
+#### Deprecated VTTablet Flags
+
+- `queryserver-enable-settings-pool` flag, added in `v15`, has been on by default since `v17`.
+ It is now deprecated and will be removed in a future release.
+
+#### Deletion of deprecated metrics
+
+The following VTOrc metrics were deprecated in `v20`. They have now been deleted.
+
+| Metric Name |
+|:--------------------------------------------:|
+| `analysis.change.write` |
+| `audit.write` |
+| `discoveries.attempt` |
+| `discoveries.fail` |
+| `discoveries.instance_poll_seconds_exceeded` |
+| `discoveries.queue_length` |
+| `discoveries.recent_count` |
+| `instance.read` |
+| `instance.read_topology` |
+| `emergency_reparent_counts` |
+| `planned_reparent_counts` |
+| `reparent_shard_operation_timings` |
+
+#### Deprecated Metrics
+
+The following metrics are now deprecated and will be deleted in a future release. Please use their replacements instead.
+
+| Component | Metric Name | Replaced By |
+|------------|:---------------------:|:-------------------------------:|
+| `vttablet` | `QueryCacheLength` | `QueryEnginePlanCacheLength` |
+| `vttablet` | `QueryCacheSize` | `QueryEnginePlanCacheSize` |
+| `vttablet` | `QueryCacheCapacity` | `QueryEnginePlanCacheCapacity` |
+| `vttablet` | `QueryCacheEvictions` | `QueryEnginePlanCacheEvictions` |
+| `vttablet` | `QueryCacheHits` | `QueryEnginePlanCacheHits` |
+| `vttablet` | `QueryCacheMisses` | `QueryEnginePlanCacheMisses` |
+
+### Traffic Mirroring
+
+Traffic mirroring is intended to help reduce some of the uncertainty inherent to `MoveTables SwitchTraffic`. When
+traffic mirroring is enabled, VTGate will mirror a percentage of traffic from one keyspace to another.
+
+Mirror rules may be enabled through `vtctldclient` with `MoveTables MirrorTraffic`. For example:
+
+```bash
+$ vtctldclient --server :15999 MoveTables --target-keyspace customer --workflow commerce2customer MirrorTraffic --percent 5.0
+```
+
+Mirror rules can be inspected with `GetMirrorRules`.
+
+### Atomic Distributed Transaction Support
+
+We have introduced atomic distributed transactions as an experimental feature.
+Users can now run multi-shard transactions with stronger guarantees.
+Vitess now provides two modes of transactional guarantees for multi-shard transactions: Best Effort and Atomic.
+These can be selected based on the user’s requirements and the trade-offs they are willing to make.
+
+Follow the documentation to enable [Atomic Distributed Transaction](https://vitess.io/docs/21.0/reference/features/distributed-transaction/)
+
+For more details on the implementation and trade-offs, please refer to the [RFC](https://github.com/vitessio/vitess/issues/16245)
+
+### New VTGate Shutdown Behavior
+
+We added a new option to VTGate to disallow new connections while VTGate is shutting down,
+while allowing existing connections to finish their work until they manually disconnect or until
+the `--onterm_timeout` is reached, without getting a `Server shutdown in progress` error.
+
+This new behavior can be enabled by specifying the new `--mysql-server-drain-onterm` flag to VTGate.
+
+You can find more information about this option in the [RFC](https://github.com/vitessio/vitess/issues/15971).
+
+### Tablet Throttler: Multi-Metric support
+
+Up until `v20`, the tablet throttler would only monitor and use a single metric. That would be replication lag, by
+default, or could be the result of a custom query. In this release, we introduce a major redesign so that the throttler
+monitors and uses multiple metrics at the same time, including the above two.
+
+The default behavior now is to monitor all metrics, but only use `lag` (if the custom query is undefined) or the `custom`
+metric (if the custom query is defined). This is backwards-compatible with `v20`. A `v20` `PRIMARY` is compatible with
+a `v21` `REPLICA`, and a `v21` `PRIMARY` is compatible with a `v20` `REPLICA`.
+
+However, it is now possible to assign any combination of one or more metrics for a given app. The throttler
+would then accept or reject the app's requests based on the health of _all_ assigned metrics. We have provided a pre-defined
+list of metrics:
+
+- `lag`: replication lag based on heartbeat injection.
+- `threads_running`: concurrent active threads on the MySQL server.
+- `loadavg`: per core load average measured on the tablet instance/pod.
+- `custom`: the result of a custom query executed on the MySQL server.
+
+Each metric has a default threshold which can be overridden by the `UpdateThrottlerConfig` command.
+
+The throttler also supports the catch-all `"all"` app name, and it is thus possible to assign metrics to **all** apps.
+Explicit app to metric assignments will override the catch-all configuration.
+
+Metrics are assigned a default _scope_, which could be `self` (isolated to the tablet) or `shard` (max, aka **worst**
+value among shard tablets). It is further possible to require a different scope for each metric.
+
+### Allow Cross Cell Promotion in PRS
+
+Up until now, if users wanted to promote a replica in a different cell from the current primary
+using `PlannedReparentShard`, they had to specify the new primary with the `--new-primary` flag.
+
+We have now added a new flag `--allow-cross-cell-promotion` that lets `PlannedReparentShard` choose a primary in a
+different cell even if no new primary is provided explicitly.
+
+### Experimental support for recursive CTEs
+
+We have added experimental support for recursive CTEs in Vitess. We are marking it as experimental because it is not yet
+fully tested and may have some limitations. We are looking for feedback from the community to improve this feature.
+
+### VTGate Tablet Balancer
+
+When a VTGate routes a query and has multiple available tablets for a given shard / tablet type (e.g. REPLICA), the
+current default behavior routes the query with local cell affinity and round robin policy. The VTGate Tablet Balancer
+provides an alternate mechanism that routes queries to maintain an even distribution of query load to each tablet, while
+preferentially routing to tablets in the same cell as the VTGate.
+
+The tablet balancer is enabled by a new flag `--enable-balancer` and configured by `--balancer-vtgate-cells`
+and `--balancer-keyspaces`.
+
+See the [RFC](https://github.com/vitessio/vitess/issues/12241) for more details on the design and configuration of this feature.
+
+### Query Timeout Override
+
+VTGate sends an authoritative query timeout to VTTablet when the `QUERY_TIMEOUT_MS` comment directive,
+`query_timeout` session system variable, or `query-timeout` flag is set.
+The order of precedence is: comment directive > session variable > VTGate flag.
+VTTablet overrides its default query timeout with the value received from VTGate.
+All timeouts are specified in milliseconds.
+
+When a query is executed inside a transaction, there is an additional nuance. The actual timeout used will be the smaller
+of the transaction timeout and the query timeout.
+
+A query can also be set to have no timeout by using the `QUERY_TIMEOUT_MS` comment directive with a value of `0`.
+
+Example usage:
+`select /*vt+ QUERY_TIMEOUT_MS=30 */ col from tbl`
+
+### New Backup Engine (EXPERIMENTAL)
+
+We are introducing a new backup engine for logical backups in order to support use cases that require something other
+than physical backups. This feature is experimental and is based on [MySQL Shell](https://dev.mysql.com/doc/mysql-shell/8.0/en/).
+
+The new engine is enabled by using `--backup_engine_implementation=mysqlshell`. There are other options that are required,
+so please read the [documentation](https://vitess.io/docs/21.0/user-guides/operating-vitess/backup-and-restore/creating-a-backup/) to learn which options are required and how to configure them.
+
+### Dynamic VReplication Configuration
+
+Previously, many of the configuration options for VReplication Workflows had to be provided using VTTablet flags. This
+meant that any change to VReplication configuration required restarting VTTablets. We now allow these to be overridden
+while creating a workflow or dynamically after the workflow is already in progress.
+
+### Reference Table Materialization
+
+There is a new option in [`Materialize` workflows](https://vitess.io/docs/reference/vreplication/materialize/) to keep a synced copy of [reference or lookup tables](https://vitess.io/docs/reference/vreplication/reference_tables/)
+(countries, states, zip codes, etc) from an unsharded keyspace, which holds the source of truth for the reference
+table, to all shards in a sharded keyspace.
+
+### New VEXPLAIN Modes: TRACE and KEYS
+
+#### VEXPLAIN TRACE
+
+The new `TRACE` mode for `VEXPLAIN` provides a detailed execution trace of queries, showing how they're processed through various
+operators and interactions with tablets. This mode is particularly useful for:
+
+- Identifying performance bottlenecks
+- Understanding query execution patterns
+- Optimizing complex queries
+- Debugging unexpected query behavior
+
+`TRACE` mode runs the query and logs all interactions, returning a JSON representation of the query execution plan with additional
+statistics like number of calls, average rows processed, and number of shards queried.
+
+#### VEXPLAIN KEYS
+
+The `KEYS` mode for `VEXPLAIN` offers a concise summary of query structure, highlighting columns used in joins, filters, and
+grouping operations. This information is crucial for:
+
+- Identifying potential sharding key candidates
+- Optimizing query performance
+- Analyzing query patterns to inform database design decisions
+
+`KEYS` mode analyzes the query structure without executing it, providing JSON output that includes grouping columns, join columns,
+filter columns (potential candidates for indexes, primary keys, or sharding keys), and the statement type.
+
+These new `VEXPLAIN` modes enhance Vitess's query analysis capabilities, allowing for more informed decisions about sharding
+strategies and query optimization.
+
+### Automatically Replace MySQL auto_increment Clauses with Vitess Sequences
+
+In https://github.com/vitessio/vitess/pull/16860 we added support for replacing MySQL `auto_increment` clauses with [Vitess Sequences](https://vitess.io/docs/reference/features/vitess-sequences/), performing all of the setup and initialization
+work automatically during the [`MoveTables`](https://vitess.io/docs/reference/vreplication/movetables/) workflow. As part of that work we have deprecated the
+[`--remove-sharded-auto-increment` boolean flag](https://vitess.io/docs/20.0/reference/programs/vtctldclient/vtctldclient_movetables/vtctldclient_movetables_create/) and you should begin using the new
+[`--sharded-auto-increment-handling` flag](https://vitess.io/docs/21.0/reference/programs/vtctldclient/vtctldclient_movetables/vtctldclient_movetables_create/) instead. Please see the new
+[`MoveTables` Auto Increment Handling](https://vitess.io/docs/21.0/reference/vreplication/movetables/#auto-increment-handling) documentation for additional details.
+
+### Experimental MySQL 8.4 support
+
+We have added experimental support for MySQL 8.4. It passes the Vitess test suite, but it is otherwise not yet tested. We are looking for feedback from the community so that we can improve it and move support out of the experimental phase in a future release.
+
+### Current Errant GTIDs Count Metric
+A new metric called `CurrentErrantGTIDCount` has been added to the `VTOrc` component.
+This metric shows the current count of the errant GTIDs in the tablets.
+
+### `vtctldclient ChangeTabletTags` command
+
+The `vtctldclient` command `ChangeTabletTags` was added to allow the tags of a tablet to be changed dynamically.
+
+### Support specifying expected primary in reparents
+
+The `EmergencyReparentShard` and `PlannedReparentShard` commands and RPCs now support specifying a primary we expect to still be the current primary in order for a reparent operation to be processed. This allows reparents to be conditional on a specific state being true.
+
+------------
+The entire changelog for this release can be found [here](https://github.com/vitessio/vitess/blob/main/changelog/21.0/21.0.0/changelog.md).
+
+The release includes 364 merged Pull Requests.
+
+Thanks to all our contributors: @GrahamCampbell, @GuptaManan100, @Utkar5hM, @anshikavashistha, @app/dependabot, @app/vitess-bot, @arthurschreiber, @beingnoble03, @brendar, @cameronmccord2, @chrism1001, @cuishuang, @dbussink, @deepthi, @demmer, @frouioui, @harshit-gangal, @harshitasao, @icyflame, @kirtanchandak, @mattlord, @mattrobenolt, @maxenglander, @mcrauwel, @notfelineit, @perminov, @rafer, @rohit-nayak-ps, @runewake2, @rvrangel, @shanth96, @shlomi-noach, @systay, @timvaillancourt, @vitess-bot
+
diff --git a/changelog/21.0/21.0.0/summary.md b/changelog/21.0/21.0.0/summary.md
new file mode 100644
index 00000000000..512aa45a12f
--- /dev/null
+++ b/changelog/21.0/21.0.0/summary.md
@@ -0,0 +1,254 @@
+## Summary
+
+### Table of Contents
+
+- **[Known Issues](#known-issues)**
+ - **[Backup reports itself as successful despite failures](#backup-reports-as-successful)**
+- **[Major Changes](#major-changes)**
+ - **[Deprecations and Deletions](#deprecations-and-deletions)**
+ - [Deprecated VTTablet Flags](#vttablet-flags)
+ - [Deletion of deprecated metrics](#metric-deletion)
+ - [Deprecated Metrics](#deprecations-metrics)
+ - **[Traffic Mirroring](#traffic-mirroring)**
+ - **[Atomic Distributed Transaction Support](#atomic-transaction)**
+ - **[New VTGate Shutdown Behavior](#new-vtgate-shutdown-behavior)**
+ - **[Tablet Throttler: Multi-Metric support](#tablet-throttler)**
+ - **[Allow Cross Cell Promotion in PRS](#allow-cross-cell)**
+ - **[Support for recursive CTEs](#recursive-cte)**
+ - **[VTGate Tablet Balancer](#tablet-balancer)**
+ - **[Query Timeout Override](#query-timeout)**
+ - **[New Backup Engine](#new-backup-engine)**
+ - **[Dynamic VReplication Configuration](#dynamic-vreplication-configuration)**
+ - **[Reference Table Materialization](#reference-table-materialization)**
+ - **[New VEXPLAIN Modes: TRACE and KEYS](#new-vexplain-modes)**
+ - **[Automatically Replace MySQL auto_increment Clauses with Vitess Sequences](#auto-replace-mysql-autoinc-with-seq)**
+ - **[Experimental MySQL 8.4 support](#experimental-mysql-84)**
+ - **[Current Errant GTIDs Count Metric](#errant-gtid-metric)**
+ - **[vtctldclient ChangeTabletTags](#vtctldclient-changetablettags)**
+ - **[Support for specifying expected primary in reparents](#reparents-expectedprimary)**
+
+## Known Issues
+
+### Backup reports itself as successful despite failures
+
+In this release, we have identified an issue where a backup may succeed even if one of the underlying files fails to be backed up.
+The underlying errors are ignored and the backup action reports success.
+This issue exists only with the `builtin` backup engine, and it can occur only when the engine has already started backing up all files.
+Please refer to https://github.com/vitessio/vitess/issues/17063 for more details.
+
+## Major Changes
+
+### Deprecations and Deletions
+
+#### Deprecated VTTablet Flags
+
+- `queryserver-enable-settings-pool` flag, added in `v15`, has been on by default since `v17`.
+ It is now deprecated and will be removed in a future release.
+
+#### Deletion of deprecated metrics
+
+The following VTOrc metrics were deprecated in `v20`. They have now been deleted.
+
+| Metric Name |
+|:--------------------------------------------:|
+| `analysis.change.write` |
+| `audit.write` |
+| `discoveries.attempt` |
+| `discoveries.fail` |
+| `discoveries.instance_poll_seconds_exceeded` |
+| `discoveries.queue_length` |
+| `discoveries.recent_count` |
+| `instance.read` |
+| `instance.read_topology` |
+| `emergency_reparent_counts` |
+| `planned_reparent_counts` |
+| `reparent_shard_operation_timings` |
+
+#### Deprecated Metrics
+
+The following metrics are now deprecated and will be deleted in a future release. Please use their replacements instead.
+
+| Component | Metric Name | Replaced By |
+|------------|:---------------------:|:-------------------------------:|
+| `vttablet` | `QueryCacheLength` | `QueryEnginePlanCacheLength` |
+| `vttablet` | `QueryCacheSize` | `QueryEnginePlanCacheSize` |
+| `vttablet` | `QueryCacheCapacity` | `QueryEnginePlanCacheCapacity` |
+| `vttablet` | `QueryCacheEvictions` | `QueryEnginePlanCacheEvictions` |
+| `vttablet` | `QueryCacheHits` | `QueryEnginePlanCacheHits` |
+| `vttablet` | `QueryCacheMisses` | `QueryEnginePlanCacheMisses` |
+
+### Traffic Mirroring
+
+Traffic mirroring is intended to help reduce some of the uncertainty inherent to `MoveTables SwitchTraffic`. When
+traffic mirroring is enabled, VTGate will mirror a percentage of traffic from one keyspace to another.
+
+Mirror rules may be enabled through `vtctldclient` with `MoveTables MirrorTraffic`. For example:
+
+```bash
+$ vtctldclient --server :15999 MoveTables --target-keyspace customer --workflow commerce2customer MirrorTraffic --percent 5.0
+```
+
+Mirror rules can be inspected with `GetMirrorRules`.
+
+### Atomic Distributed Transaction Support
+
+We have introduced atomic distributed transactions as an experimental feature.
+Users can now run multi-shard transactions with stronger guarantees.
+Vitess now provides two modes of transactional guarantees for multi-shard transactions: Best Effort and Atomic.
+These can be selected based on the user’s requirements and the trade-offs they are willing to make.
+
+Follow the documentation to enable [Atomic Distributed Transaction](https://vitess.io/docs/21.0/reference/features/distributed-transaction/)
+
+For more details on the implementation and trade-offs, please refer to the [RFC](https://github.com/vitessio/vitess/issues/16245)
+
+### New VTGate Shutdown Behavior
+
+We added a new option to VTGate to disallow new connections while VTGate is shutting down,
+while allowing existing connections to finish their work until they manually disconnect or until
+the `--onterm_timeout` is reached, without getting a `Server shutdown in progress` error.
+
+This new behavior can be enabled by specifying the new `--mysql-server-drain-onterm` flag to VTGate.
+
+You can find more information about this option in the [RFC](https://github.com/vitessio/vitess/issues/15971).
+
+### Tablet Throttler: Multi-Metric support
+
+Up until `v20`, the tablet throttler would only monitor and use a single metric. That would be replication lag, by
+default, or could be the result of a custom query. In this release, we introduce a major redesign so that the throttler
+monitors and uses multiple metrics at the same time, including the above two.
+
+The default behavior now is to monitor all metrics, but only use `lag` (if the custom query is undefined) or the `custom`
+metric (if the custom query is defined). This is backwards-compatible with `v20`. A `v20` `PRIMARY` is compatible with
+a `v21` `REPLICA`, and a `v21` `PRIMARY` is compatible with a `v20` `REPLICA`.
+
+However, it is now possible to assign any combination of one or more metrics for a given app. The throttler
+would then accept or reject the app's requests based on the health of _all_ assigned metrics. We have provided a pre-defined
+list of metrics:
+
+- `lag`: replication lag based on heartbeat injection.
+- `threads_running`: concurrent active threads on the MySQL server.
+- `loadavg`: per core load average measured on the tablet instance/pod.
+- `custom`: the result of a custom query executed on the MySQL server.
+
+Each metric has a default threshold which can be overridden by the `UpdateThrottlerConfig` command.
+
+The throttler also supports the catch-all `"all"` app name, and it is thus possible to assign metrics to **all** apps.
+Explicit app to metric assignments will override the catch-all configuration.
+
+Metrics are assigned a default _scope_, which could be `self` (isolated to the tablet) or `shard` (max, aka **worst**
+value among shard tablets). It is further possible to require a different scope for each metric.
+
+### Allow Cross Cell Promotion in PRS
+
+Up until now, if users wanted to promote a replica in a different cell from the current primary
+using `PlannedReparentShard`, they had to specify the new primary with the `--new-primary` flag.
+
+We have now added a new flag `--allow-cross-cell-promotion` that lets `PlannedReparentShard` choose a primary in a
+different cell even if no new primary is provided explicitly.
+
+### Experimental support for recursive CTEs
+
+We have added experimental support for recursive CTEs in Vitess. We are marking it as experimental because it is not yet
+fully tested and may have some limitations. We are looking for feedback from the community to improve this feature.
+
+### VTGate Tablet Balancer
+
+When a VTGate routes a query and has multiple available tablets for a given shard / tablet type (e.g. REPLICA), the
+current default behavior routes the query with local cell affinity and round robin policy. The VTGate Tablet Balancer
+provides an alternate mechanism that routes queries to maintain an even distribution of query load to each tablet, while
+preferentially routing to tablets in the same cell as the VTGate.
+
+The tablet balancer is enabled by a new flag `--enable-balancer` and configured by `--balancer-vtgate-cells`
+and `--balancer-keyspaces`.
+
+See the [RFC](https://github.com/vitessio/vitess/issues/12241) for more details on the design and configuration of this feature.
+
+### Query Timeout Override
+
+VTGate sends an authoritative query timeout to VTTablet when the `QUERY_TIMEOUT_MS` comment directive,
+`query_timeout` session system variable, or `query-timeout` flag is set.
+The order of precedence is: comment directive > session variable > VTGate flag.
+VTTablet overrides its default query timeout with the value received from VTGate.
+All timeouts are specified in milliseconds.
+
+When a query is executed inside a transaction, there is an additional nuance. The actual timeout used will be the smaller
+of the transaction timeout and the query timeout.
+
+A query can also be set to have no timeout by using the `QUERY_TIMEOUT_MS` comment directive with a value of `0`.
+
+Example usage:
+`select /*vt+ QUERY_TIMEOUT_MS=30 */ col from tbl`
+
+### New Backup Engine (EXPERIMENTAL)
+
+We are introducing a new backup engine for logical backups in order to support use cases that require something other
+than physical backups. This feature is experimental and is based on [MySQL Shell](https://dev.mysql.com/doc/mysql-shell/8.0/en/).
+
+The new engine is enabled by using `--backup_engine_implementation=mysqlshell`. There are other options that are required,
+so please read the [documentation](https://vitess.io/docs/21.0/user-guides/operating-vitess/backup-and-restore/creating-a-backup/) to learn which options are required and how to configure them.
+
+### Dynamic VReplication Configuration
+
+Previously, many of the configuration options for VReplication Workflows had to be provided using VTTablet flags. This
+meant that any change to VReplication configuration required restarting VTTablets. We now allow these to be overridden
+while creating a workflow or dynamically after the workflow is already in progress.
+
+### Reference Table Materialization
+
+There is a new option in [`Materialize` workflows](https://vitess.io/docs/reference/vreplication/materialize/) to keep a synced copy of [reference or lookup tables](https://vitess.io/docs/reference/vreplication/reference_tables/)
+(countries, states, zip codes, etc) from an unsharded keyspace, which holds the source of truth for the reference
+table, to all shards in a sharded keyspace.
+
+### New VEXPLAIN Modes: TRACE and KEYS
+
+#### VEXPLAIN TRACE
+
+The new `TRACE` mode for `VEXPLAIN` provides a detailed execution trace of queries, showing how they're processed through various
+operators and interactions with tablets. This mode is particularly useful for:
+
+- Identifying performance bottlenecks
+- Understanding query execution patterns
+- Optimizing complex queries
+- Debugging unexpected query behavior
+
+`TRACE` mode runs the query and logs all interactions, returning a JSON representation of the query execution plan with additional
+statistics like number of calls, average rows processed, and number of shards queried.
+
+#### VEXPLAIN KEYS
+
+The `KEYS` mode for `VEXPLAIN` offers a concise summary of query structure, highlighting columns used in joins, filters, and
+grouping operations. This information is crucial for:
+
+- Identifying potential sharding key candidates
+- Optimizing query performance
+- Analyzing query patterns to inform database design decisions
+
+`KEYS` mode analyzes the query structure without executing it, providing JSON output that includes grouping columns, join columns,
+filter columns (potential candidates for indexes, primary keys, or sharding keys), and the statement type.
+
+These new `VEXPLAIN` modes enhance Vitess's query analysis capabilities, allowing for more informed decisions about sharding
+strategies and query optimization.
+
+### Automatically Replace MySQL auto_increment Clauses with Vitess Sequences
+
+In https://github.com/vitessio/vitess/pull/16860 we added support for replacing MySQL `auto_increment` clauses with [Vitess Sequences](https://vitess.io/docs/reference/features/vitess-sequences/), performing all of the setup and initialization
+work automatically during the [`MoveTables`](https://vitess.io/docs/reference/vreplication/movetables/) workflow. As part of that work we have deprecated the
+[`--remove-sharded-auto-increment` boolean flag](https://vitess.io/docs/20.0/reference/programs/vtctldclient/vtctldclient_movetables/vtctldclient_movetables_create/) and you should begin using the new
+[`--sharded-auto-increment-handling` flag](https://vitess.io/docs/21.0/reference/programs/vtctldclient/vtctldclient_movetables/vtctldclient_movetables_create/) instead. Please see the new
+[`MoveTables` Auto Increment Handling](https://vitess.io/docs/21.0/reference/vreplication/movetables/#auto-increment-handling) documentation for additional details.
+
+### Experimental MySQL 8.4 support
+
+We have added experimental support for MySQL 8.4. It passes the Vitess test suite, but it is otherwise not yet tested. We are looking for feedback from the community so that we can improve it and move support out of the experimental phase in a future release.
+
+### Current Errant GTIDs Count Metric
+A new metric called `CurrentErrantGTIDCount` has been added to the `VTOrc` component.
+This metric shows the current count of the errant GTIDs in the tablets.
+
+### `vtctldclient ChangeTabletTags` command
+
+The `vtctldclient` command `ChangeTabletTags` was added to allow the tags of a tablet to be changed dynamically.
+
+### Support specifying expected primary in reparents
+
+The `EmergencyReparentShard` and `PlannedReparentShard` commands and RPCs now support specifying a primary we expect to still be the current primary in order for a reparent operation to be processed. This allows reparents to be conditional on a specific state being true.
diff --git a/go/vt/mysqlctl/backup_blackbox_race_test.go b/go/vt/mysqlctl/backup_blackbox_race_test.go
new file mode 100644
index 00000000000..5414ebc5fa6
--- /dev/null
+++ b/go/vt/mysqlctl/backup_blackbox_race_test.go
@@ -0,0 +1,152 @@
+//go:build !race
+
+/*
+Copyright 2024 The Vitess Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// Package mysqlctl_test is the blackbox tests for package mysqlctl.
+package mysqlctl_test
+
+import (
+ "fmt"
+ "os"
+ "path"
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/require"
+
+ "vitess.io/vitess/go/mysql"
+ "vitess.io/vitess/go/mysql/fakesqldb"
+ "vitess.io/vitess/go/test/utils"
+ "vitess.io/vitess/go/vt/logutil"
+ "vitess.io/vitess/go/vt/mysqlctl"
+ "vitess.io/vitess/go/vt/mysqlctl/backupstats"
+ "vitess.io/vitess/go/vt/mysqlctl/filebackupstorage"
+ "vitess.io/vitess/go/vt/proto/topodata"
+ "vitess.io/vitess/go/vt/proto/vttime"
+ "vitess.io/vitess/go/vt/topo"
+ "vitess.io/vitess/go/vt/topo/memorytopo"
+)
+
+// TestExecuteBackupWithFailureOnLastFile triggers a code path that is only reached when a backup
+// file fails to be backed up if, and only if, all the other backup files have either started or
+// finished. In this scenario, files no longer try to acquire the semaphore and thus the backup cannot
+// fail because of a context deadline when acquiring it. At this point, the only place where the backup
+// can fail is if the return of be.backupFiles fails, and we must record that error correctly.
+// This test specifically tests this scenario and arose because of issue https://github.com/vitessio/vitess/issues/17063
+// The test does:
+//  1. Create the backup and data directory
+//  2. Create a keyspace and shard
+//  3. Already create the last backup file that would be created
+//  4. Remove write permission on this file (it is made read-only)
+//  5. Execute the backup
+//  6. The backup must fail due to an error on file number 3 ("cannot add file: 3")
+//
+// This test is extracted into its own file that won't be run if we do 'go test -race' as this test
+// exposes an old race condition that will be fixed after https://github.com/vitessio/vitess/pull/17062
+// Link to the race condition issue: https://github.com/vitessio/vitess/issues/17065
+func TestExecuteBackupWithFailureOnLastFile(t *testing.T) {
+	ctx := utils.LeakCheckContext(t)
+
+	// Set up local backup directory; cleanup is deferred first so a failed setup step leaves nothing behind.
+	id := fmt.Sprintf("%d", time.Now().UnixNano())
+	backupRoot := fmt.Sprintf("testdata/builtinbackup_test_%s", id)
+	defer os.RemoveAll(backupRoot)
+	filebackupstorage.FileBackupStorageRoot = backupRoot
+	require.NoError(t, createBackupDir(backupRoot, "innodb", "log", "datadir"))
+	dataDir := path.Join(backupRoot, "datadir")
+	// Add some files under data directory to force backup to execute semaphore acquire inside
+	// backupFiles() method (https://github.com/vitessio/vitess/blob/main/go/vt/mysqlctl/builtinbackupengine.go#L483).
+	require.NoError(t, createBackupDir(dataDir, "test1"))
+	require.NoError(t, createBackupDir(dataDir, "test2"))
+	require.NoError(t, createBackupFiles(path.Join(dataDir, "test1"), 2, "ibd"))
+	require.NoError(t, createBackupFiles(path.Join(dataDir, "test2"), 2, "ibd"))
+
+	needIt, err := needInnoDBRedoLogSubdir()
+	require.NoError(t, err)
+	if needIt {
+		fpath := path.Join("log", mysql.DynamicRedoLogSubdir)
+		if err := createBackupDir(backupRoot, fpath); err != nil {
+			require.Failf(t, err.Error(), "failed to create directory: %s", fpath)
+		}
+	}
+
+	// Set up topo
+	keyspace, shard := "mykeyspace", "-"
+	ts := memorytopo.NewServer(ctx, "cell1")
+	defer ts.Close()
+
+	require.NoError(t, ts.CreateKeyspace(ctx, keyspace, &topodata.Keyspace{}))
+	require.NoError(t, ts.CreateShard(ctx, keyspace, shard))
+
+	tablet := topo.NewTablet(100, "cell1", "mykeyspace-00-80-0100")
+	tablet.Keyspace = keyspace
+	tablet.Shard = shard
+
+	require.NoError(t, ts.CreateTablet(ctx, tablet))
+
+	_, err = ts.UpdateShardFields(ctx, keyspace, shard, func(si *topo.ShardInfo) error {
+		si.PrimaryAlias = &topodata.TabletAlias{Uid: 100, Cell: "cell1"}
+
+		now := time.Now()
+		si.PrimaryTermStartTime = &vttime.Time{Seconds: now.Unix(), Nanoseconds: int32(now.Nanosecond())}
+
+		return nil
+	})
+
+	require.NoError(t, err)
+
+	be := &mysqlctl.BuiltinBackupEngine{}
+	bh := filebackupstorage.NewBackupHandle(nil, "", "", false)
+	// Spin up a fake daemon to be used in backups. It needs to be allowed to receive:
+	// "STOP REPLICA", "START REPLICA", in that order.
+	fakedb := fakesqldb.New(t)
+	defer fakedb.Close()
+	mysqld := mysqlctl.NewFakeMysqlDaemon(fakedb)
+	defer mysqld.Close()
+	mysqld.ExpectedExecuteSuperQueryList = []string{"STOP REPLICA", "START REPLICA"}
+
+	// With this setup, 4 backup files will be created (0, 1, 2, 3). For the last file (3), we create
+	// it in advance and make it read-only so that be.ExecuteBackup will not be able to overwrite
+	// the file and thus will fail, triggering the error mechanism after calling be.backupFile.
+	lastBackupFile := path.Join(backupRoot, "3")
+	f, err := os.Create(lastBackupFile)
+	require.NoError(t, err)
+	_, err = f.Write(make([]byte, 1024))
+	require.NoError(t, err)
+	require.NoError(t, f.Chmod(0444))
+	require.NoError(t, f.Close())
+
+	backupResult, err := be.ExecuteBackup(ctx, mysqlctl.BackupParams{
+		Logger: logutil.NewConsoleLogger(),
+		Mysqld: mysqld,
+		Cnf: &mysqlctl.Mycnf{
+			InnodbDataHomeDir:     path.Join(backupRoot, "innodb"),
+			InnodbLogGroupHomeDir: path.Join(backupRoot, "log"),
+			DataDir:               dataDir,
+		},
+		Stats:                backupstats.NewFakeStats(),
+		Concurrency:          4,
+		HookExtraEnv:         map[string]string{},
+		TopoServer:           ts,
+		Keyspace:             keyspace,
+		Shard:                shard,
+		MysqlShutdownTimeout: mysqlShutdownTimeout,
+	}, bh)
+
+	require.ErrorContains(t, err, "cannot add file: 3")
+	require.Equal(t, mysqlctl.BackupUnusable, backupResult)
+}
diff --git a/go/vt/mysqlctl/builtinbackupengine.go b/go/vt/mysqlctl/builtinbackupengine.go
index 2bbec4abe96..1b3ad16003f 100644
--- a/go/vt/mysqlctl/builtinbackupengine.go
+++ b/go/vt/mysqlctl/builtinbackupengine.go
@@ -639,9 +639,8 @@ func (be *BuiltinBackupEngine) backupFiles(
// Backup the individual file.
name := fmt.Sprintf("%v", i)
- err := be.backupFile(ctxCancel, params, bh, fe, name)
- if err != nil {
- bh.RecordError(acqErr)
+ if err := be.backupFile(ctxCancel, params, bh, fe, name); err != nil {
+ bh.RecordError(err)
cancel()
}
}(i)