diff --git a/pkg/sql/BUILD.bazel b/pkg/sql/BUILD.bazel index debe6be233c2..74967ccfe80d 100644 --- a/pkg/sql/BUILD.bazel +++ b/pkg/sql/BUILD.bazel @@ -863,6 +863,7 @@ go_test( "//pkg/sql/privilege", "//pkg/sql/querycache", "//pkg/sql/randgen", + "//pkg/sql/regions", "//pkg/sql/row", "//pkg/sql/rowenc", "//pkg/sql/rowenc/keyside", diff --git a/pkg/sql/region_util.go b/pkg/sql/region_util.go index 004422d887a2..48467e0d2233 100644 --- a/pkg/sql/region_util.go +++ b/pkg/sql/region_util.go @@ -108,14 +108,6 @@ func CheckClusterRegionIsLive( return nil } -func makeRequiredConstraintForRegion(r catpb.RegionName) zonepb.Constraint { - return zonepb.Constraint{ - Type: zonepb.Constraint_REQUIRED, - Key: "region", - Value: string(r), - } -} - // TestingConvertRegionToZoneConfig converts a given region config into a zone // configuration, ensuring the result is fully hydrated. Refer to the // zoneConfigForMultiRegionDatabase function for details on how the conversion // is made. @@ -150,24 +142,24 @@ func TestingConvertRegionToZoneConfig( // voter_constraints = '{"+region=A": 2}' // lease_preferences = [["+region=A"]] // -// See synthesizeVoterConstraints() for explanation on why `voter_constraints` +// See SynthesizeVoterConstraints() for explanation on why `voter_constraints` // are set the way they are. func zoneConfigForMultiRegionDatabase( regionConfig multiregion.RegionConfig, ) (zonepb.ZoneConfig, error) { - numVoters, numReplicas := getNumVotersAndNumReplicas(regionConfig) + numVoters, numReplicas := regions.GetNumVotersAndNumReplicas(regionConfig) - constraints, err := synthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) + constraints, err := regions.SynthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) if err != nil { return zonepb.ZoneConfig{}, err } - voterConstraints, err := synthesizeVoterConstraints(regionConfig.PrimaryRegion(), regionConfig) + voterConstraints, err := regions.SynthesizeVoterConstraints(regionConfig.PrimaryRegion(), regionConfig) if err != nil { return zonepb.ZoneConfig{}, err } - leasePreferences := synthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) + leasePreferences := regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) zc := zonepb.ZoneConfig{ NumReplicas: &numReplicas, @@ -184,333 +176,6 @@ func zoneConfigForMultiRegionDatabase( return regionConfig.ExtendZoneConfigWithRegionalIn(zc, regionConfig.PrimaryRegion()) } -// addConstraintsForSuperRegion updates the ZoneConfig.Constraints field such -// that every replica is guaranteed to be constrained to a region within the -// super region. -// If !regionConfig.IsMemberOfExplicitSuperRegion(affinityRegion), and error -// will be returned.
-func addConstraintsForSuperRegion( - zc *zonepb.ZoneConfig, regionConfig multiregion.RegionConfig, affinityRegion catpb.RegionName, -) error { - regions, ok := regionConfig.GetSuperRegionRegionsForRegion(affinityRegion) - if !ok { - return errors.AssertionFailedf("region %s is not part of a super region", affinityRegion) - } - _, numReplicas := getNumVotersAndNumReplicas(regionConfig.WithRegions(regions)) - - zc.NumReplicas = &numReplicas - zc.Constraints = nil - zc.InheritedConstraints = false - - switch regionConfig.SurvivalGoal() { - case descpb.SurvivalGoal_ZONE_FAILURE: - for _, region := range regions { - zc.Constraints = append(zc.Constraints, zonepb.ConstraintsConjunction{ - NumReplicas: 1, - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}, - }) - } - return nil - case descpb.SurvivalGoal_REGION_FAILURE: - // There is a special case where we have 3 regions under survival goal - // region failure where we have to constrain an extra replica to any - // region within the super region to guarantee that all replicas are - // accounted for. In our case, we assign it to the first non-primary region - // in sorted order. - // This happens because we have 5 replicas and 3 regions. 2 voters are - // constrained to the primary region, the other 2 regions each are given a - // replica, the last non-voting replica is not guaranteed to be constrained - // anywhere. - // If we have more than 3 regions, all replicas are accounted for and - // constrained within the super region. - // See: https://github.com/cockroachdb/cockroach/issues/63617 for more. - extraReplicaToConstrain := len(regions) == 3 - for _, region := range regions { - n := int32(1) - if region != affinityRegion && extraReplicaToConstrain { - n = 2 - extraReplicaToConstrain = false - } - zc.Constraints = append(zc.Constraints, zonepb.ConstraintsConjunction{ - NumReplicas: n, - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}, - }) - } - return nil - default: - return errors.AssertionFailedf("unknown survival goal: %v", regionConfig.SurvivalGoal()) - } -} - -// zoneConfigForMultiRegionPartition generates a ZoneConfig stub for a partition -// that belongs to a regional by row table in a multi-region database. -// -// At the table/partition level, the only attributes that are set are -// `num_voters`, `voter_constraints`, and `lease_preferences`. We expect that -// the attributes `num_replicas` and `constraints` will be inherited from the -// database level zone config. -func zoneConfigForMultiRegionPartition( - partitionRegion catpb.RegionName, regionConfig multiregion.RegionConfig, -) (zonepb.ZoneConfig, error) { - zc := *zonepb.NewZoneConfig() - - numVoters, numReplicas := getNumVotersAndNumReplicas(regionConfig) - zc.NumVoters = &numVoters - - if regionConfig.IsMemberOfExplicitSuperRegion(partitionRegion) { - err := addConstraintsForSuperRegion(&zc, regionConfig, partitionRegion) - if err != nil { - return zonepb.ZoneConfig{}, err - } - } else if !regionConfig.RegionalInTablesInheritDatabaseConstraints(partitionRegion) { - // If the database constraints can't be inherited to serve as the - // constraints for this partition, define the constraints ourselves. 
- zc.NumReplicas = &numReplicas - - constraints, err := synthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) - if err != nil { - return zonepb.ZoneConfig{}, err - } - zc.Constraints = constraints - zc.InheritedConstraints = false - } - - voterConstraints, err := synthesizeVoterConstraints(partitionRegion, regionConfig) - if err != nil { - return zonepb.ZoneConfig{}, err - } - zc.VoterConstraints = voterConstraints - zc.NullVoterConstraintsIsEmpty = true - zc.LeasePreferences = synthesizeLeasePreferences(partitionRegion, regionConfig.SecondaryRegion()) - zc.InheritedLeasePreferences = false - - return regionConfig.ExtendZoneConfigWithRegionalIn(zc, partitionRegion) -} - -// maxFailuresBeforeUnavailability returns the maximum number of individual -// failures that can be tolerated, among `numVoters` voting replicas, before a -// given range is unavailable. -func maxFailuresBeforeUnavailability(numVoters int32) int32 { - return ((numVoters + 1) / 2) - 1 -} - -// getNumVotersAndNumReplicas computes the number of voters and the total number -// of replicas needed for a given region config. -func getNumVotersAndNumReplicas( - regionConfig multiregion.RegionConfig, -) (numVoters, numReplicas int32) { - const numVotersForZoneSurvival = 3 - // Under region survivability, we use 5 voting replicas to allow for a - // theoretical (2-2-1) voting replica configuration, where the primary region - // has 2 voting replicas and the next closest region has another 2. This - // allows for stable read/write latencies even under single node failures. - // - // TODO(aayush): Until we add allocator heuristics to coalesce voting replicas - // together based on their relative latencies to the leaseholder, we can't - // actually ensure that the region closest to the leaseholder has 2 voting - // replicas. - // - // Until the above TODO is addressed, the non-leaseholder voting replicas will - // be allowed to "float" around among the other regions in the database. They - // may or may not be placed geographically close to the leaseholder replica. - const numVotersForRegionSurvival = 5 - - numRegions := int32(len(regionConfig.Regions())) - switch regionConfig.SurvivalGoal() { - // NB: See mega-comment inside `synthesizeVoterConstraints()` for why these - // are set the way they are. - case descpb.SurvivalGoal_ZONE_FAILURE: - numVoters = numVotersForZoneSurvival - switch regionConfig.Placement() { - case descpb.DataPlacement_DEFAULT: - // <numVoters in the home region> + <1 replica for every other region> - numReplicas = (numVotersForZoneSurvival) + (numRegions - 1) - case descpb.DataPlacement_RESTRICTED: - numReplicas = numVoters - default: - panic(errors.AssertionFailedf("unknown data placement: %v", regionConfig.Placement())) - } - case descpb.SurvivalGoal_REGION_FAILURE: - // The primary and secondary region each have two voters. - // maxFailuresBeforeUnavailability(numVotersForRegionSurvival) = 2. - // We have 5 voters for survival mode region failure such that we can - // get quorum with 2 voters in the primary region + one voter outside. - // Every other region has one replica. - numVoters = numVotersForRegionSurvival - - // There are always 2 (i.e. maxFailuresBeforeUnavailability) replicas in the - // primary region, and 1 replica in every other region. - numReplicas = maxFailuresBeforeUnavailability(numVotersForRegionSurvival) + (numRegions - 1) - if regionConfig.HasSecondaryRegion() { - // If there is a secondary region, it gets an additional replica.
- numReplicas++ - } - if numReplicas < numVoters { - // NumReplicas cannot be less than NumVoters. If we have <= 4 regions, all - // replicas will be voting replicas. - numReplicas = numVoters - } - } - return numVoters, numReplicas -} - -// synthesizeVoterConstraints generates a ConstraintsConjunction clause -// representing the `voter_constraints` field to be set for the primary region -// of a multi-region database or the home region of a table/partition in such a -// database. -// -// Under zone survivability, we will constrain all voting replicas to be inside -// the primary/home region. -// -// Under region survivability, we will constrain exactly <quorum - 1> voting -// replicas in the primary/home region. -func synthesizeVoterConstraints( - region catpb.RegionName, regionConfig multiregion.RegionConfig, -) ([]zonepb.ConstraintsConjunction, error) { - switch regionConfig.SurvivalGoal() { - case descpb.SurvivalGoal_ZONE_FAILURE: - return []zonepb.ConstraintsConjunction{ - { - // We don't specify `NumReplicas` here to indicate that we want _all_ - // voting replicas to be constrained to this one region. - // - // Constraining all voting replicas to be inside the primary/home region - // is necessary and sufficient to ensure zone survivability, even though - // it might appear that these zone configs don't seem to spell out the - // requirement of being resilient to zone failures. This is because, by - // default, the allocator (see kv/kvserver/allocator.go) will maximize - // survivability due to it's diversity heuristic (see - // Locality.DiversityScore()) by spreading the replicas of a range - // across nodes with the most mutual difference in their locality - // hierarchies. - // - // For instance, in a 2 region deployment, each with 3 AZs, this is - // expected to result in a configuration like the following: - // - //     +---- Region A -----+     +---- Region B -----+ - //     |                   |     |                   | - //     |   +------------+  |     |  +------------+   | - //     |   |   VOTER    |  |     |  |            |   | - //     |   |            |  |     |  |            |   | - //     |   +------------+  |     |  +------------+   | - //     |   +------------+  |     |  +------------+   | - //     |   |   VOTER    |  |     |  |            |   | - //     |   |            |  |     |  | NON-VOTER  |   | - //     |   +------------+  |     |  |            |   | - //     |   +------------+  |     |  +------------+   | - //     |   |            |  |     |  +------------+   | - //     |   |   VOTER    |  |     |  |            |   | - //     |   |            |  |     |  |            |   | - //     |   +------------+  |     |  +------------+   | - //     +-------------------+     +-------------------+ - // - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}, - }, - }, nil - case descpb.SurvivalGoal_REGION_FAILURE: - // We constrain <quorum - 1> voting replicas to the primary region and - // allow the rest to "float" around. This allows the allocator inside KV - // to make dynamic placement decisions for the voting replicas that lie - // outside the primary/home region. - // - // It might appear that constraining just <quorum - 1> voting replicas - // to the primary region leaves open the possibility of a majority - // quorum coalescing inside of some other region. However, similar to - // the case above, the diversity heuristic in the allocator prevents - // this from happening as it will spread the unconstrained replicas out - // across nodes with the most diverse locality hierarchies.
- // - // For instance, in a 3 region deployment (minimum for a database with - // "region" survivability), each with 3 AZs, we'd expect to see a - // configuration like the following: - // - // +---- Region A ------+ +---- Region B -----+ +----- Region C -----+ - // | | | | | | - // | +------------+ | | +------------+ | | +------------+ | - // | | VOTER | | | | VOTER | | | | | | - // | | | | | | | | | | | | - // | +------------+ | | +------------+ | | +------------+ | - // | +------------+ | | +------------+ | | +------------+ | - // | | | | | | VOTER | | | | VOTER | | - // | | | | | | | | | | | | - // | +------------+ | | +------------+ | | +------------+ | - // | +------------+ | | +------------+ | | +------------+ | - // | | VOTER | | | | | | | | | | - // | | | | | | | | | | | | - // | +------------+ | | +------------+ | | +------------+ | - // +--------------------+ +-------------------+ +--------------------+ - // - numVoters, _ := getNumVotersAndNumReplicas(regionConfig) - ret := []zonepb.ConstraintsConjunction{ - { - NumReplicas: maxFailuresBeforeUnavailability(numVoters), - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}, - }, - } - if regionConfig.HasSecondaryRegion() && regionConfig.SecondaryRegion() != region { - ret = append(ret, zonepb.ConstraintsConjunction{ - NumReplicas: maxFailuresBeforeUnavailability(numVoters), - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(regionConfig.SecondaryRegion())}, - }) - } - return ret, nil - default: - return nil, errors.AssertionFailedf("unknown survival goal: %v", regionConfig.SurvivalGoal()) - } -} - -// synthesizeReplicaConstraints generates a ConstraintsConjunction clause -// representing the `constraints` field to be set for a multi-region database. -func synthesizeReplicaConstraints( - regions catpb.RegionNames, placement descpb.DataPlacement, -) ([]zonepb.ConstraintsConjunction, error) { - switch placement { - case descpb.DataPlacement_DEFAULT: - constraints := make([]zonepb.ConstraintsConjunction, len(regions)) - for i, region := range regions { - // Constrain at least 1 (voting or non-voting) replica per region. - constraints[i] = zonepb.ConstraintsConjunction{ - NumReplicas: 1, - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}, - } - } - return constraints, nil - case descpb.DataPlacement_RESTRICTED: - // In a RESTRICTED placement policy, the database zone config has no - // non-voters so that REGIONAL BY [TABLE | ROW] can inherit the RESTRICTED - // placement. Voter placement will be set at the table/partition level to - // the table/partition region. - - // NB: When setting empty constraints, use nil as opposed to []. When - // constraints are deserialized from the database, empty constraints are - // always deserialized as nil. Therefore, if constraints are set as [] here, - // the database will have a difference in its expected constraints vs the - // actual constraints when comparing using the multi-region validation - // builtins. - return nil, nil - default: - return nil, errors.AssertionFailedf("unknown data placement: %v", placement) - } -} - -// synthesizeLeasePreferences generates a LeasePreferences -// clause representing the `lease_preferences` field to be set for the primary -// region and secondary region of a multi-region database or the home region of -// a table in such a database. 
-func synthesizeLeasePreferences( - region catpb.RegionName, secondaryRegion catpb.RegionName, -) []zonepb.LeasePreference { - ret := []zonepb.LeasePreference{ - {Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(region)}}, - } - if secondaryRegion != "" && secondaryRegion != region { - ret = append(ret, zonepb.LeasePreference{ - Constraints: []zonepb.Constraint{makeRequiredConstraintForRegion(secondaryRegion)}, - }) - } - return ret -} - // zoneConfigForMultiRegionTable generates a ZoneConfig stub for a // regional-by-table or global table in a multi-region database. // @@ -541,24 +206,24 @@ func zoneConfigForMultiRegionTable( // RESTRICTED placement. regionConfig = regionConfig.WithPlacementDefault() - numVoters, numReplicas := getNumVotersAndNumReplicas(regionConfig) + numVoters, numReplicas := regions.GetNumVotersAndNumReplicas(regionConfig) zc.NumVoters = &numVoters zc.NumReplicas = &numReplicas - constraints, err := synthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) + constraints, err := regions.SynthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) if err != nil { return zonepb.ZoneConfig{}, err } zc.Constraints = constraints zc.InheritedConstraints = false - voterConstraints, err := synthesizeVoterConstraints(regionConfig.PrimaryRegion(), regionConfig) + voterConstraints, err := regions.SynthesizeVoterConstraints(regionConfig.PrimaryRegion(), regionConfig) if err != nil { return zonepb.ZoneConfig{}, err } zc.VoterConstraints = voterConstraints zc.NullVoterConstraintsIsEmpty = true - zc.LeasePreferences = synthesizeLeasePreferences(regionConfig.PrimaryRegion(), "" /* secondaryRegion */) + zc.LeasePreferences = regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion(), "" /* secondaryRegion */) zc.InheritedLeasePreferences = false zc, err = regionConfig.ExtendZoneConfigWithGlobal(zc) @@ -581,11 +246,11 @@ func zoneConfigForMultiRegionTable( return zc, nil } - numVoters, numReplicas := getNumVotersAndNumReplicas(regionConfig) + numVoters, numReplicas := regions.GetNumVotersAndNumReplicas(regionConfig) zc.NumVoters = &numVoters if regionConfig.IsMemberOfExplicitSuperRegion(affinityRegion) { - err := addConstraintsForSuperRegion(&zc, regionConfig, affinityRegion) + err := regions.AddConstraintsForSuperRegion(&zc, regionConfig, affinityRegion) if err != nil { return zonepb.ZoneConfig{}, err } @@ -594,7 +259,7 @@ func zoneConfigForMultiRegionTable( // constraints for this table, define the constraints ourselves. zc.NumReplicas = &numReplicas - constraints, err := synthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) + constraints, err := regions.SynthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) if err != nil { return zonepb.ZoneConfig{}, err } @@ -603,13 +268,13 @@ func zoneConfigForMultiRegionTable( } // If the table has a user-specified affinity region, use it. 
- voterConstraints, err := synthesizeVoterConstraints(affinityRegion, regionConfig) + voterConstraints, err := regions.SynthesizeVoterConstraints(affinityRegion, regionConfig) if err != nil { return zonepb.ZoneConfig{}, err } zc.VoterConstraints = voterConstraints zc.NullVoterConstraintsIsEmpty = true - zc.LeasePreferences = synthesizeLeasePreferences(affinityRegion, "" /* secondaryRegion */) + zc.LeasePreferences = regions.SynthesizeLeasePreferences(affinityRegion, "" /* secondaryRegion */) zc.InheritedLeasePreferences = false return regionConfig.ExtendZoneConfigWithRegionalIn(zc, affinityRegion) @@ -644,7 +309,7 @@ func applyZoneConfigForMultiRegionTableOptionNewIndexes( ) (hasNewSubzones bool, newZoneConfig zonepb.ZoneConfig, err error) { for _, indexID := range indexIDs { for _, region := range regionConfig.Regions() { - zc, err := zoneConfigForMultiRegionPartition(region, regionConfig) + zc, err := regions.ZoneConfigForMultiRegionPartition(region, regionConfig) if err != nil { return false, zoneConfig, err } @@ -683,21 +348,6 @@ func dropZoneConfigsForMultiRegionIndexes( } } -// isPlaceholderZoneConfigForMultiRegion returns whether a given zone config -// should be marked as a placeholder config for a multi-region object. -// See zonepb.IsSubzonePlaceholder for why this is necessary. -func isPlaceholderZoneConfigForMultiRegion(zc zonepb.ZoneConfig) bool { - // Placeholders must have at least 1 subzone. - if len(zc.Subzones) == 0 { - return false - } - // Strip Subzones / SubzoneSpans, as these may contain items if migrating - // from one REGIONAL BY ROW table to another. - strippedZC := zc - strippedZC.Subzones, strippedZC.SubzoneSpans = nil, nil - return strippedZC.Equal(zonepb.NewZoneConfig()) -} - // applyZoneConfigForMultiRegionTableOptionTableNewConfig applies table zone // configs on the entire table with the given new locality config. func applyZoneConfigForMultiRegionTableOptionTableNewConfig( @@ -752,7 +402,7 @@ var ApplyZoneConfigForMultiRegionTableOptionTableAndIndexes = func( hasNewSubzones := table.IsLocalityRegionalByRow() if hasNewSubzones { for _, region := range regionConfig.Regions() { - subzoneConfig, err := zoneConfigForMultiRegionPartition(region, regionConfig) + subzoneConfig, err := regions.ZoneConfigForMultiRegionPartition(region, regionConfig) if err != nil { return false, zc, err } @@ -818,12 +468,12 @@ func prepareZoneConfigForMultiRegionTable( case table.IsLocalityRegionalByTable(): localityConfig := table.TableDesc().LocalityConfig.GetRegionalByTable() if region := localityConfig.Region; region != nil { - newLeasePreferences = synthesizeLeasePreferences(*region, regionConfig.SecondaryRegion()) + newLeasePreferences = regions.SynthesizeLeasePreferences(*region, regionConfig.SecondaryRegion()) } else { - newLeasePreferences = synthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) + newLeasePreferences = regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) } default: - newLeasePreferences = synthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) + newLeasePreferences = regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion(), regionConfig.SecondaryRegion()) } newZoneConfig.LeasePreferences = newLeasePreferences } @@ -832,7 +482,7 @@ func prepareZoneConfigForMultiRegionTable( // in the zone config. This signifies a placeholder. 
// Note we do not use hasNewSubzones here as there may be existing subzones // on the zone config which may still be a placeholder. - if isPlaceholderZoneConfigForMultiRegion(newZoneConfig) { + if regions.IsPlaceholderZoneConfigForMultiRegion(newZoneConfig) { newZoneConfig.NumReplicas = proto.Int32(0) } @@ -2157,7 +1807,7 @@ func (p *planner) validateZoneConfigForMultiRegionTable( // do not fudge num replicas to be equal to 0 -- otherwise the // check fails when num_replicas is different, but that is // expected as the current zone config is no longer a placeholder. - if currentZoneConfig.IsSubzonePlaceholder() && isPlaceholderZoneConfigForMultiRegion(expectedZoneConfig) { + if currentZoneConfig.IsSubzonePlaceholder() && regions.IsPlaceholderZoneConfigForMultiRegion(expectedZoneConfig) { expectedZoneConfig.NumReplicas = proto.Int32(0) } @@ -2169,12 +1819,12 @@ func (p *planner) validateZoneConfigForMultiRegionTable( case desc.IsLocalityRegionalByTable(): rbt := desc.GetLocalityConfig().GetRegionalByTable() if rbt.Region != nil { - leasePreferences = synthesizeLeasePreferences(*rbt.Region, regionConfig.SecondaryRegion) + leasePreferences = regions.SynthesizeLeasePreferences(*rbt.Region, regionConfig.SecondaryRegion) } else { - leasePreferences = synthesizeLeasePreferences(regionConfig.PrimaryRegion, regionConfig.SecondaryRegion) + leasePreferences = regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion, regionConfig.SecondaryRegion) } default: - leasePreferences = synthesizeLeasePreferences(regionConfig.PrimaryRegion, regionConfig.SecondaryRegion) + leasePreferences = regions.SynthesizeLeasePreferences(regionConfig.PrimaryRegion, regionConfig.SecondaryRegion) } expectedZoneConfig.LeasePreferences = leasePreferences diff --git a/pkg/sql/region_util_test.go b/pkg/sql/region_util_test.go index 17e28d4f8d73..5d5dabe85c4e 100644 --- a/pkg/sql/region_util_test.go +++ b/pkg/sql/region_util_test.go @@ -12,6 +12,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/multiregion" + "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/gogo/protobuf/proto" @@ -1899,7 +1900,7 @@ func TestZoneConfigForMultiRegionPartition(t *testing.T) { } for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { - zc, err := zoneConfigForMultiRegionPartition(tc.region, tc.regionConfig) + zc, err := regions.ZoneConfigForMultiRegionPartition(tc.region, tc.regionConfig) require.NoError(t, err) require.Equal(t, tc.expected, zc) }) diff --git a/pkg/sql/regions/BUILD.bazel b/pkg/sql/regions/BUILD.bazel index f016db54b608..cd632a6a0abb 100644 --- a/pkg/sql/regions/BUILD.bazel +++ b/pkg/sql/regions/BUILD.bazel @@ -6,15 +6,18 @@ go_library( "cached_db_regions.go", "db_regions.go", "region_provider.go", + "region_util.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/regions", visibility = ["//visibility:public"], deps = [ + "//pkg/config/zonepb", "//pkg/keys", "//pkg/kv", "//pkg/roachpb", "//pkg/server/serverpb", "//pkg/sql/catalog", + "//pkg/sql/catalog/catpb", "//pkg/sql/catalog/descpb", "//pkg/sql/catalog/descs", "//pkg/sql/catalog/lease", diff --git a/pkg/sql/regions/region_util.go b/pkg/sql/regions/region_util.go new file mode 100644 index 000000000000..751f4c2a8fa9 --- /dev/null +++ b/pkg/sql/regions/region_util.go @@ -0,0 +1,364 @@ +// Copyright 2025 The 
Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. + +package regions + +import ( + "github.com/cockroachdb/cockroach/pkg/config/zonepb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/catpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/multiregion" + "github.com/cockroachdb/errors" +) + +// ZoneConfigForMultiRegionPartition generates a ZoneConfig stub for a partition +// that belongs to a regional by row table in a multi-region database. +// +// At the table/partition level, the only attributes that are set are +// `num_voters`, `voter_constraints`, and `lease_preferences`. We expect that +// the attributes `num_replicas` and `constraints` will be inherited from the +// database level zone config. +func ZoneConfigForMultiRegionPartition( + partitionRegion catpb.RegionName, regionConfig multiregion.RegionConfig, +) (zonepb.ZoneConfig, error) { + zc := *zonepb.NewZoneConfig() + + numVoters, numReplicas := GetNumVotersAndNumReplicas(regionConfig) + zc.NumVoters = &numVoters + + if regionConfig.IsMemberOfExplicitSuperRegion(partitionRegion) { + err := AddConstraintsForSuperRegion(&zc, regionConfig, partitionRegion) + if err != nil { + return zonepb.ZoneConfig{}, err + } + } else if !regionConfig.RegionalInTablesInheritDatabaseConstraints(partitionRegion) { + // If the database constraints can't be inherited to serve as the + // constraints for this partition, define the constraints ourselves. + zc.NumReplicas = &numReplicas + + constraints, err := SynthesizeReplicaConstraints(regionConfig.Regions(), regionConfig.Placement()) + if err != nil { + return zonepb.ZoneConfig{}, err + } + zc.Constraints = constraints + zc.InheritedConstraints = false + } + + voterConstraints, err := SynthesizeVoterConstraints(partitionRegion, regionConfig) + if err != nil { + return zonepb.ZoneConfig{}, err + } + zc.VoterConstraints = voterConstraints + zc.NullVoterConstraintsIsEmpty = true + zc.LeasePreferences = SynthesizeLeasePreferences(partitionRegion, regionConfig.SecondaryRegion()) + zc.InheritedLeasePreferences = false + + return regionConfig.ExtendZoneConfigWithRegionalIn(zc, partitionRegion) +} + +// IsPlaceholderZoneConfigForMultiRegion returns whether a given zone config +// should be marked as a placeholder config for a multi-region object. +// See zonepb.IsSubzonePlaceholder for why this is necessary. +func IsPlaceholderZoneConfigForMultiRegion(zc zonepb.ZoneConfig) bool { + // Placeholders must have at least 1 subzone. + if len(zc.Subzones) == 0 { + return false + } + // Strip Subzones / SubzoneSpans, as these may contain items if migrating + // from one REGIONAL BY ROW table to another. + strippedZC := zc + strippedZC.Subzones, strippedZC.SubzoneSpans = nil, nil + return strippedZC.Equal(zonepb.NewZoneConfig()) +} + +// SynthesizeLeasePreferences generates a LeasePreferences +// clause representing the `lease_preferences` field to be set for the primary +// region and secondary region of a multi-region database or the home region of +// a table in such a database. 
+func SynthesizeLeasePreferences( + region catpb.RegionName, secondaryRegion catpb.RegionName, +) []zonepb.LeasePreference { + ret := []zonepb.LeasePreference{ + {Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}}, + } + if secondaryRegion != "" && secondaryRegion != region { + ret = append(ret, zonepb.LeasePreference{ + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(secondaryRegion)}, + }) + } + return ret +} + +// SynthesizeReplicaConstraints generates a ConstraintsConjunction clause +// representing the `constraints` field to be set for a multi-region database. +func SynthesizeReplicaConstraints( + regions catpb.RegionNames, placement descpb.DataPlacement, +) ([]zonepb.ConstraintsConjunction, error) { + switch placement { + case descpb.DataPlacement_DEFAULT: + constraints := make([]zonepb.ConstraintsConjunction, len(regions)) + for i, region := range regions { + // Constrain at least 1 (voting or non-voting) replica per region. + constraints[i] = zonepb.ConstraintsConjunction{ + NumReplicas: 1, + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}, + } + } + return constraints, nil + case descpb.DataPlacement_RESTRICTED: + // In a RESTRICTED placement policy, the database zone config has no + // non-voters so that REGIONAL BY [TABLE | ROW] can inherit the RESTRICTED + // placement. Voter placement will be set at the table/partition level to + // the table/partition region. + + // NB: When setting empty constraints, use nil as opposed to []. When + // constraints are deserialized from the database, empty constraints are + // always deserialized as nil. Therefore, if constraints are set as [] here, + // the database will have a difference in its expected constraints vs the + // actual constraints when comparing using the multi-region validation + // builtins. + return nil, nil + default: + return nil, errors.AssertionFailedf("unknown data placement: %v", placement) + } +} + +// SynthesizeVoterConstraints generates a ConstraintsConjunction clause +// representing the `voter_constraints` field to be set for the primary region +// of a multi-region database or the home region of a table/partition in such a +// database. +// +// Under zone survivability, we will constrain all voting replicas to be inside +// the primary/home region. +// +// Under region survivability, we will constrain exactly <quorum - 1> voting +// replicas in the primary/home region. +func SynthesizeVoterConstraints( + region catpb.RegionName, regionConfig multiregion.RegionConfig, +) ([]zonepb.ConstraintsConjunction, error) { + switch regionConfig.SurvivalGoal() { + case descpb.SurvivalGoal_ZONE_FAILURE: + return []zonepb.ConstraintsConjunction{ + { + // We don't specify `NumReplicas` here to indicate that we want _all_ + // voting replicas to be constrained to this one region. + // + // Constraining all voting replicas to be inside the primary/home region + // is necessary and sufficient to ensure zone survivability, even though + // it might appear that these zone configs don't seem to spell out the + // requirement of being resilient to zone failures. This is because, by + // default, the allocator (see kv/kvserver/allocator.go) will maximize + // survivability due to its diversity heuristic (see + // Locality.DiversityScore()) by spreading the replicas of a range + // across nodes with the most mutual difference in their locality + // hierarchies.
+ // + // For instance, in a 2 region deployment, each with 3 AZs, this is + // expected to result in a configuration like the following: + // + //     +---- Region A -----+     +---- Region B -----+ + //     |                   |     |                   | + //     |   +------------+  |     |  +------------+   | + //     |   |   VOTER    |  |     |  |            |   | + //     |   |            |  |     |  |            |   | + //     |   +------------+  |     |  +------------+   | + //     |   +------------+  |     |  +------------+   | + //     |   |   VOTER    |  |     |  |            |   | + //     |   |            |  |     |  | NON-VOTER  |   | + //     |   +------------+  |     |  |            |   | + //     |   +------------+  |     |  +------------+   | + //     |   |            |  |     |  +------------+   | + //     |   |   VOTER    |  |     |  |            |   | + //     |   |            |  |     |  |            |   | + //     |   +------------+  |     |  +------------+   | + //     +-------------------+     +-------------------+ + // + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}, + }, + }, nil + case descpb.SurvivalGoal_REGION_FAILURE: + // We constrain <quorum - 1> voting replicas to the primary region and + // allow the rest to "float" around. This allows the allocator inside KV + // to make dynamic placement decisions for the voting replicas that lie + // outside the primary/home region. + // + // It might appear that constraining just <quorum - 1> voting replicas + // to the primary region leaves open the possibility of a majority + // quorum coalescing inside of some other region. However, similar to + // the case above, the diversity heuristic in the allocator prevents + // this from happening as it will spread the unconstrained replicas out + // across nodes with the most diverse locality hierarchies. + // + // For instance, in a 3 region deployment (minimum for a database with + // "region" survivability), each with 3 AZs, we'd expect to see a + // configuration like the following: + // + //     +---- Region A ------+     +---- Region B -----+     +----- Region C -----+ + //     |                    |     |                   |     |                    | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     |   |   VOTER    |   |     |  |   VOTER    |   |     |   |            |   | + //     |   |            |   |     |  |            |   |     |   |            |   | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     |   |            |   |     |  |   VOTER    |   |     |   |   VOTER    |   | + //     |   |            |   |     |  |            |   |     |   |            |   | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     |   |   VOTER    |   |     |  |            |   |     |   |            |   | + //     |   |            |   |     |  |            |   |     |   |            |   | + //     |   +------------+   |     |  +------------+   |     |   +------------+   | + //     +--------------------+     +-------------------+     +--------------------+ + // + numVoters, _ := GetNumVotersAndNumReplicas(regionConfig) + ret := []zonepb.ConstraintsConjunction{ + { + NumReplicas: MaxFailuresBeforeUnavailability(numVoters), + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}, + }, + } + if regionConfig.HasSecondaryRegion() && regionConfig.SecondaryRegion() != region { + ret = append(ret, zonepb.ConstraintsConjunction{ + NumReplicas: MaxFailuresBeforeUnavailability(numVoters), + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(regionConfig.SecondaryRegion())}, + }) + } + return ret, nil + default: + return nil, errors.AssertionFailedf("unknown survival goal: %v", regionConfig.SurvivalGoal()) + } +} + +// MaxFailuresBeforeUnavailability returns the maximum number of individual +// failures that can be tolerated, among `numVoters` voting replicas, before a +// given range is unavailable. +func MaxFailuresBeforeUnavailability(numVoters int32) int32 { + return ((numVoters + 1) / 2) - 1 +} + +// GetNumVotersAndNumReplicas computes the number of voters and the total number +// of replicas needed for a given region config.
+func GetNumVotersAndNumReplicas( + regionConfig multiregion.RegionConfig, +) (numVoters, numReplicas int32) { + const numVotersForZoneSurvival = 3 + // Under region survivability, we use 5 voting replicas to allow for a + // theoretical (2-2-1) voting replica configuration, where the primary region + // has 2 voting replicas and the next closest region has another 2. This + // allows for stable read/write latencies even under single node failures. + // + // TODO(aayush): Until we add allocator heuristics to coalesce voting replicas + // together based on their relative latencies to the leaseholder, we can't + // actually ensure that the region closest to the leaseholder has 2 voting + // replicas. + // + // Until the above TODO is addressed, the non-leaseholder voting replicas will + // be allowed to "float" around among the other regions in the database. They + // may or may not be placed geographically close to the leaseholder replica. + const numVotersForRegionSurvival = 5 + + numRegions := int32(len(regionConfig.Regions())) + switch regionConfig.SurvivalGoal() { + // NB: See mega-comment inside `SynthesizeVoterConstraints()` for why these + // are set the way they are. + case descpb.SurvivalGoal_ZONE_FAILURE: + numVoters = numVotersForZoneSurvival + switch regionConfig.Placement() { + case descpb.DataPlacement_DEFAULT: + // <numVoters in the home region> + <1 replica for every other region> + numReplicas = (numVotersForZoneSurvival) + (numRegions - 1) + case descpb.DataPlacement_RESTRICTED: + numReplicas = numVoters + default: + panic(errors.AssertionFailedf("unknown data placement: %v", regionConfig.Placement())) + } + case descpb.SurvivalGoal_REGION_FAILURE: + // The primary and secondary region each have two voters. + // MaxFailuresBeforeUnavailability(numVotersForRegionSurvival) = 2. + // We have 5 voters for survival mode region failure such that we can + // get quorum with 2 voters in the primary region + one voter outside. + // Every other region has one replica. + numVoters = numVotersForRegionSurvival + + // There are always 2 (i.e. MaxFailuresBeforeUnavailability) replicas in the + // primary region, and 1 replica in every other region. + numReplicas = MaxFailuresBeforeUnavailability(numVotersForRegionSurvival) + (numRegions - 1) + if regionConfig.HasSecondaryRegion() { + // If there is a secondary region, it gets an additional replica. + numReplicas++ + } + if numReplicas < numVoters { + // NumReplicas cannot be less than NumVoters. If we have <= 4 regions, all + // replicas will be voting replicas. + numReplicas = numVoters + } + } + return numVoters, numReplicas +} + +// MakeRequiredConstraintForRegion returns a REQUIRED zone constraint that pins +// replicas to the given region. +func MakeRequiredConstraintForRegion(r catpb.RegionName) zonepb.Constraint { + return zonepb.Constraint{ + Type: zonepb.Constraint_REQUIRED, + Key: "region", + Value: string(r), + } +} + +// AddConstraintsForSuperRegion updates the ZoneConfig.Constraints field such +// that every replica is guaranteed to be constrained to a region within the +// super region. +// If !regionConfig.IsMemberOfExplicitSuperRegion(affinityRegion), an error +// will be returned.
+func AddConstraintsForSuperRegion( + zc *zonepb.ZoneConfig, regionConfig multiregion.RegionConfig, affinityRegion catpb.RegionName, +) error { + regions, ok := regionConfig.GetSuperRegionRegionsForRegion(affinityRegion) + if !ok { + return errors.AssertionFailedf("region %s is not part of a super region", affinityRegion) + } + _, numReplicas := GetNumVotersAndNumReplicas(regionConfig.WithRegions(regions)) + + zc.NumReplicas = &numReplicas + zc.Constraints = nil + zc.InheritedConstraints = false + + switch regionConfig.SurvivalGoal() { + case descpb.SurvivalGoal_ZONE_FAILURE: + for _, region := range regions { + zc.Constraints = append(zc.Constraints, zonepb.ConstraintsConjunction{ + NumReplicas: 1, + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}, + }) + } + return nil + case descpb.SurvivalGoal_REGION_FAILURE: + // There is a special case where we have 3 regions under survival goal + // region failure where we have to constrain an extra replica to any + // region within the super region to guarantee that all replicas are + // accounted for. In our case, we assign it to the first non-primary region + // in sorted order. + // This happens because we have 5 replicas and 3 regions. 2 voters are + // constrained to the primary region, the other 2 regions each are given a + // replica, the last non-voting replica is not guaranteed to be constrained + // anywhere. + // If we have more than 3 regions, all replicas are accounted for and + // constrained within the super region. + // See: https://github.com/cockroachdb/cockroach/issues/63617 for more. + extraReplicaToConstrain := len(regions) == 3 + for _, region := range regions { + n := int32(1) + if region != affinityRegion && extraReplicaToConstrain { + n = 2 + extraReplicaToConstrain = false + } + zc.Constraints = append(zc.Constraints, zonepb.ConstraintsConjunction{ + NumReplicas: n, + Constraints: []zonepb.Constraint{MakeRequiredConstraintForRegion(region)}, + }) + } + return nil + default: + return errors.AssertionFailedf("unknown survival goal: %v", regionConfig.SurvivalGoal()) + } +}
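
A quick numeric check of the quorum arithmetic moved into the new package. This is an illustrative, self-contained sketch: it mirrors MaxFailuresBeforeUnavailability and the region-survival branch of GetNumVotersAndNumReplicas rather than importing them.

package main

import "fmt"

// maxFailuresBeforeUnavailability mirrors the helper above: among numVoters
// voting replicas, a range stays available while a majority (numVoters/2 + 1)
// of the voters survives.
func maxFailuresBeforeUnavailability(numVoters int32) int32 {
    return ((numVoters + 1) / 2) - 1
}

func main() {
    fmt.Println(maxFailuresBeforeUnavailability(3)) // 1: zone survival (3 voters) tolerates one failure
    fmt.Println(maxFailuresBeforeUnavailability(5)) // 2: region survival (5 voters) tolerates two failures

    // Region survival, 3 regions, no secondary region:
    // 2 replicas pinned to the primary region + 1 per other region = 4,
    // then bumped to 5 because NumReplicas may not be below NumVoters.
    numRegions, numVoters := int32(3), int32(5)
    numReplicas := maxFailuresBeforeUnavailability(numVoters) + (numRegions - 1)
    if numReplicas < numVoters {
        numReplicas = numVoters
    }
    fmt.Println(numReplicas) // 5
}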
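For a concrete picture of what SynthesizeVoterConstraints returns under region survivability, here is a sketch using hypothetical region names (us-east1 as primary, us-west1 as secondary); the zonepb types are the ones this diff already imports.

package main

import (
    "fmt"

    "github.com/cockroachdb/cockroach/pkg/config/zonepb"
)

// required builds the same constraint as MakeRequiredConstraintForRegion.
func required(region string) zonepb.Constraint {
    return zonepb.Constraint{Type: zonepb.Constraint_REQUIRED, Key: "region", Value: region}
}

func main() {
    // With 5 voters, MaxFailuresBeforeUnavailability(5) = 2 voters are pinned
    // to the primary region, and 2 more to the secondary region if one is set;
    // the remaining voter "floats" so the allocator can place it for diversity.
    voterConstraints := []zonepb.ConstraintsConjunction{
        {NumReplicas: 2, Constraints: []zonepb.Constraint{required("us-east1")}},
        {NumReplicas: 2, Constraints: []zonepb.Constraint{required("us-west1")}},
    }
    // SQL equivalent: voter_constraints = '{"+region=us-east1": 2, "+region=us-west1": 2}'
    fmt.Println(voterConstraints)
}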
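Finally, the placeholder rule in IsPlaceholderZoneConfigForMultiRegion as a usage sketch (assuming the new pkg/sql/regions package is importable from test code): a config qualifies only if, once Subzones and SubzoneSpans are stripped, nothing else differs from the empty zone config.

package main

import (
    "fmt"

    "github.com/cockroachdb/cockroach/pkg/config/zonepb"
    "github.com/cockroachdb/cockroach/pkg/sql/regions"
    "github.com/gogo/protobuf/proto"
)

func main() {
    zc := *zonepb.NewZoneConfig()
    zc.Subzones = []zonepb.Subzone{{IndexID: 1, Config: *zonepb.NewZoneConfig()}}
    fmt.Println(regions.IsPlaceholderZoneConfigForMultiRegion(zc)) // true: only subzones are set

    zc.NumReplicas = proto.Int32(5) // any top-level field disqualifies it
    fmt.Println(regions.IsPlaceholderZoneConfigForMultiRegion(zc)) // false
}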