From 7baf9cd524da3b6b8591f3c3f79f0aac3a29c05f Mon Sep 17 00:00:00 2001 From: Ioannis Kavvadias Date: Mon, 28 Oct 2024 17:40:30 +0000 Subject: [PATCH 1/3] features: Add sanction binding for enterprise properties --- src/v/config/property.h | 30 ++- .../config/tests/enterprise_property_test.cc | 14 +- src/v/features/BUILD | 1 + src/v/features/enterprise_features.h | 59 ++++++ src/v/features/tests/BUILD | 16 +- src/v/features/tests/CMakeLists.txt | 11 +- .../tests/enterprise_features_test.cc | 179 ++++++++++++++++++ 7 files changed, 294 insertions(+), 16 deletions(-) create mode 100644 src/v/features/tests/enterprise_features_test.cc diff --git a/src/v/config/property.h b/src/v/config/property.h index dbbe267aa72ee..737189f930e8c 100644 --- a/src/v/config/property.h +++ b/src/v/config/property.h @@ -197,6 +197,10 @@ class property : public base_property { return std::nullopt; } + bool check_restricted(const T& value) const { + return do_check_restricted(value); + } + void reset() override { auto v = default_value(); update_value(std::move(v)); @@ -301,6 +305,11 @@ class property : public base_property { const std::optional> _legacy_default; private: + virtual bool do_check_restricted(const T&) const { + // Config properties are unrestricted by default + return false; + } + validator _validator; friend class binding_base; @@ -1060,6 +1069,9 @@ class enterprise : public P { assert_no_default_conflict(); } + // Needed because the following override shadows the rest of the overloads + using P::check_restricted; + /** * Decodes the given YAML node into the underlying property's value_type and * checks whether that value should be restricted to enterprise clusters @@ -1067,7 +1079,7 @@ class enterprise : public P { */ std::optional check_restricted(YAML::Node n) const final { auto v = std::move(n.as()); - if (check_restricted(v)) { + if (do_check_restricted(v)) { return std::make_optional( P::name().data(), ssx::sformat( @@ -1077,14 +1089,7 @@ class enterprise : public P { } private: - void assert_no_default_conflict() const { - vassert( - !check_restricted(this->default_value()), - "Enterprise properties must not restrict the default value of the " - "underlying property!"); - } - - bool check_restricted(const T& setting) const { + bool do_check_restricted(const T& setting) const final { // depending on how the restriction was defined, construct an applicable // check function for bare instances of the underlying value type auto restriction_check = [this](const val_t& v) -> bool { @@ -1113,6 +1118,13 @@ class enterprise : public P { } } + void assert_no_default_conflict() const { + vassert( + !do_check_restricted(this->default_value()), + "Enterprise properties must not restrict the default value of the " + "underlying property!"); + } + restrict_variant_t _restriction; }; diff --git a/src/v/config/tests/enterprise_property_test.cc b/src/v/config/tests/enterprise_property_test.cc index c19d05dcfcceb..3bda3965fba5d 100644 --- a/src/v/config/tests/enterprise_property_test.cc +++ b/src/v/config/tests/enterprise_property_test.cc @@ -7,13 +7,16 @@ // the Business Source License, use of this software will be governed // by the Apache License, Version 2.0 +#include "config/base_property.h" #include "config/config_store.h" #include "config/property.h" #include "config/types.h" +#include #include #include +#include #include namespace config { @@ -34,7 +37,7 @@ struct test_config : public config_store { true, "enterprise_bool", "An enterprise-only bool config", - meta{}, + meta{.needs_restart = 
needs_restart::no}, false, property::noop_validator, std::nullopt) @@ -43,27 +46,28 @@ struct test_config : public config_store { std::vector{"bar"}, "enterprise_str_enum", "An enterprise-only enum property", - meta{}, + meta{.needs_restart = needs_restart::no}, "foo", std::vector{"foo", "bar", "baz"}) , enterprise_str_vec( *this, std::vector{"GSSAPI"}, "enterprise_str_vec", - "An enterprise-only vector of strings") + "An enterprise-only vector of strings", + meta{.needs_restart = needs_restart::no}) , enterprise_opt_int( *this, [](const int& v) -> bool { return v > 1000; }, "enterprise_opt_int", "An enterprise-only optional int", - meta{}, + meta{.needs_restart = needs_restart::no}, 0) , enterprise_enum( *this, std::vector{tls_version::v1_3}, "enterprise_str_enum", "An enterprise-only enum property", - meta{}, + meta{.needs_restart = needs_restart::no}, tls_version::v1_1, std::vector{ tls_version::v1_0, diff --git a/src/v/features/BUILD b/src/v/features/BUILD index f6d2cfb856b8d..ab232bc48c4ee 100644 --- a/src/v/features/BUILD +++ b/src/v/features/BUILD @@ -50,6 +50,7 @@ redpanda_cc_library( visibility = ["//visibility:public"], deps = [ "//src/v/base", + "//src/v/config", "@abseil-cpp//absl/container:flat_hash_set", "@boost//:range", ], diff --git a/src/v/features/enterprise_features.h b/src/v/features/enterprise_features.h index b869c23090a9c..fcc0d8a4dd8c0 100644 --- a/src/v/features/enterprise_features.h +++ b/src/v/features/enterprise_features.h @@ -11,6 +11,9 @@ #pragma once +#include "config/configuration.h" +#include "config/property.h" + #include #include @@ -75,4 +78,60 @@ class enterprise_feature_report { vtype _disabled; }; +template +class sanctioning_binding { +public: + using T = P::value_type; + explicit sanctioning_binding(config::enterprise
<P>& prop)
+      : _prop(prop)
+      , _binding(_prop.bind()) {
+        update_sanctioned_state();
+        _binding.watch([this] { update_sanctioned_state(); });
+    }
+
+    config::binding<T>& binding() { return _binding; }
+    const config::binding<T>& binding() const { return _binding; }
+
+    std::pair<T, bool> operator()(bool should_sanction) const {
+        const auto& val = _binding();
+        if (should_sanction && _is_sanctioned) [[unlikely]] {
+            return std::make_pair(_prop.default_value(), true);
+        } else {
+            return std::make_pair(val, false);
+        }
+    }
+
+private:
+    config::enterprise<P>
& _prop; + config::binding _binding; + bool _is_sanctioned{false}; + + void update_sanctioned_state() { + _is_sanctioned = _prop.check_restricted(_binding()); + } +}; + +namespace details { +template +concept NonVoid = !std::is_void_v; + +} // namespace details +template +details::NonVoid auto make_sanctioning_binding() { + if constexpr ( + F == license_required_feature::partition_auto_balancing_continuous) { + return sanctioning_binding( + config::shard_local_cfg().partition_autobalancing_mode); + } + + if constexpr (F == license_required_feature::core_balancing_continuous) { + return sanctioning_binding( + config::shard_local_cfg().core_balancing_continuous); + } + + if constexpr (F == license_required_feature::leadership_pinning) { + return sanctioning_binding( + config::shard_local_cfg().default_leaders_preference); + } +} } // namespace features diff --git a/src/v/features/tests/BUILD b/src/v/features/tests/BUILD index 59b20e0c909a5..d08dbbd850515 100644 --- a/src/v/features/tests/BUILD +++ b/src/v/features/tests/BUILD @@ -1,4 +1,4 @@ -load("//bazel:test.bzl", "redpanda_cc_btest") +load("//bazel:test.bzl", "redpanda_cc_btest", "redpanda_cc_gtest") redpanda_cc_btest( name = "feature_table_test", @@ -16,3 +16,17 @@ redpanda_cc_btest( "@seastar//:testing", ], ) + +redpanda_cc_gtest( + name = "enterprise_features_test", + timeout = "short", + srcs = [ + "enterprise_features_test.cc", + ], + deps = [ + "//src/v/config", + "//src/v/features:enterprise_features", + "//src/v/test_utils:gtest", + "@googletest//:gtest", + ], +) diff --git a/src/v/features/tests/CMakeLists.txt b/src/v/features/tests/CMakeLists.txt index 6bf4ec66269b2..5e0ff1598c66e 100644 --- a/src/v/features/tests/CMakeLists.txt +++ b/src/v/features/tests/CMakeLists.txt @@ -9,4 +9,13 @@ rp_test( SOURCES feature_table_test.cc LIBRARIES v::seastar_testing_main v::features LABELS features -) \ No newline at end of file +) + +rp_test( + UNIT_TEST + GTEST + BINARY_NAME test_enterprise_features + SOURCES enterprise_features_test.cc + LIBRARIES v::gtest_main v::enterprise_features v::config + LABELS enterprise_features +) diff --git a/src/v/features/tests/enterprise_features_test.cc b/src/v/features/tests/enterprise_features_test.cc new file mode 100644 index 0000000000000..7f420eacb852e --- /dev/null +++ b/src/v/features/tests/enterprise_features_test.cc @@ -0,0 +1,179 @@ +// Copyright 2024 Redpanda Data, Inc. 
+// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +#include "config/base_property.h" +#include "config/config_store.h" +#include "config/property.h" +#include "config/types.h" +#include "features/enterprise_features.h" + +#include +#include + +namespace config { +namespace { + +struct test_config : public config_store { + enterprise> enterprise_bool; + enterprise> enterprise_str_enum; + enterprise>> enterprise_str_vec; + enterprise>> enterprise_opt_int; + enterprise> enterprise_enum; + + using meta = base_property::metadata; + + test_config() + : enterprise_bool( + *this, + true, + "enterprise_bool", + "An enterprise-only bool config", + meta{.needs_restart = needs_restart::no}, + false, + property::noop_validator, + std::nullopt) + , enterprise_str_enum( + *this, + std::vector{"bar"}, + "enterprise_str_enum", + "An enterprise-only enum property", + meta{.needs_restart = needs_restart::no}, + "foo", + std::vector{"foo", "bar", "baz"}) + , enterprise_str_vec( + *this, + std::vector{"GSSAPI"}, + "enterprise_str_vec", + "An enterprise-only vector of strings", + meta{.needs_restart = needs_restart::no}) + , enterprise_opt_int( + *this, + [](const int& v) -> bool { return v > 1000; }, + "enterprise_opt_int", + "An enterprise-only optional int", + meta{.needs_restart = needs_restart::no}, + 0) + , enterprise_enum( + *this, + std::vector{tls_version::v1_3}, + "enterprise_str_enum", + "An enterprise-only enum property", + meta{.needs_restart = needs_restart::no}, + tls_version::v1_1, + std::vector{ + tls_version::v1_0, + tls_version::v1_1, + tls_version::v1_2, + tls_version::v1_3}) {} +}; + +} // namespace +} // namespace config + +namespace features { +template +struct EnterpriseFeatureTest : public ::testing::Test { + using value_type = typename Property::value_type; + + struct test_values { + value_type default_value; + value_type allowed_value; + value_type restricted_value; + }; + + struct test_case { + config::enterprise* enterprise_conf; + test_values ts; + }; + + test_case get_test_case() { + if constexpr (std::same_as>) { + return { + &cfg.enterprise_bool, + {.default_value = false, + .allowed_value = false, + .restricted_value = true}}; + } + if constexpr (std:: + same_as>) { + return { + &cfg.enterprise_str_enum, + {.default_value = "foo", + .allowed_value = "baz", + .restricted_value = "bar"}}; + } + if constexpr (std::same_as< + Property, + config::property>>) { + return { + &cfg.enterprise_str_vec, + {.default_value{}, + .allowed_value = {"OTHER"}, + .restricted_value = {"GSSAPI", "OTHER"}}}; + } + if constexpr (std::same_as< + Property, + config::property>>) { + return { + &cfg.enterprise_opt_int, + {.default_value{0}, .allowed_value{10}, .restricted_value{1010}}}; + } + if constexpr (std::same_as< + Property, + config::enum_property>) { + return { + &cfg.enterprise_enum, + {.default_value = config::tls_version::v1_1, + .allowed_value = config::tls_version::v1_2, + .restricted_value = config::tls_version::v1_3}}; + } + } + + config::test_config cfg{}; +}; + +using EnterprisePropertyTypes = ::testing::Types< + config::property, + config::enum_property, + config::property>, + config::property>, + config::enum_property>; +TYPED_TEST_SUITE(EnterpriseFeatureTest, EnterprisePropertyTypes); + +TYPED_TEST(EnterpriseFeatureTest, SanctionedValues) { 
+ auto tc = this->get_test_case(); + auto& enterprise_conf = *tc.enterprise_conf; + const auto [default_value, allowed_value, restricted_value] = tc.ts; + + auto binded = sanctioning_binding{enterprise_conf}; + + // default value + EXPECT_EQ(enterprise_conf.value(), default_value); + + EXPECT_EQ(binded(true), std::make_pair(default_value, false)); + EXPECT_EQ(binded(false), std::make_pair(default_value, false)); + + // allowed value + enterprise_conf.set_value(allowed_value); + + EXPECT_EQ(enterprise_conf.value(), allowed_value); + + EXPECT_EQ(binded(true), std::make_pair(allowed_value, false)); + EXPECT_EQ(binded(false), std::make_pair(allowed_value, false)); + + // sanctioned value + enterprise_conf.set_value(restricted_value); + + EXPECT_EQ(enterprise_conf.value(), restricted_value); + + EXPECT_EQ(binded(true), std::make_pair(default_value, true)); + EXPECT_EQ(binded(false), std::make_pair(restricted_value, false)); +} + +} // namespace features From 2567b8841ed4dcef0f393b149628001af8bccf4f Mon Sep 17 00:00:00 2001 From: Ioannis Kavvadias Date: Fri, 25 Oct 2024 10:18:51 +0100 Subject: [PATCH 2/3] cluster: Sanction partition autobalancer on invalid license Implements CORE-8014 If there is no valid license and partition_autobalancing_mode is set to continuous it will be ignored and it will revert to node_add behavior. --- src/v/cluster/controller.cc | 2 +- src/v/cluster/partition_balancer_backend.cc | 25 ++++++++- src/v/cluster/partition_balancer_backend.h | 8 ++- tests/rptest/tests/partition_balancer_test.py | 56 +++++++++++++++++++ 4 files changed, 85 insertions(+), 6 deletions(-) diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc index 6bfd9e33a0563..31bef6c4b0fab 100644 --- a/src/v/cluster/controller.cc +++ b/src/v/cluster/controller.cc @@ -724,12 +724,12 @@ ss::future<> controller::start( co_await _partition_balancer.start_single( _raft0, std::ref(_stm), + std::ref(_feature_table), std::ref(_partition_balancer_state), std::ref(_hm_backend), std::ref(_partition_allocator), std::ref(_tp_frontend), std::ref(_members_frontend), - config::shard_local_cfg().partition_autobalancing_mode.bind(), config::shard_local_cfg() .partition_autobalancing_node_availability_timeout_sec.bind(), config::shard_local_cfg() diff --git a/src/v/cluster/partition_balancer_backend.cc b/src/v/cluster/partition_balancer_backend.cc index 543272badbe1c..743b1028bebe3 100644 --- a/src/v/cluster/partition_balancer_backend.cc +++ b/src/v/cluster/partition_balancer_backend.cc @@ -23,6 +23,9 @@ #include "cluster/types.h" #include "config/configuration.h" #include "config/property.h" +#include "features/enterprise_features.h" +#include "features/feature_table.h" +#include "model/metadata.h" #include "random/generators.h" #include "utils/stable_iterator_adaptor.h" @@ -43,12 +46,12 @@ static constexpr std::chrono::seconds add_move_cmd_timeout = 10s; partition_balancer_backend::partition_balancer_backend( consensus_ptr raft0, ss::sharded& controller_stm, + ss::sharded& feature_table, ss::sharded& state, ss::sharded& health_monitor, ss::sharded& partition_allocator, ss::sharded& topics_frontend, ss::sharded& members_frontend, - config::binding&& mode, config::binding&& availability_timeout, config::binding&& max_disk_usage_percent, config::binding&& storage_space_alert_free_threshold_percent, @@ -62,12 +65,15 @@ partition_balancer_backend::partition_balancer_backend( config::binding topic_aware) : _raft0(std::move(raft0)) , _controller_stm(controller_stm.local()) + , _feature_table(feature_table.local()) 
, _state(state.local()) , _health_monitor(health_monitor.local()) , _partition_allocator(partition_allocator.local()) , _topics_frontend(topics_frontend.local()) , _members_frontend(members_frontend.local()) - , _mode(std::move(mode)) + , _mode(features::make_sanctioning_binding< + features::license_required_feature:: + partition_auto_balancing_continuous>()) , _availability_timeout(std::move(availability_timeout)) , _max_disk_usage_percent(std::move(max_disk_usage_percent)) , _storage_space_alert_free_threshold_percent( @@ -358,9 +364,22 @@ ss::future<> partition_balancer_backend::do_tick() { // status requests by default 700ms const auto node_responsiveness_timeout = _node_status_interval() * 7; + const bool should_sanction = _feature_table.should_sanction(); + + const auto [mode, is_sanctioned] = _mode(should_sanction); + if (is_sanctioned) { + vlog( + clusterlog.warn, + "A Redpanda Enterprise Edition license is required to use enterprise " + "feature \"partition_autobalancing_mode\" with value \"{}\". " + "Behavior is being restricted to \"{}\".", + _mode(false).first, + mode); + } + partition_balancer_planner planner( planner_config{ - .mode = _mode(), + .mode = mode, .soft_max_disk_usage_ratio = soft_max_disk_usage_ratio, .hard_max_disk_usage_ratio = hard_max_disk_usage_ratio, .max_concurrent_actions = _max_concurrent_actions(), diff --git a/src/v/cluster/partition_balancer_backend.h b/src/v/cluster/partition_balancer_backend.h index e6bda567bc9b5..00e6b0642c638 100644 --- a/src/v/cluster/partition_balancer_backend.h +++ b/src/v/cluster/partition_balancer_backend.h @@ -16,6 +16,7 @@ #include "cluster/partition_balancer_types.h" #include "cluster/types.h" #include "config/property.h" +#include "features/enterprise_features.h" #include "model/fundamental.h" #include "raft/consensus.h" #include "utils/mutex.h" @@ -35,12 +36,12 @@ class partition_balancer_backend { partition_balancer_backend( consensus_ptr raft0, ss::sharded&, + ss::sharded&, ss::sharded&, ss::sharded&, ss::sharded&, ss::sharded&, ss::sharded&, - config::binding&& mode, config::binding&& availability_timeout, config::binding&& max_disk_usage_percent, config::binding&& storage_space_alert_free_threshold_percent, @@ -91,13 +92,16 @@ class partition_balancer_backend { consensus_ptr _raft0; controller_stm& _controller_stm; + features::feature_table& _feature_table; partition_balancer_state& _state; health_monitor_backend& _health_monitor; partition_allocator& _partition_allocator; topics_frontend& _topics_frontend; members_frontend& _members_frontend; - config::binding _mode; + features::sanctioning_binding< + config::enum_property> + _mode; config::binding _availability_timeout; config::binding _max_disk_usage_percent; config::binding _storage_space_alert_free_threshold_percent; diff --git a/tests/rptest/tests/partition_balancer_test.py b/tests/rptest/tests/partition_balancer_test.py index 52f7e5991b769..6d02a7f9df420 100644 --- a/tests/rptest/tests/partition_balancer_test.py +++ b/tests/rptest/tests/partition_balancer_test.py @@ -1206,3 +1206,59 @@ def test_recovery_mode_rebalance_finish(self): "partition counts not balanced" self.run_validation(consumer_timeout_sec=CONSUMER_TIMEOUT) + + @cluster(num_nodes=7, log_allow_list=CHAOS_LOG_ALLOW_LIST) + @matrix(disable_license=[True, False]) + def test_partition_autobalancer_sanction(self, disable_license): + def partitions_in_node(node_id): + return self.node2partition_count().get(node_id, 0) + + if disable_license: + environment = 
dict(__REDPANDA_DISABLE_BUILTIN_TRIAL_LICENSE='1') + else: + environment = dict() + + self.start_redpanda(num_nodes=5, environment=environment) + + self.topic = TopicSpec(partition_count=20) + self.client().create_topic(self.topic) + + self.start_producer(1) + self.start_consumer(1) + self.await_startup() + + with self.NodeStopper(self) as ns: + to_kill = random.choice(self.redpanda.nodes) + + kill_id = int(self.redpanda.node_id(to_kill)) + partitions_in_node_before = partitions_in_node(kill_id) + assert partitions_in_node_before != 0, f"Invalid Setup: Initial node partitions is \"{partitions_in_node_before}\"" + + ns.make_unavailable(to_kill, + failure_types=[FailureSpec.FAILURE_KILL]) + + time.sleep(30) + + admin = Admin(self.redpanda, + retry_codes=[503, 504], + retries_amount=10) + status = admin.get_partition_balancer_status(timeout=10) + + partitions_in_node_after = partitions_in_node(kill_id) + + s = status['status'] + assert s == 'ready', f"Expected status == 'ready' but got '{s}' instead" + + unavailable = status['violations'].get('unavailable_nodes', []) + if disable_license: + assert len( unavailable) == 0, \ + f"Expected no nodes in unavailable. unavailable: {unavailable}" + assert partitions_in_node_after == partitions_in_node_before + else: + assert kill_id in unavailable, f"Expected node with id \"{kill_id}\" in unavailable. unavailable: {unavailable}" + assert partitions_in_node_after == 0 + + # Restore the system to a fully healthy state before validation: + # not strictly necessary but simplifies debugging. + ns.make_available() + self.run_validation(consumer_timeout_sec=CONSUMER_TIMEOUT) From 7d3588fc0265c8723e5bbed22958b3d69b70df72 Mon Sep 17 00:00:00 2001 From: Ioannis Kavvadias Date: Tue, 29 Oct 2024 16:18:34 +0000 Subject: [PATCH 3/3] cluster: Sanction core continuous balancing on invalid license Implements CORE-8041 - Sanction-17 If there is no valid license and core_balancing_continuous is turned on, it will be ignored. 
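A minimal stand-alone sketch of the value/flag contract the shard balancer
relies on (the toy type below is hypothetical; it only models
features::sanctioning_binding from the first patch, not the real
implementation):

    // Toy model of the (effective_value, was_sanctioned) pair returned by
    // sanctioning_binding::operator(). Placeholder type, not Redpanda code.
    #include <iostream>
    #include <utility>

    struct toy_sanctioning_binding {
        bool configured_value; // what the operator set
        bool default_value;    // what the property falls back to
        bool is_restricted;    // true when the configured value needs a license

        std::pair<bool, bool> operator()(bool should_sanction) const {
            // Without a valid license (should_sanction == true), a restricted
            // value is replaced by the default and the flag reports it.
            if (should_sanction && is_restricted) {
                return {default_value, true};
            }
            return {configured_value, false};
        }
    };

    int main() {
        toy_sanctioning_binding b{
          .configured_value = true, // core_balancing_continuous enabled
          .default_value = false,
          .is_restricted = true};
        auto [value, sanctioned] = b(/*should_sanction=*/true);
        std::cout << "effective=" << value << " sanctioned=" << sanctioned << "\n";
        // prints: effective=0 sanctioned=1, i.e. the setting is ignored
    }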
--- src/v/cluster/controller.cc | 1 - src/v/cluster/shard_balancer.cc | 19 ++++++++++++--- src/v/cluster/shard_balancer.h | 5 ++-- tests/rptest/tests/shard_placement_test.py | 28 ++++++++++++++++------ 4 files changed, 40 insertions(+), 13 deletions(-) diff --git a/src/v/cluster/controller.cc b/src/v/cluster/controller.cc index 31bef6c4b0fab..08fc8c1e0dda1 100644 --- a/src/v/cluster/controller.cc +++ b/src/v/cluster/controller.cc @@ -511,7 +511,6 @@ ss::future<> controller::start( std::ref(_tp_state), std::ref(_backend), config::shard_local_cfg().core_balancing_on_core_count_change.bind(), - config::shard_local_cfg().core_balancing_continuous.bind(), config::shard_local_cfg().core_balancing_debounce_timeout.bind(), config::shard_local_cfg().topic_partitions_per_shard.bind(), config::shard_local_cfg().topic_partitions_reserve_shard0.bind()); diff --git a/src/v/cluster/shard_balancer.cc b/src/v/cluster/shard_balancer.cc index bccc7decd44b0..52600dc2d6834 100644 --- a/src/v/cluster/shard_balancer.cc +++ b/src/v/cluster/shard_balancer.cc @@ -14,6 +14,7 @@ #include "cluster/cluster_utils.h" #include "cluster/logger.h" #include "config/node_config.h" +#include "features/enterprise_features.h" #include "random/generators.h" #include "ssx/async_algorithm.h" #include "types.h" @@ -49,7 +50,6 @@ shard_balancer::shard_balancer( ss::sharded& topics, ss::sharded& cb, config::binding balancing_on_core_count_change, - config::binding balancing_continuous, config::binding debounce_timeout, config::binding partitions_per_shard, config::binding partitions_reserve_shard0) @@ -60,7 +60,9 @@ shard_balancer::shard_balancer( , _controller_backend(cb) , _self(*config::node().node_id()) , _balancing_on_core_count_change(std::move(balancing_on_core_count_change)) - , _balancing_continuous(std::move(balancing_continuous)) + , _balancing_continuous( + features::make_sanctioning_binding< + features::license_required_feature::core_balancing_continuous>()) , _debounce_timeout(std::move(debounce_timeout)) , _debounce_jitter(_debounce_timeout()) , _partitions_per_shard(std::move(partitions_per_shard)) @@ -461,9 +463,20 @@ void shard_balancer::maybe_assign( // partition is removed from this node, this will likely disrupt the // counts balance, so we set up the balancing timer. + const bool should_sanction = _features.should_sanction(); + const auto [balancing_continuous, is_sanctioned] + = _balancing_continuous(should_sanction); + if (is_sanctioned) { + vlog( + clusterlog.warn, + "A Redpanda Enterprise Edition license is required to use " + "enterprise feature \"core_balancing_continuous\". " + "This property is being ignored."); + } + if ( _features.is_active(features::feature::node_local_core_assignment) - && _balancing_continuous() && !_balance_timer.armed()) { + && balancing_continuous && !_balance_timer.armed()) { // Add jitter so that different nodes don't move replicas of the // same partition in unison. 
auto debounce_interval = _debounce_jitter.next_duration(); diff --git a/src/v/cluster/shard_balancer.h b/src/v/cluster/shard_balancer.h index bd85b6a1fe9c6..11edde77b0aff 100644 --- a/src/v/cluster/shard_balancer.h +++ b/src/v/cluster/shard_balancer.h @@ -13,7 +13,9 @@ #include "cluster/controller_backend.h" #include "cluster/shard_placement_table.h" +#include "config/property.h" #include "container/chunked_hash_map.h" +#include "features/enterprise_features.h" #include "random/simple_time_jitter.h" #include "ssx/event.h" #include "utils/mutex.h" @@ -41,7 +43,6 @@ class shard_balancer { ss::sharded&, ss::sharded&, config::binding balancing_on_core_count_change, - config::binding balancing_continuous, config::binding debounce_timeout, config::binding partitions_per_shard, config::binding partitions_reserve_shard0); @@ -116,7 +117,7 @@ class shard_balancer { model::node_id _self; config::binding _balancing_on_core_count_change; - config::binding _balancing_continuous; + features::sanctioning_binding> _balancing_continuous; config::binding _debounce_timeout; simple_time_jitter _debounce_jitter; config::binding _partitions_per_shard; diff --git a/tests/rptest/tests/shard_placement_test.py b/tests/rptest/tests/shard_placement_test.py index 58587b5c21652..7fe04e41cbae2 100644 --- a/tests/rptest/tests/shard_placement_test.py +++ b/tests/rptest/tests/shard_placement_test.py @@ -7,6 +7,7 @@ # the Business Source License, use of this software will be governed # by the Apache License, Version 2.0 +import time from ducktape.utils.util import wait_until from rptest.services.cluster import cluster @@ -17,6 +18,7 @@ from rptest.tests.prealloc_nodes import PreallocNodesTest from rptest.services.redpanda_installer import RedpandaInstaller from rptest.util import wait_until_result +from ducktape.mark import matrix class ShardPlacementTest(PreallocNodesTest): @@ -547,10 +549,17 @@ def check_balanced_shard_map(shard_map, num_cpus): self.stop_client_load() @cluster(num_nodes=6) - def test_node_join(self): + @matrix(disable_license=[True, False]) + def test_node_join(self, disable_license): self.redpanda.add_extra_rp_conf({ "core_balancing_continuous": True, }) + if disable_license: + environment = dict(__REDPANDA_DISABLE_BUILTIN_TRIAL_LICENSE='1') + else: + environment = dict() + self.redpanda.set_environment(environment) + seed_nodes = self.redpanda.nodes[0:3] joiner_nodes = self.redpanda.nodes[3:] self.redpanda.start(nodes=seed_nodes) @@ -584,7 +593,7 @@ def shard_rebalance_finished(): for topic in topics: total_count = sum( - len(replicas) for p, replicas in shard_map[topic].items()) + len(replicas) for _, replicas in shard_map[topic].items()) if total_count != n_partitions * 3: return False @@ -599,11 +608,16 @@ def shard_rebalance_finished(): return (True, shard_map) - shard_map_after_balance = wait_until_result(shard_rebalance_finished, - timeout_sec=60, - backoff_sec=2) - self.logger.info("shard rebalance finished") - self.print_shard_stats(shard_map_after_balance) + if disable_license: + time.sleep(60) + rebalanced = shard_rebalance_finished() + assert not rebalanced, f"continuous core balancing should not be active. Received {rebalanced}" + else: + shard_map_after_balance = wait_until_result( + shard_rebalance_finished, timeout_sec=60, backoff_sec=2) + self.logger.info("shard rebalance finished") + self.print_shard_stats(shard_map_after_balance) + self.wait_shard_map_consistent_with_cluster_partitions( user_topics=topics, admin=admin)
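
Taken together, the three patches hinge on one hook introduced in the first
one: property<T>::check_restricted() forwards to a private virtual
do_check_restricted() that plain properties answer with "unrestricted" and
enterprise<P> overrides, so a sanctioning_binding can ask any property whether
its current value is license-gated without knowing how the restriction was
expressed. A stand-alone sketch of that pattern, with placeholder names rather
than the real config:: types:

    #include <iostream>

    class base_setting {
    public:
        virtual ~base_setting() = default;
        // Public entry point, analogous to property<T>::check_restricted().
        bool check_restricted(int value) const {
            return do_check_restricted(value);
        }

    private:
        // Plain settings are never license-gated.
        virtual bool do_check_restricted(int) const { return false; }
    };

    class enterprise_setting final : public base_setting {
    private:
        // The enterprise wrapper gates values above a licensed threshold,
        // mirroring the predicate form of enterprise<P>'s restriction.
        bool do_check_restricted(int value) const final { return value > 1000; }
    };

    int main() {
        enterprise_setting s;
        std::cout << s.check_restricted(10) << " " << s.check_restricted(2000)
                  << "\n";
        // prints: 0 1
    }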