diff --git a/rs/cli/src/commands/network.rs b/rs/cli/src/commands/network.rs index 2fc17f965..fa8a6b93a 100644 --- a/rs/cli/src/commands/network.rs +++ b/rs/cli/src/commands/network.rs @@ -10,6 +10,11 @@ pub struct Network { #[clap(long)] pub heal: bool, + /// Optimize the decentralization of the subnets that are not compliant with the + /// business rules (target topology). + #[clap(long, visible_alias = "optimize")] + pub optimize_decentralization: bool, + /// Ensure that at least one node of each node operator is /// assigned to some (any) subnet. Node will only be assigned to a subnet if /// this does not worsen the decentralization of the target subnet. @@ -37,9 +42,11 @@ impl ExecutableCommand for Network { let ic_admin = ctx.ic_admin().await?; let mut errors = vec![]; let network_heal = self.heal || std::env::args().any(|arg| arg == "heal"); - if network_heal { + if network_heal || self.optimize_decentralization { info!("Healing the network by replacing unhealthy nodes and optimizing decentralization in subnets that have unhealthy nodes"); - let proposals = runner.network_heal(ctx.forum_post_link(), &self.skip_subnets).await?; + let proposals = runner + .network_heal(ctx.forum_post_link(), &self.skip_subnets, self.optimize_decentralization) + .await?; for proposal in proposals { if let Err(e) = ic_admin.propose_run(proposal.cmd, proposal.opts).await { errors.push(e); diff --git a/rs/cli/src/runner.rs b/rs/cli/src/runner.rs index 3bb3d3aec..267e9748d 100644 --- a/rs/cli/src/runner.rs +++ b/rs/cli/src/runner.rs @@ -505,7 +505,12 @@ impl Runner { }) } - pub async fn network_heal(&self, forum_post_link: Option, skip_subnets: &[String]) -> anyhow::Result> { + pub async fn network_heal( + &self, + forum_post_link: Option, + skip_subnets: &[String], + optimize_decentralization: bool, + ) -> anyhow::Result> { let mut errors = vec![]; // Get the list of subnets, and the list of open proposal for each subnet, if any @@ -544,6 +549,7 @@ impl Runner { vec![] }), &all_nodes, + optimize_decentralization, ) .await?; diff --git a/rs/decentralization/src/nakamoto/mod.rs b/rs/decentralization/src/nakamoto/mod.rs index e56951f5e..5d0798de9 100644 --- a/rs/decentralization/src/nakamoto/mod.rs +++ b/rs/decentralization/src/nakamoto/mod.rs @@ -647,7 +647,10 @@ mod tests { // expected error message assert_eq!( new_test_subnet(0, 2, 0).check_business_rules().unwrap(), - (1000, vec!["Subnet should have 1 DFINITY-owned nodes, got 0".to_string()]) + ( + 1000, + vec!["Subnet should have 1 DFINITY-owned node(s) for subnet recovery, got 0".to_string()] + ) ); } @@ -998,7 +1001,7 @@ mod tests { important.insert(subnet.principal, subnet); let network_heal_response = NetworkHealRequest::new(important.clone()) - .heal_and_optimize(nodes_available.clone(), &health_of_nodes, vec![], &all_nodes) + .heal_and_optimize(nodes_available.clone(), &health_of_nodes, vec![], &all_nodes, false) .await .unwrap(); let result = network_heal_response.first().unwrap().clone(); diff --git a/rs/decentralization/src/network.rs b/rs/decentralization/src/network.rs index b89f1915e..95c4ec2a1 100644 --- a/rs/decentralization/src/network.rs +++ b/rs/decentralization/src/network.rs @@ -1,5 +1,5 @@ use crate::nakamoto::NakamotoScore; -use crate::subnets::unhealthy_with_nodes; +use crate::subnets::{subnets_with_business_rules_violations, unhealthy_with_nodes}; use crate::SubnetChangeResponse; use actix_web::http::StatusCode; use actix_web::{HttpResponse, ResponseError}; @@ -176,7 +176,7 @@ impl DecentralizedSubnet { if dfinity_owned_nodes_count != target_dfinity_owned_nodes_count { checks.push(format!( - "Subnet should have {} DFINITY-owned nodes, got {}", + "Subnet should have {} DFINITY-owned node(s) for subnet recovery, got {}", target_dfinity_owned_nodes_count, dfinity_owned_nodes_count )); penalties += target_dfinity_owned_nodes_count.abs_diff(dfinity_owned_nodes_count) * 1000; @@ -1258,6 +1258,7 @@ impl NetworkHealRequest { health_of_nodes: &IndexMap, cordoned_features: Vec, all_nodes: &[Node], + optimize_for_business_rules_compliance: bool, ) -> Result, NetworkError> { let mut subnets_changed = Vec::new(); let subnets_to_heal = unhealthy_with_nodes(&self.subnets, health_of_nodes) @@ -1273,12 +1274,42 @@ impl NetworkHealRequest { }) .sorted_by(|a, b| a.cmp(b).reverse()) .collect_vec(); + let subnets_to_optimize = if optimize_for_business_rules_compliance { + // Exclude subnets that are already in subnets_to_heal + let subnet_ids_to_heal = subnets_to_heal + .iter() + .map(|subnet| subnet.decentralized_subnet.id) + .collect::>(); + let subnets = self + .subnets + .iter() + .filter_map(|(subnet_id, subnet)| { + if subnet_ids_to_heal.contains(subnet_id) { + None + } else { + Some(subnet.clone()) + } + }) + .collect_vec(); + // Find subnets that have business rules violations + subnets_with_business_rules_violations(&subnets) + .into_iter() + .map(|subnet| NetworkHealSubnets { + name: subnet.metadata.name.clone(), + decentralized_subnet: DecentralizedSubnet::from(subnet), + unhealthy_nodes: vec![], + }) + .sorted_by(|a, b| a.cmp(b).reverse()) + .collect_vec() + } else { + vec![] + }; - if subnets_to_heal.is_empty() { - info!("Nothing to do! All subnets are healthy.") + if subnets_to_heal.is_empty() && subnets_to_optimize.is_empty() { + info!("Nothing to do! All subnets are healthy and compliant with business rules.") } - for subnet in subnets_to_heal { + for subnet in subnets_to_heal.into_iter().chain(subnets_to_optimize) { // If more than 1/3 nodes do not have the latest subnet state, subnet will stall. // From those 1/2 are added and 1/2 removed -> nodes_in_subnet/3 * 1/2 = nodes_in_subnet/6 let max_replaceable_nodes = subnet.decentralized_subnet.nodes.len() / 6; @@ -1406,6 +1437,11 @@ impl NetworkHealRequest { .expect("No suitable changes found") }; + if change.node_ids_removed.is_empty() { + warn!("No suitable changes found for subnet {}", subnet.decentralized_subnet.id); + continue; + } + info!( "Replacing {} nodes in subnet {} gives Nakamoto coefficient: {}\n", change.node_ids_removed.len(), diff --git a/rs/decentralization/src/subnets.rs b/rs/decentralization/src/subnets.rs index cd6a5a3f0..ea2e2dd45 100644 --- a/rs/decentralization/src/subnets.rs +++ b/rs/decentralization/src/subnets.rs @@ -7,6 +7,8 @@ use indexmap::IndexMap; use itertools::Itertools; use std::sync::Arc; +use crate::network::DecentralizedSubnet; + pub fn unhealthy_with_nodes( subnets: &IndexMap, nodes_health: &IndexMap, @@ -33,6 +35,26 @@ pub fn unhealthy_with_nodes( .collect::>() } +pub fn subnets_with_business_rules_violations(subnets: &[Subnet]) -> Vec { + subnets + .iter() + .filter_map(|subnet| { + let decentralized_subnet = DecentralizedSubnet::from(subnet.clone()); + + if decentralized_subnet + .check_business_rules() + .expect("business rules check should succeed") + .0 + > 0 + { + Some(subnet.clone()) + } else { + None + } + }) + .collect_vec() +} + pub struct NodesRemover { pub no_auto: bool, pub remove_degraded: bool,