Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(decentralization): add --optimize to network heal for subnets not compliant with target topology #1085

Merged
merged 5 commits into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions rs/cli/src/commands/network.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ pub struct Network {
#[clap(long)]
pub heal: bool,

/// Optimize the decentralization of the subnets that are not compliant with the
/// business rules (target topology).
#[clap(long, visible_alias = "optimize")]
pub optimize_decentralization: bool,

/// Ensure that at least one node of each node operator is
/// assigned to some (any) subnet. Node will only be assigned to a subnet if
/// this does not worsen the decentralization of the target subnet.
Expand Down Expand Up @@ -37,9 +42,11 @@ impl ExecutableCommand for Network {
let ic_admin = ctx.ic_admin().await?;
let mut errors = vec![];
let network_heal = self.heal || std::env::args().any(|arg| arg == "heal");
if network_heal {
if network_heal || self.optimize_decentralization {
info!("Healing the network by replacing unhealthy nodes and optimizing decentralization in subnets that have unhealthy nodes");
let proposals = runner.network_heal(ctx.forum_post_link(), &self.skip_subnets).await?;
let proposals = runner
.network_heal(ctx.forum_post_link(), &self.skip_subnets, self.optimize_decentralization)
.await?;
for proposal in proposals {
if let Err(e) = ic_admin.propose_run(proposal.cmd, proposal.opts).await {
errors.push(e);
Expand Down
8 changes: 7 additions & 1 deletion rs/cli/src/runner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,12 @@ impl Runner {
})
}

pub async fn network_heal(&self, forum_post_link: Option<String>, skip_subnets: &[String]) -> anyhow::Result<Vec<RunnerProposal>> {
pub async fn network_heal(
&self,
forum_post_link: Option<String>,
skip_subnets: &[String],
optimize_decentralization: bool,
) -> anyhow::Result<Vec<RunnerProposal>> {
let mut errors = vec![];

// Get the list of subnets, and the list of open proposals for each subnet, if any
Expand Down Expand Up @@ -544,6 +549,7 @@ impl Runner {
vec![]
}),
&all_nodes,
optimize_decentralization,
)
.await?;

Expand Down
7 changes: 5 additions & 2 deletions rs/decentralization/src/nakamoto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,10 @@ mod tests {
// expected error message
assert_eq!(
new_test_subnet(0, 2, 0).check_business_rules().unwrap(),
(1000, vec!["Subnet should have 1 DFINITY-owned nodes, got 0".to_string()])
(
1000,
vec!["Subnet should have 1 DFINITY-owned node(s) for subnet recovery, got 0".to_string()]
)
);
}

Expand Down Expand Up @@ -998,7 +1001,7 @@ mod tests {
important.insert(subnet.principal, subnet);

let network_heal_response = NetworkHealRequest::new(important.clone())
.heal_and_optimize(nodes_available.clone(), &health_of_nodes, vec![], &all_nodes)
.heal_and_optimize(nodes_available.clone(), &health_of_nodes, vec![], &all_nodes, false)
.await
.unwrap();
let result = network_heal_response.first().unwrap().clone();
Expand Down
46 changes: 41 additions & 5 deletions rs/decentralization/src/network.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::nakamoto::NakamotoScore;
use crate::subnets::unhealthy_with_nodes;
use crate::subnets::{subnets_with_business_rules_violations, unhealthy_with_nodes};
use crate::SubnetChangeResponse;
use actix_web::http::StatusCode;
use actix_web::{HttpResponse, ResponseError};
Expand Down Expand Up @@ -176,7 +176,7 @@ impl DecentralizedSubnet {

if dfinity_owned_nodes_count != target_dfinity_owned_nodes_count {
checks.push(format!(
"Subnet should have {} DFINITY-owned nodes, got {}",
"Subnet should have {} DFINITY-owned node(s) for subnet recovery, got {}",
target_dfinity_owned_nodes_count, dfinity_owned_nodes_count
));
penalties += target_dfinity_owned_nodes_count.abs_diff(dfinity_owned_nodes_count) * 1000;
Expand Down Expand Up @@ -1258,6 +1258,7 @@ impl NetworkHealRequest {
health_of_nodes: &IndexMap<PrincipalId, HealthStatus>,
cordoned_features: Vec<NodeFeaturePair>,
all_nodes: &[Node],
optimize_for_business_rules_compliance: bool,
) -> Result<Vec<SubnetChangeResponse>, NetworkError> {
let mut subnets_changed = Vec::new();
let subnets_to_heal = unhealthy_with_nodes(&self.subnets, health_of_nodes)
Expand All @@ -1273,12 +1274,42 @@ impl NetworkHealRequest {
})
.sorted_by(|a, b| a.cmp(b).reverse())
.collect_vec();
let subnets_to_optimize = if optimize_for_business_rules_compliance {
// Exclude subnets that are already in subnets_to_heal
let subnet_ids_to_heal = subnets_to_heal
.iter()
.map(|subnet| subnet.decentralized_subnet.id)
.collect::<AHashSet<_>>();
let subnets = self
.subnets
.iter()
.filter_map(|(subnet_id, subnet)| {
if subnet_ids_to_heal.contains(subnet_id) {
None
} else {
Some(subnet.clone())
}
})
.collect_vec();
// Find subnets that have business rules violations
subnets_with_business_rules_violations(&subnets)
.into_iter()
.map(|subnet| NetworkHealSubnets {
name: subnet.metadata.name.clone(),
decentralized_subnet: DecentralizedSubnet::from(subnet),
unhealthy_nodes: vec![],
})
.sorted_by(|a, b| a.cmp(b).reverse())
.collect_vec()
} else {
vec![]
};

if subnets_to_heal.is_empty() {
info!("Nothing to do! All subnets are healthy.")
if subnets_to_heal.is_empty() && subnets_to_optimize.is_empty() {
info!("Nothing to do! All subnets are healthy and compliant with business rules.")
}

for subnet in subnets_to_heal {
for subnet in subnets_to_heal.into_iter().chain(subnets_to_optimize) {
// If more than 1/3 nodes do not have the latest subnet state, subnet will stall.
// From those 1/2 are added and 1/2 removed -> nodes_in_subnet/3 * 1/2 = nodes_in_subnet/6
let max_replaceable_nodes = subnet.decentralized_subnet.nodes.len() / 6;
Expand Down Expand Up @@ -1406,6 +1437,11 @@ impl NetworkHealRequest {
.expect("No suitable changes found")
};

if change.node_ids_removed.is_empty() {
warn!("No suitable changes found for subnet {}", subnet.decentralized_subnet.id);
continue;
}

info!(
"Replacing {} nodes in subnet {} gives Nakamoto coefficient: {}\n",
change.node_ids_removed.len(),
Expand Down
22 changes: 22 additions & 0 deletions rs/decentralization/src/subnets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ use indexmap::IndexMap;
use itertools::Itertools;
use std::sync::Arc;

use crate::network::DecentralizedSubnet;

pub fn unhealthy_with_nodes(
subnets: &IndexMap<PrincipalId, Subnet>,
nodes_health: &IndexMap<PrincipalId, HealthStatus>,
Expand All @@ -33,6 +35,26 @@ pub fn unhealthy_with_nodes(
.collect::<IndexMap<_, _>>()
}

/// Selects the subnets that currently violate the business rules.
///
/// Every subnet is converted into a [`DecentralizedSubnet`] and passed through
/// `check_business_rules`. A subnet is kept when the first element of the
/// returned tuple is greater than zero. The result holds clones of the
/// matching subnets, in the same order as the input slice.
///
/// # Panics
///
/// Panics if `check_business_rules` returns an error for any of the subnets.
pub fn subnets_with_business_rules_violations(subnets: &[Subnet]) -> Vec<Subnet> {
    subnets
        .iter()
        .filter(|candidate| {
            // The conversion consumes its argument, so work on a clone of the subnet.
            let outcome = DecentralizedSubnet::from((*candidate).clone())
                .check_business_rules()
                .expect("business rules check should succeed");
            outcome.0 > 0
        })
        .cloned()
        .collect()
}

pub struct NodesRemover {
pub no_auto: bool,
pub remove_degraded: bool,
Expand Down
Loading