Skip to content

Commit

Permalink
Fix simtest failure due to pruned checkpoints after long sync timeout (#16205)
Browse files Browse the repository at this point in the history

Also adds improved debug logging in a few places.
  • Loading branch information
aschran authored Feb 13, 2024
1 parent e6c5adb commit dbaf4c8
Show file tree
Hide file tree
Showing 7 changed files with 51 additions and 5 deletions.
2 changes: 1 addition & 1 deletion crates/sui-config/src/p2p.rs
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ pub struct StateSyncConfig {
/// - in case of a fork, to prevent the node from syncing to the wrong chain.
/// - in case of a network stall, to force the node to proceed with a manually-injected
/// checkpoint.
#[serde(skip_serializing_if = "Vec::is_empty")]
#[serde(skip_serializing_if = "Vec::is_empty", default)]
pub pinned_checkpoints: Vec<(CheckpointSequenceNumber, CheckpointDigest)>,

/// Query peers for their latest checkpoint every interval period.
Expand Down
2 changes: 2 additions & 0 deletions crates/sui-e2e-tests/tests/protocol_version_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,10 @@ mod sim_only_tests {
.with_supported_protocol_version_callback(Arc::new(|idx, name| {
if name.is_some() && idx == 0 {
// first validator only does not support version FINISH.
info!("node {name:?} supports protocol versions up to {START}");
SupportedProtocolVersions::new_for_testing(START, START)
} else {
info!("node {name:?} supports protocol versions up to {FINISH}");
SupportedProtocolVersions::new_for_testing(START, FINISH)
}
}))
Expand Down
6 changes: 6 additions & 0 deletions crates/sui-network/src/state_sync/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1316,6 +1316,7 @@ where
tokio::time::sleep(duration).await;
return Err(checkpoint);
};
debug!("completed checkpoint contents sync");
Ok(checkpoint)
}

Expand Down Expand Up @@ -1346,6 +1347,11 @@ where
// Iterate through our selected peers trying each one in turn until we're able to
// successfully get the target checkpoint
for mut peer in peers {
debug!(
?timeout,
"requesting checkpoint contents from {}",
peer.inner().peer_id(),
);
let request = Request::new(digest).with_timeout(timeout);
if let Some(contents) = peer
.get_checkpoint_contents(request)
Expand Down
14 changes: 13 additions & 1 deletion crates/sui-swarm-config/src/node_config_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use sui_config::node::{
DEFAULT_GRPC_CONCURRENCY_LIMIT,
};
use sui_config::node::{default_zklogin_oauth_providers, ConsensusProtocol, RunWithRange};
use sui_config::p2p::{P2pConfig, SeedPeer};
use sui_config::p2p::{P2pConfig, SeedPeer, StateSyncConfig};
use sui_config::{
local_ip_utils, ConsensusConfig, NodeConfig, AUTHORITIES_DB_NAME, CONSENSUS_DB_NAME,
FULL_NODE_DB_PATH,
Expand Down Expand Up @@ -128,6 +128,12 @@ impl ValidatorConfigBuilder {
.unwrap()
}),
external_address: Some(validator.p2p_address),
// Set a shorter timeout for checkpoint content download in tests, since
// checkpoint pruning also happens much faster, and network is local.
state_sync: Some(StateSyncConfig {
checkpoint_content_timeout_ms: Some(10_000),
..Default::default()
}),
..Default::default()
};

Expand Down Expand Up @@ -364,6 +370,12 @@ impl FullnodeConfigBuilder {
.p2p_external_address
.or(Some(validator_config.p2p_address.clone())),
seed_peers,
// Set a shorter timeout for checkpoint content download in tests, since
// checkpoint pruning also happens much faster, and network is local.
state_sync: Some(StateSyncConfig {
checkpoint_content_timeout_ms: Some(10_000),
..Default::default()
}),
..Default::default()
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -171,6 +173,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -292,6 +296,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -413,6 +419,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -534,6 +542,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -655,6 +665,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down Expand Up @@ -776,6 +788,8 @@ validator_configs:
grpc-concurrency-limit: 20000000000
p2p-config:
listen-address: "0.0.0.0:1"
state-sync:
checkpoint-content-timeout-ms: 10000
genesis:
genesis: "[fake genesis]"
authority-store-pruning-config:
Expand Down
14 changes: 12 additions & 2 deletions crates/sui-swarm/src/memory/swarm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use std::{
ops,
path::{Path, PathBuf},
};

use sui_config::node::{DBCheckpointConfig, OverloadThresholdConfig, RunWithRange};
use sui_config::NodeConfig;
use sui_macros::nondeterministic;
Expand All @@ -28,6 +27,7 @@ use sui_swarm_config::node_config_builder::FullnodeConfigBuilder;
use sui_types::base_types::AuthorityName;
use sui_types::object::Object;
use tempfile::TempDir;
use tracing::info;

pub struct SwarmBuilder<R = OsRng> {
rng: R,
Expand Down Expand Up @@ -295,7 +295,13 @@ impl<R: rand::RngCore + rand::CryptoRng> SwarmBuilder<R> {
let mut nodes: HashMap<_, _> = network_config
.validator_configs()
.iter()
.map(|config| (config.protocol_public_key(), Node::new(config.to_owned())))
.map(|config| {
info!(
"SwarmBuilder configuring validator with name {}",
config.protocol_public_key()
);
(config.protocol_public_key(), Node::new(config.to_owned()))
})
.collect();

let mut fullnode_config_builder = FullnodeConfigBuilder::new()
Expand Down Expand Up @@ -326,6 +332,10 @@ impl<R: rand::RngCore + rand::CryptoRng> SwarmBuilder<R> {
}
}
let config = builder.build(&mut OsRng, &network_config);
info!(
"SwarmBuilder configuring full node with name {}",
config.protocol_public_key()
);
nodes.insert(config.protocol_public_key(), Node::new(config));
});
}
Expand Down
4 changes: 3 additions & 1 deletion crates/test-cluster/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,9 @@ impl TestCluster {
unreachable!("Broken reconfig channel");
})
.await
.expect("Timed out waiting for cluster to target epoch")
.unwrap_or_else(|_| {
panic!("Timed out waiting for cluster to target epoch {target_epoch:?}")
})
}

pub async fn wait_for_run_with_range_shutdown_signal(&self) -> Option<RunWithRange> {
Expand Down

0 comments on commit dbaf4c8

Please sign in to comment.