test(resharding): Adjust State mapping check for single shard tracking #12706
@@ -459,10 +459,8 @@ fn test_resharding_v3_base(params: TestReshardingParameters) {
        TrieSanityCheck::new(&clients, params.load_mem_tries_for_tracked_shards);

    let latest_block_height = Cell::new(0u64);
    // Height of a block after resharding.
    let new_layout_block_height = Cell::new(None);
    // Height of an epoch after resharding.
    let new_layout_epoch_height = Cell::new(None);
    let resharding_block_hash = Cell::new(None);
    let epoch_height_after_resharding = Cell::new(None);
    let success_condition = |test_loop_data: &mut TestLoopData| -> bool {
        params
            .loop_actions
@@ -485,56 +483,47 @@ fn test_resharding_v3_base(params: TestReshardingParameters) {

        let client = clients[client_index];
        let block_header = client.chain.get_block_header(&tip.last_block_hash).unwrap();
        let shard_layout = client.epoch_manager.get_shard_layout(&tip.epoch_id).unwrap();
        println!("Block: {:?} {} {:?}", tip.last_block_hash, tip.height, block_header.chunk_mask());
        println!("Shard IDs: {:?}", shard_layout.shard_ids().collect_vec());

        // Check that all chunks are included.
        if params.all_chunks_expected && params.chunk_ranges_to_drop.is_empty() {
            assert!(block_header.chunk_mask().iter().all(|chunk_bit| *chunk_bit));
        }

        let shard_layout = client.epoch_manager.get_shard_layout(&tip.epoch_id).unwrap();
        println!(
            "new block #{} shards: {:?} chunk mask {:?}",
            tip.height,
            shard_layout.shard_ids().collect_vec(),
            block_header.chunk_mask().to_vec()
        );

        trie_sanity_check.assert_state_sanity(&clients, expected_num_shards);

        let epoch_height =
            client.epoch_manager.get_epoch_height_from_prev_block(&tip.prev_block_hash).unwrap();

        // Return false if we have not yet passed an epoch with increased number of shards.
        if new_layout_epoch_height.get().is_none() {
            assert!(epoch_height < 6);
            let prev_epoch_id = client
                .epoch_manager
                .get_prev_epoch_id_from_prev_block(&tip.prev_block_hash)
                .unwrap();
            let epoch_config = client.epoch_manager.get_epoch_config(&prev_epoch_id).unwrap();
        let epoch_id =
[review comment] could also just use …
[reply] changed in fa8e29e
            client.epoch_manager.get_epoch_id_from_prev_block(&tip.prev_block_hash).unwrap();
        let epoch_info = client.epoch_manager.get_epoch_info(&epoch_id).unwrap();
        let epoch_height = epoch_info.epoch_height();

        // Return false if we have not resharded yet.
        if epoch_height_after_resharding.get().is_none() {
            assert!(epoch_height < 5);
            let epoch_config = client.epoch_manager.get_epoch_config(&epoch_id).unwrap();
            if epoch_config.shard_layout.num_shards() != expected_num_shards {
                return false;
            }
            // Just passed an epoch with increased number of shards.
            new_layout_block_height.set(Some(latest_block_height.get()));
            new_layout_epoch_height.set(Some(epoch_height));
            // Just resharded.
            resharding_block_hash.set(Some(tip.prev_block_hash));
            epoch_height_after_resharding.set(Some(epoch_height));
            // Assert that we will have a chance for gc to kick in before the test is over.
            assert!(epoch_height + GC_NUM_EPOCHS_TO_KEEP < TESTLOOP_NUM_EPOCHS_TO_WAIT);
            println!("State after resharding:");
            print_and_assert_shard_accounts(&clients, &tip);
        }

        check_state_shard_uid_mapping_after_resharding(
            &client,
            parent_shard_uid,
            params.allow_negative_refcount,
        );
        for client in clients {
            check_state_shard_uid_mapping_after_resharding(
                client,
                &tip.prev_block_hash,
                &resharding_block_hash.get().unwrap(),
                parent_shard_uid,
                params.allow_negative_refcount,
            );
        }

        // Return false if garbage collection window has not passed yet since resharding.
        if epoch_height <= new_layout_epoch_height.get().unwrap() + GC_NUM_EPOCHS_TO_KEEP {
        if epoch_height <= epoch_height_after_resharding.get().unwrap() + GC_NUM_EPOCHS_TO_KEEP {
            return false;
        }
        for loop_action in &params.loop_actions {
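The replacement variables above (resharding_block_hash, epoch_height_after_resharding) live in Cells because the success condition is an ordinary closure polled repeatedly by the test loop, so any state that must survive across calls needs interior mutability. Below is a minimal standalone sketch of that pattern with made-up heights and shard counts; the real test works with TestLoopData, the epoch manager, and GC_NUM_EPOCHS_TO_KEEP rather than these toy arguments.

```rust
use std::cell::Cell;

fn main() {
    // Interior mutability lets an `Fn` closure record when resharding was first observed.
    let epoch_height_after_resharding: Cell<Option<u64>> = Cell::new(None);
    let gc_num_epochs_to_keep = 3u64; // stand-in for GC_NUM_EPOCHS_TO_KEEP

    let success_condition = |epoch_height: u64, num_shards: u64, expected_num_shards: u64| -> bool {
        if epoch_height_after_resharding.get().is_none() {
            if num_shards != expected_num_shards {
                // Not resharded yet: keep the test loop running.
                return false;
            }
            // Just resharded: remember the epoch height so the GC window is measured from here.
            epoch_height_after_resharding.set(Some(epoch_height));
        }
        // Succeed only once the garbage collection window has passed since resharding.
        epoch_height > epoch_height_after_resharding.get().unwrap() + gc_num_epochs_to_keep
    };

    assert!(!success_condition(4, 2, 3)); // old layout, still 2 shards
    assert!(!success_condition(5, 3, 3)); // first epoch with the new layout
    assert!(success_condition(9, 3, 3)); // GC window has passed
}
```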
integration-tests/src/test_loop/utils/sharding.rs
@@ -118,3 +118,25 @@ pub fn shard_was_split(shard_layout: &ShardLayout, shard_id: ShardId) -> bool {
    };
    parent != shard_id
}

pub fn get_tracked_shards_from_prev_block(
    client: &Client,
    prev_block_hash: &CryptoHash,
) -> Vec<ShardUId> {
    let account_id =
[review comment] nit: could avoid cloning:

diff --git a/integration-tests/src/test_loop/utils/sharding.rs b/integration-tests/src/test_loop/utils/sharding.rs
index 9db148659..5dab61245 100644
--- a/integration-tests/src/test_loop/utils/sharding.rs
+++ b/integration-tests/src/test_loop/utils/sharding.rs
@@ -123,14 +123,14 @@ pub fn get_tracked_shards_from_prev_block(
     client: &Client,
     prev_block_hash: &CryptoHash,
 ) -> Vec<ShardUId> {
-    let account_id =
-        client.validator_signer.get().map(|validator| validator.validator_id().clone());
+    let signer = client.validator_signer.get();
+    let account_id = signer.as_ref().map(|s| s.validator_id());
     let mut tracked_shards = vec![];
     for shard_uid in
         client.epoch_manager.get_shard_layout_from_prev_block(prev_block_hash).unwrap().shard_uids()
     {
         if client.shard_tracker.care_about_shard(
-            account_id.as_ref(),
+            account_id,
             prev_block_hash,
             shard_uid.shard_id(),
             true,

[reply] fixed in fa8e29e
        client.validator_signer.get().map(|validator| validator.validator_id().clone());
    let mut tracked_shards = vec![];
    for shard_uid in
        client.epoch_manager.get_shard_layout_from_prev_block(prev_block_hash).unwrap().shard_uids()
    {
        if client.shard_tracker.care_about_shard(
[review comment] There is a bit of a mismatch between the function name and the implementation here. The …
            account_id.as_ref(),
            prev_block_hash,
            shard_uid.shard_id(),
            true,
        ) {
            tracked_shards.push(shard_uid);
        }
    }
    tracked_shards
}
@@ -1,5 +1,7 @@
use super::sharding::shard_was_split;
use crate::test_loop::utils::sharding::{client_tracking_shard, get_memtrie_for_shard};
use crate::test_loop::utils::sharding::{
    client_tracking_shard, get_memtrie_for_shard, get_tracked_shards_from_prev_block,
};
use borsh::BorshDeserialize;
use itertools::Itertools;
use near_chain::types::Tip;

@@ -10,6 +12,7 @@ use near_primitives::shard_layout::ShardLayout;
use near_primitives::state::FlatStateValue;
use near_primitives::types::{AccountId, EpochId, NumShards};
use near_primitives::version::PROTOCOL_VERSION;
use near_store::adapter::trie_store::get_shard_uid_mapping;
use near_store::adapter::StoreAdapter;
use near_store::db::refcount::decode_value_with_rc;
use near_store::flat::FlatStorageStatus;

@@ -340,6 +343,8 @@ fn should_assert_state_sanity(
/// Asserts that all parent shard State is accessible via parent and children shards.
pub fn check_state_shard_uid_mapping_after_resharding(
    client: &Client,
    prev_block_hash: &CryptoHash,
    resharding_block_hash: &CryptoHash,
[review comment] It feels strange to pass the prev_block_hash and resharding_block_hash, and get the tip from the client on the first line of this method. Mixing "pure"-ish function approach with "stateful" approach is asking for trouble.
[reply] fixed in fa8e29e
    parent_shard_uid: ShardUId,
    allow_negative_refcount: bool,
) {

@@ -350,17 +355,35 @@ pub fn check_state_shard_uid_mapping_after_resharding(
        epoch_config.shard_layout.get_children_shards_uids(parent_shard_uid.shard_id()).unwrap();
[review comment] nit though not your code: …
[reply] fixed in fa8e29e
    assert_eq!(children_shard_uids.len(), 2);

    let store = client.chain.chain_store.store().trie_store();
    let mut checked_any = false;
    for kv in store.store().iter_raw_bytes(DBCol::State) {
    // Currently tracked shards.
    let tracked_shards = get_tracked_shards_from_prev_block(client, prev_block_hash);
    // ShardUId mappings (different than map to itself) that we have stored in DB.
    let mut shard_uid_mapping = HashMap::new();
    // Currently tracked children shards that are mapped to an ancestor.
    let mut tracked_mapped_children = vec![];
    let store = client.chain.chain_store.store();
    for child_shard_uid in &children_shard_uids {
        let mapped_shard_uid = get_shard_uid_mapping(store, *child_shard_uid);
        if &mapped_shard_uid == child_shard_uid {
            continue;
        }
        shard_uid_mapping.insert(child_shard_uid, mapped_shard_uid);
        if tracked_shards.contains(child_shard_uid) {
            tracked_mapped_children.push(*child_shard_uid);
        }
    }

    // Whether we found any value in DB for which we could test the mapping.
    let mut checked_any_key = false;
    let trie_store = store.trie_store();
    for kv in store.iter_raw_bytes(DBCol::State) {
        let (key, value) = kv.unwrap();
        let shard_uid = ShardUId::try_from_slice(&key[0..8]).unwrap();
        // Just after resharding, no State data must be keyed using children ShardUIds.
        assert!(!children_shard_uids.contains(&shard_uid));
        assert!(!shard_uid_mapping.contains_key(&shard_uid));
        if shard_uid != parent_shard_uid {
            continue;
        }
        checked_any = true;
        let node_hash = CryptoHash::try_from_slice(&key[8..]).unwrap();
        let (value, rc) = decode_value_with_rc(&value);
        // It is possible we have delayed receipts leftovers on disk,

@@ -374,14 +397,37 @@ pub fn check_state_shard_uid_mapping_after_resharding(
            assert!(value.is_none());
            continue;
        }
        let parent_value = store.get(parent_shard_uid, &node_hash);
        // Parent shard data must still be accessible using parent ShardUId.
        let parent_value = trie_store.get(parent_shard_uid, &node_hash);
        // Sanity check: parent shard data must still be accessible using Trie interface and parent ShardUId.
        assert_eq!(&parent_value.unwrap()[..], value.unwrap());

        // All parent shard data is available via both children shards.
        for child_shard_uid in &children_shard_uids {
            let child_value = store.get(*child_shard_uid, &node_hash);
        for child_shard_uid in &tracked_mapped_children {
            let child_value = trie_store.get(*child_shard_uid, &node_hash);
            assert_eq!(&child_value.unwrap()[..], value.unwrap());
        }
        checked_any_key = true;
    }
    assert!(checked_any_key);

    let shards_tracked_after_resharding =
        get_tracked_shards_from_prev_block(client, resharding_block_hash);
    // Sanity checks if the node tracks all shards (e.g. it is RPC node).
    if !client.config.tracked_shards.is_empty() {
        assert_eq!(tracked_mapped_children.len(), 2);
        assert_eq!(
            shards_tracked_after_resharding.len(),
            epoch_config.shard_layout.num_shards() as usize
        );
    }
    // If any child shard was tracked after resharding, it means the node had to split the parent shard.
[review comment] Wondering what happens if the node tracks the parent but not any child. We do resharding for nothing? 🤔
[reply] Yes, added a test in fa8e29e
    if children_shard_uids
        .iter()
        .any(|child_shard_uid| shards_tracked_after_resharding.contains(child_shard_uid))
    {
        assert_eq!(shard_uid_mapping.len(), 2);
[review comment] Hmm, this makes me think of another test case to add: a chunk producer tracks the parent and a child after resharding, then gets assigned to an unrelated shard, then in the future state syncs the child again. Then the state mapping will still exist, but the state application in state sync will just write the child shard uid directly in the DB. Wonder what happens in that case.
[reply] When the child shard is no longer tracked, doesn't the mapping get removed?
[reply] State cleanup will deal with cleaning the resharding mapping. For now, once the mapping is set it will be used forever for the child.
[reply] I am pretty sure that was covered by the shard shuffling test, but added …
    } else {
        // Otherwise, no mapping was set and no shard State would be mapped.
        assert!(shard_uid_mapping.is_empty());
    }
    assert!(checked_any);
}
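The review thread above touches on how the ShardUId mapping behaves once it is set: reads for a child shard are redirected through the ancestor's key space until state cleanup removes the mapping, and an absent mapping means the shard maps to itself. Below is a minimal, self-contained sketch of that lookup semantics; it uses a plain HashMap and u64 identifiers as stand-ins for the DBCol::StateShardUIdMapping column and ShardUId, not the real near_store API.

```rust
use std::collections::HashMap;

/// Stand-in for DBCol::StateShardUIdMapping: child shard -> ancestor shard.
type ShardUIdMapping = HashMap<u64, u64>;

/// Mirrors the semantics relied on by the check above: if no mapping is stored,
/// the shard maps to itself (its State is keyed under its own ShardUId);
/// otherwise reads go through the ancestor's ShardUId.
fn resolve_shard_uid(mapping: &ShardUIdMapping, shard_uid: u64) -> u64 {
    *mapping.get(&shard_uid).unwrap_or(&shard_uid)
}

fn main() {
    let parent = 3u64;
    let (child_a, child_b) = (6u64, 7u64);

    // Right after resharding, both children are mapped to the parent,
    // so no State rows are keyed by the children's own ShardUIds yet.
    let mut mapping = ShardUIdMapping::new();
    mapping.insert(child_a, parent);
    mapping.insert(child_b, parent);

    assert_eq!(resolve_shard_uid(&mapping, child_a), parent);
    assert_eq!(resolve_shard_uid(&mapping, child_b), parent);
    // An unrelated shard has no mapping and resolves to itself.
    assert_eq!(resolve_shard_uid(&mapping, 1), 1);

    // The mapping stays in place until state cleanup removes it; only then do
    // child reads stop being redirected to the ancestor's key space.
    mapping.remove(&child_a);
    assert_eq!(resolve_shard_uid(&mapping, child_a), child_a);
}
```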
[review comment] No objection from me on removing this and the print below, but I wonder if anybody was using these print statements while debugging?
[reply] +1 I think those are good to have.
[reply] Haven't used them myself. We can add them back locally to debug if needed 👍
[reply] Will leave one of them, because they duplicate …
[reply] fixed in fa8e29e