From 323bd75d028d19d3a2fa414880f0465f74197f96 Mon Sep 17 00:00:00 2001 From: Vitaly Terekhov Date: Thu, 20 Jun 2024 12:11:15 +0000 Subject: [PATCH] feature(collator): added more execution metrics --- collator/src/collator/do_collate.rs | 74 +++++++++++++------- collator/src/collator/execution_manager.rs | 57 ++++++++++++++-- collator/src/collator/types.rs | 36 +++++----- scripts/gen-dashboard.py | 79 +++++++++++++++++++--- 4 files changed, 191 insertions(+), 55 deletions(-) diff --git a/collator/src/collator/do_collate.rs b/collator/src/collator/do_collate.rs index 7e2213b39..50d9b786a 100644 --- a/collator/src/collator/do_collate.rs +++ b/collator/src/collator/do_collate.rs @@ -122,8 +122,17 @@ impl CollatorStdImpl { let (max_messages_per_set, min_externals_per_set, group_limit, group_vert_size) = self.get_msgs_execution_params(); + metrics::gauge!("tycho_do_collate_msgs_exec_params_set_size") + .set(max_messages_per_set as f64); + metrics::gauge!("tycho_do_collate_msgs_exec_params_min_exts_per_set") + .set(min_externals_per_set as f64); + metrics::gauge!("tycho_do_collate_msgs_exec_params_group_limit").set(group_limit as f64); + metrics::gauge!("tycho_do_collate_msgs_exec_params_group_vert_size") + .set(group_vert_size as f64); + // init execution manager let mut exec_manager = ExecutionManager::new( + self.shard_id, collation_data.next_lt, Arc::new(PreloadedBlockchainConfig::with_config( mc_data.config().clone(), @@ -191,6 +200,7 @@ impl CollatorStdImpl { let mut fill_msgs_total_elapsed = Duration::ZERO; let mut execute_msgs_total_elapsed = Duration::ZERO; let mut process_txs_total_elapsed = Duration::ZERO; + let mut exec_msgs_sets_count = 0; loop { let mut timer = Instant::now(); @@ -205,7 +215,7 @@ impl CollatorStdImpl { } else { vec![] }; - collation_data.read_ext_msgs += ext_msgs.len() as u32; + collation_data.read_ext_msgs += ext_msgs.len() as u64; // 2. Then iterate through existing internals and try to fill the set let mut remaining_capacity = max_messages_per_set - ext_msgs.len(); @@ -256,7 +266,7 @@ impl CollatorStdImpl { ext_count = ext_msgs.len(), "read additional externals", ); - collation_data.read_ext_msgs += ext_msgs.len() as u32; + collation_data.read_ext_msgs += ext_msgs.len() as u64; msgs_set.append(&mut ext_msgs); } @@ -317,8 +327,11 @@ impl CollatorStdImpl { break; } + exec_msgs_sets_count += 1; + let msgs_set_len = msgs_set.len() as u32; let mut msgs_set_executed_count = 0; + let mut exec_ticks_count = 0; exec_manager.set_msgs_for_execution(msgs_set); let mut msgs_set_offset = collation_data.processed_upto.processed_offset; let mut msgs_set_full_processed = false; @@ -327,7 +340,8 @@ impl CollatorStdImpl { // execute msgs processing by groups while !msgs_set_full_processed { - // Process messages + // Exec messages + exec_ticks_count += 1; timer = std::time::Instant::now(); let tick = exec_manager.tick(msgs_set_offset).await?; execute_msgs_total_elapsed += timer.elapsed(); @@ -357,7 +371,7 @@ impl CollatorStdImpl { item.in_message, )?; - collation_data.new_msgs_created += new_messages.len() as u32; + collation_data.new_msgs_created += new_messages.len() as u64; for new_message in &new_messages { let MsgInfo::Int(int_msg_info) = &new_message.info else { @@ -384,7 +398,8 @@ impl CollatorStdImpl { // currently we simply check only transactions count // but needs to make good implementation futher msgs_set_executed_count += one_tick_executed_count; - collation_data.tx_count += one_tick_executed_count as u32; + collation_data.tx_count += one_tick_executed_count as u64; + collation_data.ext_msgs_error_count += tick.ext_msgs_error_count; tracing::debug!(target: tracing_targets::COLLATOR, "processed messages from set {}/{}, total {}, offset = {}", msgs_set_executed_count, msgs_set_len, @@ -396,10 +411,13 @@ impl CollatorStdImpl { } } + metrics::gauge!("tycho_do_collate_exec_ticks_per_msgs_set", labels) + .set(exec_ticks_count as f64); + timer = std::time::Instant::now(); // HACK: temporary always full process msgs set and check block limits after - if collation_data.tx_count >= self.config.block_txs_limit { + if collation_data.tx_count >= self.config.block_txs_limit as u64 { tracing::debug!(target: tracing_targets::COLLATOR, "STUB: block limit reached: {}/{}", collation_data.tx_count, self.config.block_txs_limit, @@ -433,9 +451,15 @@ impl CollatorStdImpl { if block_limits_reached { // block is full - exit loop + metrics::counter!("tycho_do_collate_blocks_with_limits_reached_count", labels) + .increment(1); break; } } + + metrics::gauge!("tycho_do_collate_exec_msgs_sets_per_block", labels) + .set(exec_msgs_sets_count as f64); + metrics::histogram!("tycho_do_collate_fill_msgs_total_time", labels) .record(fill_msgs_total_elapsed); metrics::histogram!("tycho_do_collate_exec_msgs_total_time", labels) @@ -523,40 +547,44 @@ impl CollatorStdImpl { } // metrics - metrics::counter!("tycho_do_collate_tx_total", labels) - .increment(collation_data.tx_count as _); + metrics::counter!("tycho_do_collate_tx_total", labels).increment(collation_data.tx_count); metrics::counter!("tycho_do_collate_int_enqueue_count") - .increment(collation_data.int_enqueue_count as _); + .increment(collation_data.int_enqueue_count); metrics::counter!("tycho_do_collate_int_dequeue_count") - .increment(collation_data.int_dequeue_count as _); - metrics::gauge!("tycho_do_collate_int_msgs_queue_calc").increment( + .increment(collation_data.int_dequeue_count); + metrics::gauge!("tycho_do_collate_int_msgs_queue_calc").set( (collation_data.int_enqueue_count as i64 - collation_data.int_dequeue_count as i64) as f64, ); metrics::counter!("tycho_do_collate_msgs_exec_count_all", labels) - .increment(collation_data.execute_count_all as _); + .increment(collation_data.execute_count_all); metrics::counter!("tycho_do_collate_msgs_read_count_ext", labels) - .increment(collation_data.read_ext_msgs as _); + .increment(collation_data.read_ext_msgs); metrics::counter!("tycho_do_collate_msgs_exec_count_ext", labels) - .increment(collation_data.execute_count_ext as _); + .increment(collation_data.execute_count_ext); + metrics::counter!("tycho_do_collate_msgs_error_count_ext", labels) + .increment(collation_data.ext_msgs_error_count); metrics::counter!("tycho_do_collate_msgs_read_count_int", labels) - .increment(collation_data.read_int_msgs_from_iterator as _); + .increment(collation_data.read_int_msgs_from_iterator); metrics::counter!("tycho_do_collate_msgs_exec_count_int", labels) - .increment(collation_data.execute_count_int as _); + .increment(collation_data.execute_count_int); // new internals messages - metrics::counter!("tycho_do_collate_new_msgs_created_count") - .increment(collation_data.new_msgs_created as _); - metrics::counter!("tycho_do_collate_new_msgs_inserted_to_iterator_count") - .increment(collation_data.inserted_new_msgs_to_iterator as _); - metrics::counter!("tycho_do_collate_msgs_read_count_new_int") - .increment(collation_data.read_new_msgs_from_iterator as _); + metrics::counter!("tycho_do_collate_new_msgs_created_count", labels) + .increment(collation_data.new_msgs_created); + metrics::counter!( + "tycho_do_collate_new_msgs_inserted_to_iterator_count", + labels + ) + .increment(collation_data.inserted_new_msgs_to_iterator); + metrics::counter!("tycho_do_collate_msgs_read_count_new_int", labels) + .increment(collation_data.read_new_msgs_from_iterator); metrics::counter!("tycho_do_collate_msgs_exec_count_new_int", labels) - .increment(collation_data.execute_count_new_int as _); + .increment(collation_data.execute_count_new_int); collation_data.total_execute_msgs_time_mc = execute_msgs_total_elapsed.as_micros(); self.update_stats(&collation_data); diff --git a/collator/src/collator/execution_manager.rs b/collator/src/collator/execution_manager.rs index 1f244fed3..95f9c5ef5 100644 --- a/collator/src/collator/execution_manager.rs +++ b/collator/src/collator/execution_manager.rs @@ -1,12 +1,14 @@ use std::cmp; use std::collections::hash_map::Entry; use std::sync::Arc; +use std::time::Duration; use anyhow::Result; use everscale_types::models::*; use everscale_types::prelude::*; use futures_util::stream::FuturesUnordered; use futures_util::{Future, StreamExt}; +use humantime::format_duration; use ton_executor::{ ExecuteParams, ExecutorOutput, OrdinaryTransactionExecutor, PreloadedBlockchainConfig, TickTockTransactionExecutor, TransactionExecutor, @@ -21,6 +23,7 @@ use crate::tracing_targets; /// Execution manager pub(super) struct ExecutionManager { + shard_id: ShardIdent, // this time is used if account's lt is smaller min_next_lt: u64, /// blockchain config @@ -42,6 +45,7 @@ type MessageGroup = FastHashMap>>; impl ExecutionManager { /// constructor pub fn new( + shard_id: ShardIdent, min_next_lt: u64, config: Arc, params: Arc, @@ -50,6 +54,7 @@ impl ExecutionManager { shard_accounts: ShardAccounts, ) -> Self { Self { + shard_id, min_next_lt, config, params, @@ -106,35 +111,47 @@ impl ExecutionManager { pub async fn tick(&mut self, offset: u32) -> Result { tracing::trace!(target: tracing_targets::EXEC_MANAGER, offset, "messages set execution tick"); + let labels = &[("workchain", self.shard_id.workchain().to_string())]; + let (new_offset, group) = match self.message_groups.remove(&offset) { Some(group) => (offset + 1, group), None => return Ok(ExecutedTick::new_finished(offset)), }; let finished = !self.message_groups.contains_key(&new_offset); - let group_len = group.len(); - tracing::debug!(target: tracing_targets::EXEC_MANAGER, offset, group_len); + let group_size = group.len(); + let mut group_max_vert_size = 0; // TODO check externals is not exist accounts needed ? let mut futures = FuturesUnordered::new(); for (account_id, msgs) in group { + group_max_vert_size = cmp::max(group_max_vert_size, msgs.len()); let shard_account_stuff = self.accounts_cache.create_account_stuff(&account_id)?; futures.push(self.execute_messages(shard_account_stuff, msgs)); } - let mut items = Vec::with_capacity(group_len); + let mut items = Vec::with_capacity(group_size); + let mut ext_msgs_error_count = 0; + + let mut max_account_msgs_exec_time = Duration::ZERO; + let mut total_exec_time = Duration::ZERO; while let Some(executed_msgs_result) = futures.next().await { let executed = executed_msgs_result?; + + max_account_msgs_exec_time = max_account_msgs_exec_time.max(executed.exec_time); + total_exec_time += executed.exec_time; + for tx in executed.transactions { if matches!(&tx.in_message.info, MsgInfo::ExtIn(_)) { if let Err(e) = &tx.result { - tracing::error!( + tracing::warn!( target: tracing_targets::EXEC_MANAGER, account_addr = %executed.account_state.account_addr, message_hash = %tx.in_message.cell.repr_hash(), "failed to execute external message: {e:?}", ); + ext_msgs_error_count += 1; continue; } } @@ -155,10 +172,36 @@ impl ExecutionManager { .add_account_stuff(executed.account_state); } + let mean_account_msgs_exec_time = total_exec_time + .checked_div(group_size as u32) + .unwrap_or_default(); + + tracing::info!(target: tracing_targets::EXEC_MANAGER, + offset, group_size, group_max_vert_size, + total_exec_time = %format_duration(total_exec_time), + mean_account_msgs_exec_time = %format_duration(mean_account_msgs_exec_time), + max_account_msgs_exec_time = %format_duration(max_account_msgs_exec_time), + ); + + metrics::gauge!("tycho_do_collate_one_tick_group_size", labels).set(group_size as f64); + metrics::gauge!("tycho_do_collate_one_tick_group_max_vert_size", labels) + .set(group_max_vert_size as f64); + metrics::histogram!( + "tycho_do_collate_one_tick_account_msgs_exec_mean_time", + labels + ) + .record(mean_account_msgs_exec_time); + metrics::histogram!( + "tycho_do_collate_one_tick_account_msgs_exec_max_time", + labels + ) + .record(max_account_msgs_exec_time); + Ok(ExecutedTick { new_offset, finished, items, + ext_msgs_error_count, }) } @@ -172,6 +215,8 @@ impl ExecutionManager { let params = self.params.clone(); rayon_run_fifo(move || { + let timer = std::time::Instant::now(); + let mut transactions = Vec::with_capacity(msgs.len()); for msg in msgs { @@ -187,6 +232,7 @@ impl ExecutionManager { Ok(ExecutedTransactions { account_state, transactions, + exec_time: timer.elapsed(), }) }) } @@ -313,6 +359,7 @@ pub struct ExecutedTick { pub new_offset: u32, pub finished: bool, pub items: Vec, + pub ext_msgs_error_count: u64, } impl ExecutedTick { @@ -321,6 +368,7 @@ impl ExecutedTick { new_offset, finished: true, items: Vec::new(), + ext_msgs_error_count: 0, } } } @@ -334,6 +382,7 @@ pub struct ExecutedTickItem { pub struct ExecutedTransactions { pub account_state: Box, pub transactions: Vec, + pub exec_time: Duration, } pub struct ExecutedOrdinaryTransaction { diff --git a/collator/src/collator/types.rs b/collator/src/collator/types.rs index 1c3e5a666..8777931ed 100644 --- a/collator/src/collator/types.rs +++ b/collator/src/collator/types.rs @@ -320,23 +320,25 @@ pub(super) struct BlockCollationData { pub gen_utime: u32, pub gen_utime_ms: u16, - pub tx_count: u32, + pub tx_count: u64, pub total_execute_msgs_time_mc: u128, - pub execute_count_all: u32, - pub execute_count_ext: u32, - pub execute_count_int: u32, - pub execute_count_new_int: u32, + pub execute_count_all: u64, + pub execute_count_ext: u64, + pub execute_count_int: u64, + pub execute_count_new_int: u64, - pub int_enqueue_count: u32, - pub int_dequeue_count: u32, + pub ext_msgs_error_count: u64, - pub read_ext_msgs: u32, - pub read_int_msgs_from_iterator: u32, - pub new_msgs_created: u32, - pub inserted_new_msgs_to_iterator: u32, - pub read_new_msgs_from_iterator: u32, + pub int_enqueue_count: u64, + pub int_dequeue_count: u64, + + pub read_ext_msgs: u64, + pub read_int_msgs_from_iterator: u64, + pub new_msgs_created: u64, + pub inserted_new_msgs_to_iterator: u64, + pub read_new_msgs_from_iterator: u64, pub start_lt: u64, // Should be updated on each tx finalization from ExecutionManager.max_lt @@ -457,11 +459,11 @@ pub(super) struct CollatorStats { pub total_execute_msgs_time_mc: u128, pub avg_exec_msgs_per_1000_ms: u128, - pub total_execute_count_all: u32, - pub total_execute_count_ext: u32, - pub total_execute_count_int: u32, - pub total_execute_count_new_int: u32, - pub int_queue_length: u32, + pub total_execute_count_all: u64, + pub total_execute_count_ext: u64, + pub total_execute_count_int: u64, + pub total_execute_count_new_int: u64, + pub int_queue_length: u64, } pub(super) struct CachedMempoolAnchor { diff --git a/scripts/gen-dashboard.py b/scripts/gen-dashboard.py index 272e7b86c..1e4aea99f 100644 --- a/scripts/gen-dashboard.py +++ b/scripts/gen-dashboard.py @@ -463,11 +463,62 @@ def collator_finalize_block() -> RowPanel: def collator_do_collate() -> RowPanel: metrics = [ + create_gauge_panel( + "tycho_do_collate_msgs_exec_params_set_size", + "Params: msgs set size", + ), + create_gauge_panel( + "tycho_do_collate_msgs_exec_params_min_exts_per_set", + "Params: min externals per set", + ), + create_gauge_panel( + "tycho_do_collate_msgs_exec_params_group_limit", + "Params: group limit", + ), + create_gauge_panel( + "tycho_do_collate_msgs_exec_params_group_vert_size", + "Params: group vertical size limit", + ), create_counter_panel( "tycho_do_collate_tx_total", "Number of transactions over time", labels=['workchain=~"$workchain"'], ), + create_counter_panel( + "tycho_do_collate_blocks_with_limits_reached_count", + "Number of blocks with limits reached", + labels=['workchain=~"$workchain"'], + ), + create_gauge_panel( + "tycho_do_collate_exec_msgs_sets_per_block", + "Number of msgs sets per block", + labels=['workchain=~"$workchain"'], + ), + create_gauge_panel( + "tycho_do_collate_exec_ticks_per_msgs_set", + "Number of exec ticks per set", + labels=['workchain=~"$workchain"'], + ), + create_gauge_panel( + "tycho_do_collate_one_tick_group_size", + "One exec tick group size", + labels=['workchain=~"$workchain"'], + ), + create_gauge_panel( + "tycho_do_collate_one_tick_group_max_vert_size", + "One exec tick MAX group vertical size", + labels=['workchain=~"$workchain"'], + ), + create_heatmap_panel( + "tycho_do_collate_one_tick_account_msgs_exec_mean_time", + "MEAN exec time in group", + labels=['workchain=~"$workchain"'], + ), + create_heatmap_panel( + "tycho_do_collate_one_tick_account_msgs_exec_max_time", + "MAX exec time in group", + labels=['workchain=~"$workchain"'], + ), create_counter_panel( "tycho_do_collate_msgs_exec_count_all", "All executed msgs count", @@ -488,6 +539,11 @@ def collator_do_collate() -> RowPanel: "Executed Ext msgs count", labels=['workchain=~"$workchain"'], ), + create_counter_panel( + "tycho_do_collate_msgs_error_count_ext", + "Ext msgs error count", + labels=['workchain=~"$workchain"'], + ), create_counter_panel( "tycho_do_collate_msgs_read_count_int", "Read Int msgs count", @@ -524,7 +580,8 @@ def collator_do_collate() -> RowPanel: labels=['workchain=~"$workchain"'], ), create_gauge_panel( - "tycho_do_collate_int_msgs_queue_calc", "Calculated Internal queue len" + "tycho_do_collate_int_msgs_queue_calc", + "Calculated Internal queue len", ), create_counter_panel( "tycho_do_collate_int_enqueue_count", @@ -570,16 +627,6 @@ def collator_do_collate() -> RowPanel: "Execution time", labels=['workchain=~"$workchain"'], ), - create_heatmap_panel( - "tycho_do_collate_execute_tick_time", - "Execute Tick special transactions", - labels=['workchain=~"$workchain"'], - ), - create_heatmap_panel( - "tycho_do_collate_execute_tock_time", - "Execute Tock special transactions", - labels=['workchain=~"$workchain"'], - ), create_heatmap_panel( "tycho_do_collate_fill_msgs_total_time", "Fill messages time", @@ -605,6 +652,16 @@ def collator_do_collate() -> RowPanel: "Handle block candidate", labels=['workchain=~"$workchain"'], ), + create_heatmap_panel( + "tycho_do_collate_execute_tick_time", + "Execute Tick special transactions", + labels=['workchain=~"$workchain"'], + ), + create_heatmap_panel( + "tycho_do_collate_execute_tock_time", + "Execute Tock special transactions", + labels=['workchain=~"$workchain"'], + ), create_heatmap_panel( "tycho_collator_update_mc_data_time", "update mc data",