From 323bd75d028d19d3a2fa414880f0465f74197f96 Mon Sep 17 00:00:00 2001
From: Vitaly Terekhov <va.tehb@gmail.com>
Date: Thu, 20 Jun 2024 12:11:15 +0000
Subject: [PATCH] feature(collator): added more execution metrics

---
 collator/src/collator/do_collate.rs        | 74 +++++++++++++-------
 collator/src/collator/execution_manager.rs | 57 ++++++++++++++--
 collator/src/collator/types.rs             | 36 +++++-----
 scripts/gen-dashboard.py                   | 79 +++++++++++++++++++---
 4 files changed, 191 insertions(+), 55 deletions(-)

diff --git a/collator/src/collator/do_collate.rs b/collator/src/collator/do_collate.rs
index 7e2213b39..50d9b786a 100644
--- a/collator/src/collator/do_collate.rs
+++ b/collator/src/collator/do_collate.rs
@@ -122,8 +122,17 @@ impl CollatorStdImpl {
         let (max_messages_per_set, min_externals_per_set, group_limit, group_vert_size) =
             self.get_msgs_execution_params();
 
+        metrics::gauge!("tycho_do_collate_msgs_exec_params_set_size")
+            .set(max_messages_per_set as f64);
+        metrics::gauge!("tycho_do_collate_msgs_exec_params_min_exts_per_set")
+            .set(min_externals_per_set as f64);
+        metrics::gauge!("tycho_do_collate_msgs_exec_params_group_limit").set(group_limit as f64);
+        metrics::gauge!("tycho_do_collate_msgs_exec_params_group_vert_size")
+            .set(group_vert_size as f64);
+
         // init execution manager
         let mut exec_manager = ExecutionManager::new(
+            self.shard_id,
             collation_data.next_lt,
             Arc::new(PreloadedBlockchainConfig::with_config(
                 mc_data.config().clone(),
@@ -191,6 +200,7 @@ impl CollatorStdImpl {
         let mut fill_msgs_total_elapsed = Duration::ZERO;
         let mut execute_msgs_total_elapsed = Duration::ZERO;
         let mut process_txs_total_elapsed = Duration::ZERO;
+        let mut exec_msgs_sets_count = 0;
         loop {
             let mut timer = Instant::now();
 
@@ -205,7 +215,7 @@ impl CollatorStdImpl {
             } else {
                 vec![]
             };
-            collation_data.read_ext_msgs += ext_msgs.len() as u32;
+            collation_data.read_ext_msgs += ext_msgs.len() as u64;
 
             // 2. Then iterate through existing internals and try to fill the set
             let mut remaining_capacity = max_messages_per_set - ext_msgs.len();
@@ -256,7 +266,7 @@ impl CollatorStdImpl {
                     ext_count = ext_msgs.len(),
                     "read additional externals",
                 );
-                collation_data.read_ext_msgs += ext_msgs.len() as u32;
+                collation_data.read_ext_msgs += ext_msgs.len() as u64;
                 msgs_set.append(&mut ext_msgs);
             }
 
@@ -317,8 +327,11 @@ impl CollatorStdImpl {
                 break;
             }
 
+            exec_msgs_sets_count += 1;
+
             let msgs_set_len = msgs_set.len() as u32;
             let mut msgs_set_executed_count = 0;
+            let mut exec_ticks_count = 0;
             exec_manager.set_msgs_for_execution(msgs_set);
             let mut msgs_set_offset = collation_data.processed_upto.processed_offset;
             let mut msgs_set_full_processed = false;
@@ -327,7 +340,8 @@ impl CollatorStdImpl {
 
             // execute msgs processing by groups
             while !msgs_set_full_processed {
-                // Process messages
+                // Exec messages
+                exec_ticks_count += 1;
                 timer = std::time::Instant::now();
                 let tick = exec_manager.tick(msgs_set_offset).await?;
                 execute_msgs_total_elapsed += timer.elapsed();
@@ -357,7 +371,7 @@ impl CollatorStdImpl {
                         item.in_message,
                     )?;
 
-                    collation_data.new_msgs_created += new_messages.len() as u32;
+                    collation_data.new_msgs_created += new_messages.len() as u64;
 
                     for new_message in &new_messages {
                         let MsgInfo::Int(int_msg_info) = &new_message.info else {
@@ -384,7 +398,8 @@ impl CollatorStdImpl {
                 //      currently we simply check only transactions count
                 //      but needs to make good implementation futher
                 msgs_set_executed_count += one_tick_executed_count;
-                collation_data.tx_count += one_tick_executed_count as u32;
+                collation_data.tx_count += one_tick_executed_count as u64;
+                collation_data.ext_msgs_error_count += tick.ext_msgs_error_count;
                 tracing::debug!(target: tracing_targets::COLLATOR,
                     "processed messages from set {}/{}, total {}, offset = {}",
                     msgs_set_executed_count, msgs_set_len,
@@ -396,10 +411,13 @@ impl CollatorStdImpl {
                 }
             }
 
+            metrics::gauge!("tycho_do_collate_exec_ticks_per_msgs_set", labels)
+                .set(exec_ticks_count as f64);
+
             timer = std::time::Instant::now();
 
             // HACK: temporary always full process msgs set and check block limits after
-            if collation_data.tx_count >= self.config.block_txs_limit {
+            if collation_data.tx_count >= self.config.block_txs_limit as u64 {
                 tracing::debug!(target: tracing_targets::COLLATOR,
                     "STUB: block limit reached: {}/{}",
                     collation_data.tx_count, self.config.block_txs_limit,
@@ -433,9 +451,15 @@ impl CollatorStdImpl {
 
             if block_limits_reached {
                 // block is full - exit loop
+                metrics::counter!("tycho_do_collate_blocks_with_limits_reached_count", labels)
+                    .increment(1);
                 break;
             }
         }
+
+        metrics::gauge!("tycho_do_collate_exec_msgs_sets_per_block", labels)
+            .set(exec_msgs_sets_count as f64);
+
         metrics::histogram!("tycho_do_collate_fill_msgs_total_time", labels)
             .record(fill_msgs_total_elapsed);
         metrics::histogram!("tycho_do_collate_exec_msgs_total_time", labels)
@@ -523,40 +547,44 @@ impl CollatorStdImpl {
         }
 
         // metrics
-        metrics::counter!("tycho_do_collate_tx_total", labels)
-            .increment(collation_data.tx_count as _);
+        metrics::counter!("tycho_do_collate_tx_total", labels).increment(collation_data.tx_count);
 
         metrics::counter!("tycho_do_collate_int_enqueue_count")
-            .increment(collation_data.int_enqueue_count as _);
+            .increment(collation_data.int_enqueue_count);
         metrics::counter!("tycho_do_collate_int_dequeue_count")
-            .increment(collation_data.int_dequeue_count as _);
-        metrics::gauge!("tycho_do_collate_int_msgs_queue_calc").increment(
+            .increment(collation_data.int_dequeue_count);
+        metrics::gauge!("tycho_do_collate_int_msgs_queue_calc").set(
             (collation_data.int_enqueue_count as i64 - collation_data.int_dequeue_count as i64)
                 as f64,
         );
 
         metrics::counter!("tycho_do_collate_msgs_exec_count_all", labels)
-            .increment(collation_data.execute_count_all as _);
+            .increment(collation_data.execute_count_all);
 
         metrics::counter!("tycho_do_collate_msgs_read_count_ext", labels)
-            .increment(collation_data.read_ext_msgs as _);
+            .increment(collation_data.read_ext_msgs);
         metrics::counter!("tycho_do_collate_msgs_exec_count_ext", labels)
-            .increment(collation_data.execute_count_ext as _);
+            .increment(collation_data.execute_count_ext);
+        metrics::counter!("tycho_do_collate_msgs_error_count_ext", labels)
+            .increment(collation_data.ext_msgs_error_count);
 
         metrics::counter!("tycho_do_collate_msgs_read_count_int", labels)
-            .increment(collation_data.read_int_msgs_from_iterator as _);
+            .increment(collation_data.read_int_msgs_from_iterator);
         metrics::counter!("tycho_do_collate_msgs_exec_count_int", labels)
-            .increment(collation_data.execute_count_int as _);
+            .increment(collation_data.execute_count_int);
 
         // new internals messages
-        metrics::counter!("tycho_do_collate_new_msgs_created_count")
-            .increment(collation_data.new_msgs_created as _);
-        metrics::counter!("tycho_do_collate_new_msgs_inserted_to_iterator_count")
-            .increment(collation_data.inserted_new_msgs_to_iterator as _);
-        metrics::counter!("tycho_do_collate_msgs_read_count_new_int")
-            .increment(collation_data.read_new_msgs_from_iterator as _);
+        metrics::counter!("tycho_do_collate_new_msgs_created_count", labels)
+            .increment(collation_data.new_msgs_created);
+        metrics::counter!(
+            "tycho_do_collate_new_msgs_inserted_to_iterator_count",
+            labels
+        )
+        .increment(collation_data.inserted_new_msgs_to_iterator);
+        metrics::counter!("tycho_do_collate_msgs_read_count_new_int", labels)
+            .increment(collation_data.read_new_msgs_from_iterator);
         metrics::counter!("tycho_do_collate_msgs_exec_count_new_int", labels)
-            .increment(collation_data.execute_count_new_int as _);
+            .increment(collation_data.execute_count_new_int);
 
         collation_data.total_execute_msgs_time_mc = execute_msgs_total_elapsed.as_micros();
         self.update_stats(&collation_data);
diff --git a/collator/src/collator/execution_manager.rs b/collator/src/collator/execution_manager.rs
index 1f244fed3..95f9c5ef5 100644
--- a/collator/src/collator/execution_manager.rs
+++ b/collator/src/collator/execution_manager.rs
@@ -1,12 +1,14 @@
 use std::cmp;
 use std::collections::hash_map::Entry;
 use std::sync::Arc;
+use std::time::Duration;
 
 use anyhow::Result;
 use everscale_types::models::*;
 use everscale_types::prelude::*;
 use futures_util::stream::FuturesUnordered;
 use futures_util::{Future, StreamExt};
+use humantime::format_duration;
 use ton_executor::{
     ExecuteParams, ExecutorOutput, OrdinaryTransactionExecutor, PreloadedBlockchainConfig,
     TickTockTransactionExecutor, TransactionExecutor,
@@ -21,6 +23,7 @@ use crate::tracing_targets;
 
 /// Execution manager
 pub(super) struct ExecutionManager {
+    shard_id: ShardIdent,
     // this time is used if account's lt is smaller
     min_next_lt: u64,
     /// blockchain config
@@ -42,6 +45,7 @@ type MessageGroup = FastHashMap<HashBytes, Vec<Box<ParsedMessage>>>;
 impl ExecutionManager {
     /// constructor
     pub fn new(
+        shard_id: ShardIdent,
         min_next_lt: u64,
         config: Arc<PreloadedBlockchainConfig>,
         params: Arc<ExecuteParams>,
@@ -50,6 +54,7 @@ impl ExecutionManager {
         shard_accounts: ShardAccounts,
     ) -> Self {
         Self {
+            shard_id,
             min_next_lt,
             config,
             params,
@@ -106,35 +111,47 @@ impl ExecutionManager {
     pub async fn tick(&mut self, offset: u32) -> Result<ExecutedTick> {
         tracing::trace!(target: tracing_targets::EXEC_MANAGER, offset, "messages set execution tick");
 
+        let labels = &[("workchain", self.shard_id.workchain().to_string())];
+
         let (new_offset, group) = match self.message_groups.remove(&offset) {
             Some(group) => (offset + 1, group),
             None => return Ok(ExecutedTick::new_finished(offset)),
         };
         let finished = !self.message_groups.contains_key(&new_offset);
 
-        let group_len = group.len();
-        tracing::debug!(target: tracing_targets::EXEC_MANAGER, offset, group_len);
+        let group_size = group.len();
+        let mut group_max_vert_size = 0;
 
         // TODO check externals is not exist accounts needed ?
         let mut futures = FuturesUnordered::new();
         for (account_id, msgs) in group {
+            group_max_vert_size = cmp::max(group_max_vert_size, msgs.len());
             let shard_account_stuff = self.accounts_cache.create_account_stuff(&account_id)?;
             futures.push(self.execute_messages(shard_account_stuff, msgs));
         }
 
-        let mut items = Vec::with_capacity(group_len);
+        let mut items = Vec::with_capacity(group_size);
+        let mut ext_msgs_error_count = 0;
+
+        let mut max_account_msgs_exec_time = Duration::ZERO;
+        let mut total_exec_time = Duration::ZERO;
 
         while let Some(executed_msgs_result) = futures.next().await {
             let executed = executed_msgs_result?;
+
+            max_account_msgs_exec_time = max_account_msgs_exec_time.max(executed.exec_time);
+            total_exec_time += executed.exec_time;
+
             for tx in executed.transactions {
                 if matches!(&tx.in_message.info, MsgInfo::ExtIn(_)) {
                     if let Err(e) = &tx.result {
-                        tracing::error!(
+                        tracing::warn!(
                             target: tracing_targets::EXEC_MANAGER,
                             account_addr = %executed.account_state.account_addr,
                             message_hash = %tx.in_message.cell.repr_hash(),
                             "failed to execute external message: {e:?}",
                         );
+                        ext_msgs_error_count += 1;
                         continue;
                     }
                 }
@@ -155,10 +172,36 @@ impl ExecutionManager {
                 .add_account_stuff(executed.account_state);
         }
 
+        let mean_account_msgs_exec_time = total_exec_time
+            .checked_div(group_size as u32)
+            .unwrap_or_default();
+
+        tracing::info!(target: tracing_targets::EXEC_MANAGER,
+            offset, group_size, group_max_vert_size,
+            total_exec_time = %format_duration(total_exec_time),
+            mean_account_msgs_exec_time = %format_duration(mean_account_msgs_exec_time),
+            max_account_msgs_exec_time = %format_duration(max_account_msgs_exec_time),
+        );
+
+        metrics::gauge!("tycho_do_collate_one_tick_group_size", labels).set(group_size as f64);
+        metrics::gauge!("tycho_do_collate_one_tick_group_max_vert_size", labels)
+            .set(group_max_vert_size as f64);
+        metrics::histogram!(
+            "tycho_do_collate_one_tick_account_msgs_exec_mean_time",
+            labels
+        )
+        .record(mean_account_msgs_exec_time);
+        metrics::histogram!(
+            "tycho_do_collate_one_tick_account_msgs_exec_max_time",
+            labels
+        )
+        .record(max_account_msgs_exec_time);
+
         Ok(ExecutedTick {
             new_offset,
             finished,
             items,
+            ext_msgs_error_count,
         })
     }
 
@@ -172,6 +215,8 @@ impl ExecutionManager {
         let params = self.params.clone();
 
         rayon_run_fifo(move || {
+            let timer = std::time::Instant::now();
+
             let mut transactions = Vec::with_capacity(msgs.len());
 
             for msg in msgs {
@@ -187,6 +232,7 @@ impl ExecutionManager {
             Ok(ExecutedTransactions {
                 account_state,
                 transactions,
+                exec_time: timer.elapsed(),
             })
         })
     }
@@ -313,6 +359,7 @@ pub struct ExecutedTick {
     pub new_offset: u32,
     pub finished: bool,
     pub items: Vec<ExecutedTickItem>,
+    pub ext_msgs_error_count: u64,
 }
 
 impl ExecutedTick {
@@ -321,6 +368,7 @@ impl ExecutedTick {
             new_offset,
             finished: true,
             items: Vec::new(),
+            ext_msgs_error_count: 0,
         }
     }
 }
@@ -334,6 +382,7 @@ pub struct ExecutedTickItem {
 pub struct ExecutedTransactions {
     pub account_state: Box<ShardAccountStuff>,
     pub transactions: Vec<ExecutedOrdinaryTransaction>,
+    pub exec_time: Duration,
 }
 
 pub struct ExecutedOrdinaryTransaction {
diff --git a/collator/src/collator/types.rs b/collator/src/collator/types.rs
index 1c3e5a666..8777931ed 100644
--- a/collator/src/collator/types.rs
+++ b/collator/src/collator/types.rs
@@ -320,23 +320,25 @@ pub(super) struct BlockCollationData {
     pub gen_utime: u32,
     pub gen_utime_ms: u16,
 
-    pub tx_count: u32,
+    pub tx_count: u64,
 
     pub total_execute_msgs_time_mc: u128,
 
-    pub execute_count_all: u32,
-    pub execute_count_ext: u32,
-    pub execute_count_int: u32,
-    pub execute_count_new_int: u32,
+    pub execute_count_all: u64,
+    pub execute_count_ext: u64,
+    pub execute_count_int: u64,
+    pub execute_count_new_int: u64,
 
-    pub int_enqueue_count: u32,
-    pub int_dequeue_count: u32,
+    pub ext_msgs_error_count: u64,
 
-    pub read_ext_msgs: u32,
-    pub read_int_msgs_from_iterator: u32,
-    pub new_msgs_created: u32,
-    pub inserted_new_msgs_to_iterator: u32,
-    pub read_new_msgs_from_iterator: u32,
+    pub int_enqueue_count: u64,
+    pub int_dequeue_count: u64,
+
+    pub read_ext_msgs: u64,
+    pub read_int_msgs_from_iterator: u64,
+    pub new_msgs_created: u64,
+    pub inserted_new_msgs_to_iterator: u64,
+    pub read_new_msgs_from_iterator: u64,
 
     pub start_lt: u64,
     // Should be updated on each tx finalization from ExecutionManager.max_lt
@@ -457,11 +459,11 @@ pub(super) struct CollatorStats {
     pub total_execute_msgs_time_mc: u128,
     pub avg_exec_msgs_per_1000_ms: u128,
 
-    pub total_execute_count_all: u32,
-    pub total_execute_count_ext: u32,
-    pub total_execute_count_int: u32,
-    pub total_execute_count_new_int: u32,
-    pub int_queue_length: u32,
+    pub total_execute_count_all: u64,
+    pub total_execute_count_ext: u64,
+    pub total_execute_count_int: u64,
+    pub total_execute_count_new_int: u64,
+    pub int_queue_length: u64,
 }
 
 pub(super) struct CachedMempoolAnchor {
diff --git a/scripts/gen-dashboard.py b/scripts/gen-dashboard.py
index 272e7b86c..1e4aea99f 100644
--- a/scripts/gen-dashboard.py
+++ b/scripts/gen-dashboard.py
@@ -463,11 +463,62 @@ def collator_finalize_block() -> RowPanel:
 
 def collator_do_collate() -> RowPanel:
     metrics = [
+        create_gauge_panel(
+            "tycho_do_collate_msgs_exec_params_set_size",
+            "Params: msgs set size",
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_msgs_exec_params_min_exts_per_set",
+            "Params: min externals per set",
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_msgs_exec_params_group_limit",
+            "Params: group limit",
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_msgs_exec_params_group_vert_size",
+            "Params: group vertical size limit",
+        ),
         create_counter_panel(
             "tycho_do_collate_tx_total",
             "Number of transactions over time",
             labels=['workchain=~"$workchain"'],
         ),
+        create_counter_panel(
+            "tycho_do_collate_blocks_with_limits_reached_count",
+            "Number of blocks with limits reached",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_exec_msgs_sets_per_block",
+            "Number of msgs sets per block",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_exec_ticks_per_msgs_set",
+            "Number of exec ticks per set",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_one_tick_group_size",
+            "One exec tick group size",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_gauge_panel(
+            "tycho_do_collate_one_tick_group_max_vert_size",
+            "One exec tick MAX group vertical size",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_heatmap_panel(
+            "tycho_do_collate_one_tick_account_msgs_exec_mean_time",
+            "MEAN exec time in group",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_heatmap_panel(
+            "tycho_do_collate_one_tick_account_msgs_exec_max_time",
+            "MAX exec time in group",
+            labels=['workchain=~"$workchain"'],
+        ),
         create_counter_panel(
             "tycho_do_collate_msgs_exec_count_all",
             "All executed msgs count",
@@ -488,6 +539,11 @@ def collator_do_collate() -> RowPanel:
             "Executed Ext msgs count",
             labels=['workchain=~"$workchain"'],
         ),
+        create_counter_panel(
+            "tycho_do_collate_msgs_error_count_ext",
+            "Ext msgs error count",
+            labels=['workchain=~"$workchain"'],
+        ),
         create_counter_panel(
             "tycho_do_collate_msgs_read_count_int",
             "Read Int msgs count",
@@ -524,7 +580,8 @@ def collator_do_collate() -> RowPanel:
             labels=['workchain=~"$workchain"'],
         ),
         create_gauge_panel(
-            "tycho_do_collate_int_msgs_queue_calc", "Calculated Internal queue len"
+            "tycho_do_collate_int_msgs_queue_calc",
+            "Calculated Internal queue len",
         ),
         create_counter_panel(
             "tycho_do_collate_int_enqueue_count",
@@ -570,16 +627,6 @@ def collator_do_collate() -> RowPanel:
             "Execution time",
             labels=['workchain=~"$workchain"'],
         ),
-        create_heatmap_panel(
-            "tycho_do_collate_execute_tick_time",
-            "Execute Tick special transactions",
-            labels=['workchain=~"$workchain"'],
-        ),
-        create_heatmap_panel(
-            "tycho_do_collate_execute_tock_time",
-            "Execute Tock special transactions",
-            labels=['workchain=~"$workchain"'],
-        ),
         create_heatmap_panel(
             "tycho_do_collate_fill_msgs_total_time",
             "Fill messages time",
@@ -605,6 +652,16 @@ def collator_do_collate() -> RowPanel:
             "Handle block candidate",
             labels=['workchain=~"$workchain"'],
         ),
+        create_heatmap_panel(
+            "tycho_do_collate_execute_tick_time",
+            "Execute Tick special transactions",
+            labels=['workchain=~"$workchain"'],
+        ),
+        create_heatmap_panel(
+            "tycho_do_collate_execute_tock_time",
+            "Execute Tock special transactions",
+            labels=['workchain=~"$workchain"'],
+        ),
         create_heatmap_panel(
             "tycho_collator_update_mc_data_time",
             "update mc data",