Skip to content

Commit

Permalink
Metrics-based bad token detector (#3172)
Browse files Browse the repository at this point in the history
# Description
Follow-up to #3156. This PR introduces an in-memory, ratio-based bad
token detection strategy that complements the existing heuristics.
Instead of relying on consecutive failures, it keeps track of both
successful and failed settlement attempts for each token. The logic is
as follows:

1. When a settlement encoding fails, every token involved in that
attempt has its statistics updated: both total attempts and failed
attempts are incremented.
2. Otherwise, every token involved has its total attempts incremented
but not its failures.
3. Before marking a token as unsupported, the detector requires at least
20 recorded attempts. Once this threshold is met, if the token's failure
ratio (`fails / attempts`) is at least 90%, it is considered
unsupported.

This approach is more resilient than just counting consecutive failures.
A highly utilized and generally reliable token that occasionally appears
in failing trades with problematic tokens won't be prematurely flagged
as unsupported because its overall success ratio remains high.

Due to the nature of the implementation, all the statistics get
discarded on every restart. Implementing a persistence layer might make
sense in the future, but problems with bad tokens are usually temporary.

## How to test
A forked e2e test for the simulation-based detector is required first,
and it is expected to be implemented in a separate PR.

---------

Co-authored-by: MartinquaXD <[email protected]>
Co-authored-by: Mateo <[email protected]>
  • Loading branch information
3 people authored Dec 23, 2024
1 parent 1451574 commit 7e52015
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 12 deletions.
61 changes: 56 additions & 5 deletions crates/driver/src/domain/competition/bad_tokens/metrics.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,62 @@
use {super::Quality, crate::domain::eth};
use {super::Quality, crate::domain::eth, dashmap::DashMap, std::sync::Arc};

/// Monitors tokens to determine whether they are considered "unsupported" based
/// on the ratio of failing to total settlement encoding attempts. A token must
/// have participated in at least `REQUIRED_MEASUREMENTS` attempts to be
/// evaluated. If, at that point, the ratio of failures is greater than or equal
/// to `FAILURE_RATIO`, the token is considered unsupported.
// Wrapped in an `Arc` so that `Clone` is cheap and every clone shares the same
// statistics (one detector instance is created per driver and handed to each
// solver that opts in).
#[derive(Default, Clone)]
pub struct Detector(Arc<Inner>);

#[derive(Default)]
pub struct Detector;
struct Inner {
// Per-token settlement-encoding statistics; `DashMap` allows concurrent
// updates from multiple settlement attempts without an explicit lock.
counter: DashMap<eth::TokenAddress, TokenStatistics>,
}

#[derive(Default)]
struct TokenStatistics {
// Total number of settlement encoding attempts this token participated in.
attempts: u32,
// Number of those attempts that failed to encode.
fails: u32,
}

impl Detector {
pub fn get_quality(&self, _token: eth::TokenAddress) -> Option<Quality> {
// TODO implement a reasonable heuristic
None
/// The ratio of failures to attempts that qualifies a token as unsupported.
const FAILURE_RATIO: f64 = 0.9;
/// The minimum number of attempts required before evaluating a token’s
/// quality.
const REQUIRED_MEASUREMENTS: u32 = 20;

/// Returns `Some(Quality::Unsupported)` when the token has enough recorded
/// measurements and its failure ratio meets the threshold; `None` when the
/// token is unknown or there is not yet enough data to judge it.
pub fn get_quality(&self, token: &eth::TokenAddress) -> Option<Quality> {
    let stats = self.0.counter.get(token)?;
    // Not enough data yet to make a judgement call.
    if stats.attempts < Self::REQUIRED_MEASUREMENTS {
        return None;
    }
    let failure_ratio = f64::from(stats.fails) / f64::from(stats.attempts);
    (failure_ratio >= Self::FAILURE_RATIO).then_some(Quality::Unsupported)
}

/// Updates the tokens that participated in settlements by
/// incrementing their attempt count.
/// `failure` indicates whether the settlement was successful or not.
pub fn update_tokens(
&self,
token_pairs: &[(eth::TokenAddress, eth::TokenAddress)],
failure: bool,
) {
token_pairs
.iter()
.flat_map(|(token_a, token_b)| [token_a, token_b])
.for_each(|token| {
self.0
.counter
.entry(*token)
.and_modify(|counter| {
counter.attempts += 1;
counter.fails += u32::from(failure)
})
.or_insert_with(|| TokenStatistics {
attempts: 1,
fails: u32::from(failure),
});
});
}
}
20 changes: 17 additions & 3 deletions crates/driver/src/domain/competition/bad_tokens/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ impl Detector {
}

/// Enables detection of unsupported tokens based on heuristics.
pub fn with_heuristic_detector(&mut self) -> &mut Self {
self.metrics = Some(metrics::Detector);
/// Enables the metrics-based detection of unsupported tokens, installing the
/// given (shared) detector instance. Returns `self` for chaining.
pub fn with_metrics_detector(&mut self, detector: metrics::Detector) -> &mut Self {
    self.metrics.replace(detector);
    self
}

Expand Down Expand Up @@ -106,6 +106,20 @@ impl Detector {
auction
}

/// Records a successful settlement encoding for all the given token pairs.
/// Does nothing when the metrics-based detector is not enabled.
pub fn encoding_succeeded(&self, token_pairs: &[(eth::TokenAddress, eth::TokenAddress)]) {
    let Some(metrics) = self.metrics.as_ref() else {
        return;
    };
    metrics.update_tokens(token_pairs, false);
}

/// Records a failed settlement encoding for all the given token pairs.
/// Does nothing when the metrics-based detector is not enabled.
pub fn encoding_failed(&self, token_pairs: &[(eth::TokenAddress, eth::TokenAddress)]) {
    let Some(metrics) = self.metrics.as_ref() else {
        return;
    };
    metrics.update_tokens(token_pairs, true);
}

fn get_token_quality(&self, token: eth::TokenAddress, now: Instant) -> Option<Quality> {
if let Some(quality) = self.hardcoded.get(&token) {
return Some(*quality);
Expand All @@ -118,7 +132,7 @@ impl Detector {
}

if let Some(metrics) = &self.metrics {
return metrics.get_quality(token);
return metrics.get_quality(&token);
}

None
Expand Down
12 changes: 8 additions & 4 deletions crates/driver/src/domain/competition/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ impl Competition {
.into_iter()
.map(|solution| async move {
let id = solution.id().clone();
let token_pairs = solution.token_pairs();
observe::encoding(&id);
let settlement = solution
.encode(
Expand All @@ -131,16 +132,19 @@ impl Competition {
self.solver.solver_native_token(),
)
.await;
(id, settlement)
(id, token_pairs, settlement)
})
.collect::<FuturesUnordered<_>>()
.filter_map(|(id, result)| async move {
.filter_map(|(id, token_pairs, result)| async move {
match result {
Ok(solution) => Some(solution),
Ok(solution) => {
self.bad_tokens.encoding_succeeded(&token_pairs);
Some(solution)
}
// don't report on errors coming from solution merging
Err(_err) if id.solutions().len() > 1 => None,
Err(err) => {
// TODO update metrics of bad token detection
self.bad_tokens.encoding_failed(&token_pairs);
observe::encoding_failed(self.solver.name(), &id, &err);
notify::encoding_failed(&self.solver, auction.id(), &id, &err);
None
Expand Down
17 changes: 17 additions & 0 deletions crates/driver/src/domain/competition/solution/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,23 @@ impl Solution {
&self.trades
}

/// Returns the (sell, buy) token pair of every trade in the solution, in
/// trade order. Pairs are not deduplicated.
pub fn token_pairs(&self) -> Vec<(TokenAddress, TokenAddress)> {
    let mut pairs = Vec::with_capacity(self.trades.len());
    for trade in &self.trades {
        let pair = match trade {
            Trade::Fulfillment(fulfillment) => {
                let order = fulfillment.order();
                (order.sell.token, order.buy.token)
            }
            Trade::Jit(jit) => {
                let order = jit.order();
                (order.sell.token, order.buy.token)
            }
        };
        pairs.push(pair);
    }
    pairs
}

/// Interactions executed by this solution.
pub fn interactions(&self) -> &[Interaction] {
&self.interactions
Expand Down
9 changes: 9 additions & 0 deletions crates/driver/src/infra/api/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,8 @@ impl Api {
app = routes::metrics(app);
app = routes::healthz(app);

let metrics_bad_token_detector = bad_tokens::metrics::Detector::default();

// Multiplex each solver as part of the API. Multiple solvers are multiplexed
// on the same driver so only one liquidity collector collects the liquidity
// for all of them. This is important because liquidity collection is
Expand All @@ -80,6 +82,13 @@ impl Api {
bad_tokens.with_simulation_detector(self.bad_token_detector.clone());
}

if solver
.bad_token_detection()
.enable_metrics_based_bad_token_detection
{
bad_tokens.with_metrics_detector(metrics_bad_token_detector.clone());
}

let router = router.with_state(State(Arc::new(Inner {
eth: self.eth.clone(),
solver: solver.clone(),
Expand Down
2 changes: 2 additions & 0 deletions crates/driver/src/infra/config/file/load.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ pub async fn load(chain: chain::Id, path: &Path) -> infra::Config {
.collect(),
enable_simulation_based_bad_token_detection: config
.enable_simulation_bad_token_detection,
enable_metrics_based_bad_token_detection: config
.enable_metrics_bad_token_detection,
},
}
}))
Expand Down
5 changes: 5 additions & 0 deletions crates/driver/src/infra/config/file/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,11 @@ struct SolverConfig {
/// tokens with `trace_callMany` based simulation.
#[serde(default)]
enable_simulation_bad_token_detection: bool,

/// Whether or not the solver opted into detecting unsupported
/// tokens with metrics-based detection.
#[serde(default)]
enable_metrics_bad_token_detection: bool,
}

#[derive(Clone, Copy, Debug, Default, Deserialize, PartialEq, Serialize)]
Expand Down
1 change: 1 addition & 0 deletions crates/driver/src/infra/solver/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -307,4 +307,5 @@ pub struct BadTokenDetection {
/// Tokens that are explicitly allow- or deny-listed.
pub tokens_supported: HashMap<eth::TokenAddress, bad_tokens::Quality>,
pub enable_simulation_based_bad_token_detection: bool,
pub enable_metrics_based_bad_token_detection: bool,
}

0 comments on commit 7e52015

Please sign in to comment.