From 7e5201566d450f8b9c5161c14feb54762b8aa101 Mon Sep 17 00:00:00 2001
From: ilya <ilya@cow.fi>
Date: Mon, 23 Dec 2024 12:26:35 +0000
Subject: [PATCH] Metrics-based bad token detector (#3172)

# Description
Follow-up to #3156. This PR introduces an in-memory, ratio-based bad
token detection strategy that complements the existing heuristics.
Instead of relying on consecutive failures, it keeps track of both
successful and failed settlement attempts for each token. The logic is
as follows:

1. When a settlement encoding fails, every token involved in that
attempt has its statistics updated: both total attempts and failed
attempts are incremented.
2. Otherwise, every token involved has its total attempts incremented
but not its failures.
3. Before marking a token as unsupported, the detector requires at least
20 recorded attempts. Once this threshold is met, if the token's failure
ratio (`fails / attempts`) is at least 90%, it is considered
unsupported.

This approach is more resilient than just counting consecutive failures.
A highly utilized and generally reliable token that occasionally appears
in failing trades with problematic tokens won't be prematurely flagged
as unsupported because its overall success ratio remains high.

Due to the nature of the implementation, all the statistics get
discarded on every restart. Implementing a persistence layer might make
sense in the future, but problems with bad tokens are usually temporal.

## How to test
A forked e2e test for the simulation-based detector is required first,
and it is expected to be implemented in a separate PR.

---------

Co-authored-by: MartinquaXD <martin.beckmann@protonmail.com>
Co-authored-by: Mateo <mateo@cow.fi>
---
 .../domain/competition/bad_tokens/metrics.rs  | 61 +++++++++++++++++--
 .../src/domain/competition/bad_tokens/mod.rs  | 20 +++++-
 crates/driver/src/domain/competition/mod.rs   | 12 ++--
 .../src/domain/competition/solution/mod.rs    | 17 ++++++
 crates/driver/src/infra/api/mod.rs            |  9 +++
 crates/driver/src/infra/config/file/load.rs   |  2 +
 crates/driver/src/infra/config/file/mod.rs    |  5 ++
 crates/driver/src/infra/solver/mod.rs         |  1 +
 8 files changed, 115 insertions(+), 12 deletions(-)
diff --git a/crates/driver/src/domain/competition/bad_tokens/metrics.rs b/crates/driver/src/domain/competition/bad_tokens/metrics.rs
index 8db0be0b82..10eb559ada 100644
--- a/crates/driver/src/domain/competition/bad_tokens/metrics.rs
+++ b/crates/driver/src/domain/competition/bad_tokens/metrics.rs
@@ -1,11 +1,62 @@
-use {super::Quality, crate::domain::eth};
+use {super::Quality, crate::domain::eth, dashmap::DashMap, std::sync::Arc};
+
+/// Monitors tokens to determine whether they are considered "unsupported" based
+/// on the ratio of failing to total settlement encoding attempts. A token must
+/// have participated in at least `REQUIRED_MEASUREMENTS` attempts to be
+/// evaluated. If, at that point, the ratio of failures is greater than or equal
+/// to `FAILURE_RATIO`, the token is considered unsupported.
+#[derive(Default, Clone)]
+pub struct Detector(Arc<Inner>);
 
 #[derive(Default)]
-pub struct Detector;
+struct Inner {
+    counter: DashMap<eth::TokenAddress, TokenStatistics>,
+}
+
+#[derive(Default)]
+struct TokenStatistics {
+    attempts: u32,
+    fails: u32,
+}
 
 impl Detector {
-    pub fn get_quality(&self, _token: eth::TokenAddress) -> Option<Quality> {
-        // TODO implement a reasonable heuristic
-        None
+    /// The ratio of failures to attempts that qualifies a token as unsupported.
+    const FAILURE_RATIO: f64 = 0.9;
+    /// The minimum number of attempts required before evaluating a token’s
+    /// quality.
+    const REQUIRED_MEASUREMENTS: u32 = 20;
+
+    pub fn get_quality(&self, token: &eth::TokenAddress) -> Option<Quality> {
+        let measurements = self.0.counter.get(token)?;
+        let is_unsupported = measurements.attempts >= Self::REQUIRED_MEASUREMENTS
+            && (measurements.fails as f64 / measurements.attempts as f64) >= Self::FAILURE_RATIO;
+
+        is_unsupported.then_some(Quality::Unsupported)
+    }
+
+    /// Updates the tokens that participated in settlements by
+    /// incrementing their attempt count.
+    /// `failure` indicates whether the settlement was successful or not.
+    pub fn update_tokens(
+        &self,
+        token_pairs: &[(eth::TokenAddress, eth::TokenAddress)],
+        failure: bool,
+    ) {
+        token_pairs
+            .iter()
+            .flat_map(|(token_a, token_b)| [token_a, token_b])
+            .for_each(|token| {
+                self.0
+                    .counter
+                    .entry(*token)
+                    .and_modify(|counter| {
+                        counter.attempts += 1;
+                        counter.fails += u32::from(failure)
+                    })
+                    .or_insert_with(|| TokenStatistics {
+                        attempts: 1,
+                        fails: u32::from(failure),
+                    });
+            });
     }
 }
diff --git a/crates/driver/src/domain/competition/bad_tokens/mod.rs b/crates/driver/src/domain/competition/bad_tokens/mod.rs
index 4ea4d270b1..ffa1578425 100644
--- a/crates/driver/src/domain/competition/bad_tokens/mod.rs
+++ b/crates/driver/src/domain/competition/bad_tokens/mod.rs
@@ -53,8 +53,8 @@ impl Detector {
     }
 
     /// Enables detection of unsupported tokens based on heuristics.
-    pub fn with_heuristic_detector(&mut self) -> &mut Self {
-        self.metrics = Some(metrics::Detector);
+    pub fn with_metrics_detector(&mut self, detector: metrics::Detector) -> &mut Self {
+        self.metrics = Some(detector);
         self
     }
 
@@ -106,6 +106,20 @@ impl Detector {
         auction
     }
 
+    /// Updates the tokens quality metric for successful operation.
+    pub fn encoding_succeeded(&self, token_pairs: &[(eth::TokenAddress, eth::TokenAddress)]) {
+        if let Some(metrics) = &self.metrics {
+            metrics.update_tokens(token_pairs, false);
+        }
+    }
+
+    /// Updates the tokens quality metric for failures.
+    pub fn encoding_failed(&self, token_pairs: &[(eth::TokenAddress, eth::TokenAddress)]) {
+        if let Some(metrics) = &self.metrics {
+            metrics.update_tokens(token_pairs, true);
+        }
+    }
+
     fn get_token_quality(&self, token: eth::TokenAddress, now: Instant) -> Option<Quality> {
         if let Some(quality) = self.hardcoded.get(&token) {
             return Some(*quality);
@@ -118,7 +132,7 @@ impl Detector {
         }
 
         if let Some(metrics) = &self.metrics {
-            return metrics.get_quality(token);
+            return metrics.get_quality(&token);
         }
 
         None
diff --git a/crates/driver/src/domain/competition/mod.rs b/crates/driver/src/domain/competition/mod.rs
index 5745bb64d8..6050e25df8 100644
--- a/crates/driver/src/domain/competition/mod.rs
+++ b/crates/driver/src/domain/competition/mod.rs
@@ -122,6 +122,7 @@ impl Competition {
             .into_iter()
             .map(|solution| async move {
                 let id = solution.id().clone();
+                let token_pairs = solution.token_pairs();
                 observe::encoding(&id);
                 let settlement = solution
                     .encode(
@@ -131,16 +132,19 @@ impl Competition {
                         self.solver.solver_native_token(),
                     )
                     .await;
-                (id, settlement)
+                (id, token_pairs, settlement)
             })
             .collect::<FuturesUnordered<_>>()
-            .filter_map(|(id, result)| async move {
+            .filter_map(|(id, token_pairs, result)| async move {
                 match result {
-                    Ok(solution) => Some(solution),
+                    Ok(solution) => {
+                        self.bad_tokens.encoding_succeeded(&token_pairs);
+                        Some(solution)
+                    }
                     // don't report on errors coming from solution merging
                     Err(_err) if id.solutions().len() > 1 => None,
                     Err(err) => {
-                        // TODO update metrics of bad token detection
+                        self.bad_tokens.encoding_failed(&token_pairs);
                         observe::encoding_failed(self.solver.name(), &id, &err);
                         notify::encoding_failed(&self.solver, auction.id(), &id, &err);
                         None
diff --git a/crates/driver/src/domain/competition/solution/mod.rs b/crates/driver/src/domain/competition/solution/mod.rs
index 2f55562900..800e3fb892 100644
--- a/crates/driver/src/domain/competition/solution/mod.rs
+++ b/crates/driver/src/domain/competition/solution/mod.rs
@@ -169,6 +169,23 @@ impl Solution {
         &self.trades
     }
 
+    /// Returns all the token pairs involved in the solution.
+    pub fn token_pairs(&self) -> Vec<(TokenAddress, TokenAddress)> {
+        self.trades
+            .iter()
+            .map(|trade| match trade {
+                Trade::Fulfillment(fulfillment) => {
+                    let order = fulfillment.order();
+                    (order.sell.token, order.buy.token)
+                }
+                Trade::Jit(jit) => {
+                    let order = jit.order();
+                    (order.sell.token, order.buy.token)
+                }
+            })
+            .collect()
+    }
+
     /// Interactions executed by this solution.
     pub fn interactions(&self) -> &[Interaction] {
         &self.interactions
diff --git a/crates/driver/src/infra/api/mod.rs b/crates/driver/src/infra/api/mod.rs
index ea4544c4c3..c6f57364d3 100644
--- a/crates/driver/src/infra/api/mod.rs
+++ b/crates/driver/src/infra/api/mod.rs
@@ -58,6 +58,8 @@ impl Api {
         app = routes::metrics(app);
         app = routes::healthz(app);
 
+        let metrics_bad_token_detector = bad_tokens::metrics::Detector::default();
+
         // Multiplex each solver as part of the API. Multiple solvers are multiplexed
         // on the same driver so only one liquidity collector collects the liquidity
         // for all of them. This is important because liquidity collection is
@@ -80,6 +82,13 @@ impl Api {
                 bad_tokens.with_simulation_detector(self.bad_token_detector.clone());
             }
 
+            if solver
+                .bad_token_detection()
+                .enable_metrics_based_bad_token_detection
+            {
+                bad_tokens.with_metrics_detector(metrics_bad_token_detector.clone());
+            }
+
             let router = router.with_state(State(Arc::new(Inner {
                 eth: self.eth.clone(),
                 solver: solver.clone(),
diff --git a/crates/driver/src/infra/config/file/load.rs b/crates/driver/src/infra/config/file/load.rs
index bb090d3d4e..81908c1fc2 100644
--- a/crates/driver/src/infra/config/file/load.rs
+++ b/crates/driver/src/infra/config/file/load.rs
@@ -110,6 +110,8 @@ pub async fn load(chain: chain::Id, path: &Path) -> infra::Config {
                         .collect(),
                     enable_simulation_based_bad_token_detection: config
                         .enable_simulation_bad_token_detection,
+                    enable_metrics_based_bad_token_detection: config
+                        .enable_metrics_bad_token_detection,
                 },
             }
         }))
diff --git a/crates/driver/src/infra/config/file/mod.rs b/crates/driver/src/infra/config/file/mod.rs
index 9f79c01e4f..2468361902 100644
--- a/crates/driver/src/infra/config/file/mod.rs
+++ b/crates/driver/src/infra/config/file/mod.rs
@@ -277,6 +277,11 @@ struct SolverConfig {
     /// tokens with `trace_callMany` based simulation.
     #[serde(default)]
     enable_simulation_bad_token_detection: bool,
+
+    /// Whether or not the solver opted into detecting unsupported
+    /// tokens with metrics-based detection.
+    #[serde(default)]
+    enable_metrics_bad_token_detection: bool,
 }
 
 #[derive(Clone, Copy, Debug, Default, Deserialize, PartialEq, Serialize)]
diff --git a/crates/driver/src/infra/solver/mod.rs b/crates/driver/src/infra/solver/mod.rs
index 76281b8f72..84adcb6d1b 100644
--- a/crates/driver/src/infra/solver/mod.rs
+++ b/crates/driver/src/infra/solver/mod.rs
@@ -307,4 +307,5 @@ pub struct BadTokenDetection {
     /// Tokens that are explicitly allow- or deny-listed.
     pub tokens_supported: HashMap<eth::TokenAddress, bad_tokens::Quality>,
     pub enable_simulation_based_bad_token_detection: bool,
+    pub enable_metrics_based_bad_token_detection: bool,
 }