diff --git a/Cargo.lock b/Cargo.lock
index 0896a3d35c..bc66a5f125 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -415,7 +415,6 @@ dependencies = [
  "arboard",
  "burn",
  "burn-common",
- "burn-wgpu",
  "clap 4.5.23",
  "colored",
  "cubecl",
diff --git a/backend-comparison/Cargo.toml b/backend-comparison/Cargo.toml
index 39ddd0c6f5..ee5f0bd8a2 100644
--- a/backend-comparison/Cargo.toml
+++ b/backend-comparison/Cargo.toml
@@ -34,7 +34,7 @@ wgpu-spirv-fusion = ["wgpu-spirv", "burn/fusion"]
 arboard = { workspace = true }
 burn = { path = "../crates/burn", default-features = false }
 burn-common = { path = "../crates/burn-common", version = "0.16.0" }
-burn-wgpu = { path = "../crates/burn-wgpu", default-features = false, version = "0.16.0", optional = true }
+
 clap = { workspace = true }
 colored = { workspace = true }
 cubecl = { workspace = true, features = ["wgpu"], default-features = true }
@@ -96,6 +96,11 @@ name = "conv3d"
 harness = false
 name = "matmul"
 
+[[bench]]
+harness = false
+name = "matmul-fused"
+path = "benches/matmul_fused.rs"
+
 [[bench]]
 harness = false
 name = "data"
diff --git a/backend-comparison/benches/matmul.rs b/backend-comparison/benches/matmul.rs
index 6970408a58..d950c71926 100644
--- a/backend-comparison/benches/matmul.rs
+++ b/backend-comparison/benches/matmul.rs
@@ -1,5 +1,5 @@
 use backend_comparison::persistence::save;
-use burn::tensor::{activation::relu, backend::Backend, Shape, Tensor};
+use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
 use burn_common::benchmark::{run_benchmark, Benchmark};
 use derive_new::new;
 
@@ -11,7 +11,7 @@ struct MatmulBenchmark<B: Backend, const D: usize> {
 }
 
 impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
-    type Args = (Tensor<B, D>, Tensor<B, D>, Tensor<B, 1>);
+    type Args = (Tensor<B, D>, Tensor<B, D>);
 
     fn name(&self) -> String {
         "matmul".into()
@@ -21,17 +21,15 @@ impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
         vec![self.shape_lhs.dims.clone(), self.shape_rhs.dims.clone()]
     }
 
-    fn execute(&self, (lhs, rhs, bias): Self::Args) {
-        let bias = bias.unsqueeze();
-        relu(lhs.matmul(rhs) + bias);
+    fn execute(&self, (lhs, rhs): Self::Args) {
+        lhs.matmul(rhs);
     }
 
     fn prepare(&self) -> Self::Args {
-        let lhs = Tensor::zeros(self.shape_lhs.clone(), &self.device);
-        let rhs = Tensor::zeros(self.shape_rhs.clone(), &self.device);
-        let bias = Tensor::zeros([self.shape_rhs.dims[2]], &self.device);
+        let lhs = Tensor::random(self.shape_lhs.clone(), Distribution::Default, &self.device);
+        let rhs = Tensor::random(self.shape_rhs.clone(), Distribution::Default, &self.device);
 
-        (lhs, rhs, bias)
+        (lhs, rhs)
     }
 
     fn sync(&self) {
diff --git a/backend-comparison/benches/matmul_fused.rs b/backend-comparison/benches/matmul_fused.rs
new file mode 100644
index 0000000000..375be97b4e
--- /dev/null
+++ b/backend-comparison/benches/matmul_fused.rs
@@ -0,0 +1,74 @@
+use backend_comparison::persistence::save;
+use burn::tensor::{activation::relu, backend::Backend, Distribution, Shape, Tensor};
+use burn_common::benchmark::{run_benchmark, Benchmark};
+use derive_new::new;
+
+#[derive(new)]
+struct MatmulBenchmark<B: Backend, const D: usize> {
+    shape_lhs: Shape,
+    shape_rhs: Shape,
+    device: B::Device,
+}
+
+impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
+    type Args = (Tensor<B, D>, Tensor<B, D>, Tensor<B, 1>);
+
+    fn name(&self) -> String {
+        "matmul_bias_relu".into()
+    }
+
+    fn shapes(&self) -> Vec<Vec<usize>> {
+        vec![self.shape_lhs.dims.clone(), self.shape_rhs.dims.clone()]
+    }
+
+    fn execute(&self, (lhs, rhs, bias): Self::Args) {
+        let bias = bias.unsqueeze();
+        relu(lhs.matmul(rhs) + bias);
+    }
+
+    fn prepare(&self) -> Self::Args {
+        let lhs = Tensor::random(self.shape_lhs.clone(), Distribution::Default, &self.device);
+        let rhs = Tensor::random(self.shape_rhs.clone(), Distribution::Default, &self.device);
+        let bias = Tensor::random(
+            [self.shape_rhs.dims[2]],
+            Distribution::Default,
+            &self.device,
+        );
+
+        (lhs, rhs, bias)
+    }
+
+    fn sync(&self) {
+        B::sync(&self.device)
+    }
+}
+
+#[allow(dead_code)]
+fn bench<B: Backend>(
+    device: &B::Device,
+    feature_name: &str,
+    url: Option<&str>,
+    token: Option<&str>,
+) {
+    let benchmarks = [
+        (2, 4096, 4096, 4096),
+        (32, 2048, 2048, 2048),
+        (256, 1024, 1024, 1024),
+        (1024, 256, 256, 256),
+    ]
+    .into_iter()
+    .map(|(b, m, n, k)| {
+        let shape_lhs = [b, m, k].into();
+        let shape_rhs = [b, k, n].into();
+
+        MatmulBenchmark::<B, 3>::new(shape_lhs, shape_rhs, device.clone())
+    })
+    .map(run_benchmark)
+    .collect();
+
+    save::<B>(benchmarks, device, feature_name, url, token).unwrap();
+}
+
+fn main() {
+    backend_comparison::bench_on_backend!();
+}
diff --git a/backend-comparison/src/burnbenchapp/base.rs b/backend-comparison/src/burnbenchapp/base.rs
index 0424e00c4e..83c5060a6b 100644
--- a/backend-comparison/src/burnbenchapp/base.rs
+++ b/backend-comparison/src/burnbenchapp/base.rs
@@ -103,6 +103,8 @@ enum BenchmarkValues {
     Data,
     #[strum(to_string = "matmul")]
     Matmul,
+    #[strum(to_string = "matmul-fused")]
+    MatmulFused,
     #[strum(to_string = "unary")]
     Unary,
     #[strum(to_string = "max-pool2d")]