diff --git a/crates/burn-autodiff/src/backend.rs b/crates/burn-autodiff/src/backend.rs index e1f41e77a2..78bfe301f5 100644 --- a/crates/burn-autodiff/src/backend.rs +++ b/crates/burn-autodiff/src/backend.rs @@ -30,6 +30,7 @@ impl Backend for Autodiff { type IntElem = B::IntElem; type BoolTensorPrimitive = B::BoolTensorPrimitive; + type BoolElem = B::BoolElem; type QuantizedTensorPrimitive = B::QuantizedTensorPrimitive; type QuantizedEncoding = B::QuantizedEncoding; diff --git a/crates/burn-autodiff/src/tests/mod.rs b/crates/burn-autodiff/src/tests/mod.rs index 1980550157..438adfb98e 100644 --- a/crates/burn-autodiff/src/tests/mod.rs +++ b/crates/burn-autodiff/src/tests/mod.rs @@ -90,18 +90,18 @@ macro_rules! testgen_all { pub type FloatType = ::FloatElem; pub type IntType = ::IntElem; - pub type BoolType = ::BoolTensorPrimitive; + pub type BoolType = ::BoolElem; ::paste::paste! { $(mod [<$float _ty>] { pub use super::*; - pub type TestBackend = TestBackend2<$float, IntType>; + pub type TestBackend = TestBackend2<$float, IntType, BoolType>; pub type TestAutodiffBackend = burn_autodiff::Autodiff; pub type TestAutodiffTensor = burn_tensor::Tensor; - pub type TestTensor = TestTensor2<$float, IntType, D>; - pub type TestTensorInt = TestTensorInt2<$float, IntType, D>; - pub type TestTensorBool = TestTensorBool2<$float, IntType, D>; + pub type TestTensor = TestTensor2<$float, IntType, BoolType, D>; + pub type TestTensorInt = TestTensorInt2<$float, IntType, BoolType, D>; + pub type TestTensorBool = TestTensorBool2<$float, IntType, BoolType, D>; type FloatType = $float; diff --git a/crates/burn-candle/src/backend.rs b/crates/burn-candle/src/backend.rs index e03b26474c..1ad606b910 100644 --- a/crates/burn-candle/src/backend.rs +++ b/crates/burn-candle/src/backend.rs @@ -168,6 +168,7 @@ impl Backend for Candle { type IntElem = I; type BoolTensorPrimitive = CandleTensor; + type BoolElem = u32; type QuantizedTensorPrimitive = CandleQTensor; type QuantizedEncoding = u8; diff --git a/crates/burn-cuda/src/lib.rs b/crates/burn-cuda/src/lib.rs index 086d00bab7..030c2d9ff1 100644 --- a/crates/burn-cuda/src/lib.rs +++ b/crates/burn-cuda/src/lib.rs @@ -7,10 +7,10 @@ pub use cubecl::cuda::CudaDevice; use cubecl::cuda::CudaRuntime; #[cfg(not(feature = "fusion"))] -pub type Cuda = JitBackend; +pub type Cuda = JitBackend; #[cfg(feature = "fusion")] -pub type Cuda = burn_fusion::Fusion>; +pub type Cuda = burn_fusion::Fusion>; #[cfg(test)] mod tests { @@ -19,5 +19,5 @@ mod tests { pub type TestRuntime = cubecl::cuda::CudaRuntime; pub use half::{bf16, f16}; - burn_jit::testgen_all!([f16, bf16, f32], [i8, i16, i32, i64]); + burn_jit::testgen_all!([f16, bf16, f32], [i8, i16, i32, i64], [u8, u32]); } diff --git a/crates/burn-fusion/src/backend.rs b/crates/burn-fusion/src/backend.rs index aa72ba7dbc..aa308ad9a7 100644 --- a/crates/burn-fusion/src/backend.rs +++ b/crates/burn-fusion/src/backend.rs @@ -5,7 +5,7 @@ use burn_tensor::{ backend::{Backend, DeviceOps}, ops::{BoolTensor, FloatTensor, IntTensor, QuantizedTensor}, repr::{OperationDescription, QuantizedKind, ReprBackend, TensorHandle}, - Device, + Device, Element, }; use serde::{de::DeserializeOwned, Serialize}; use std::marker::PhantomData; @@ -35,6 +35,8 @@ impl Backend for Fusion { type BoolTensorPrimitive = FusionTensor; + type BoolElem = B::BoolElem; + type QuantizedTensorPrimitive = QFusionTensor; type QuantizedEncoding = B::QuantizedEncoding; @@ -142,6 +144,8 @@ pub trait FusionRuntime: Send + Sync + Sized + core::fmt::Debug { type FusionDevice: DeviceOps; 
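The hunks above thread a dedicated boolean storage element through the backend definitions: `Backend` gains a `BoolElem` associated type, Candle pins it to `u32`, and the JIT/CUDA backends take it as an extra generic parameter (with the test macro now exercising both `u8` and `u32`). As a rough, standalone sketch of the pattern — the trait, types, and helper below are illustrative stand-ins, not Burn's actual API — exposing the bool element as an associated type lets generic code reason about bool storage without knowing the concrete backend:

    // Minimal model of a backend that advertises its bool storage element.
    trait MiniBackend {
        /// Element used to encode booleans in device memory (e.g. u32 or u8).
        type BoolElem: Copy + Default + 'static;
    }

    struct CandleLike;
    impl MiniBackend for CandleLike {
        // Mirrors `type BoolElem = u32;` chosen for the Candle backend above.
        type BoolElem = u32;
    }

    struct JitLike<BT>(core::marker::PhantomData<BT>);
    impl<BT: Copy + Default + 'static> MiniBackend for JitLike<BT> {
        // Mirrors the new bool-element generic parameter on the JIT backend.
        type BoolElem = BT;
    }

    /// Bytes needed to store a bool tensor of `len` elements on backend `B`.
    fn bool_buffer_size<B: MiniBackend>(len: usize) -> usize {
        len * core::mem::size_of::<B::BoolElem>()
    }

    fn main() {
        assert_eq!(bool_buffer_size::<CandleLike>(8), 32); // u32 bools: 4 bytes each
        assert_eq!(bool_buffer_size::<JitLike<u8>>(8), 8); // u8 bools: 1 byte each
    }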
/// The client to interact with the runtime. type FusionClient: FusionClient; + /// The type that represents booleans on the backend. + type BoolRepr: Element; /// The list of optimizations that will be used to optimize the computational graph. fn optimizations( diff --git a/crates/burn-hip/src/lib.rs b/crates/burn-hip/src/lib.rs index 89b91243c6..fc8f704e74 100644 --- a/crates/burn-hip/src/lib.rs +++ b/crates/burn-hip/src/lib.rs @@ -12,11 +12,11 @@ use cubecl::hip::HipRuntime; #[cfg(target_os = "linux")] #[cfg(not(feature = "fusion"))] -pub type Hip = JitBackend; +pub type Hip = JitBackend; #[cfg(target_os = "linux")] #[cfg(feature = "fusion")] -pub type Hip = burn_fusion::Fusion>; +pub type Hip = burn_fusion::Fusion>; // TODO: Hang the computer when AMD isn't available. // diff --git a/crates/burn-jit/src/backend.rs b/crates/burn-jit/src/backend.rs index 23629c4a9e..b455d859a0 100644 --- a/crates/burn-jit/src/backend.rs +++ b/crates/burn-jit/src/backend.rs @@ -1,4 +1,5 @@ use crate::{ + element::BoolElement, tensor::{JitTensor, QJitTensor}, FloatElement, IntElement, JitRuntime, }; @@ -18,24 +19,27 @@ pub(crate) static SEED: Mutex> = Mutex::new(None); /// Generic tensor backend that can be compiled just-in-time to any shader runtime #[derive(new)] -pub struct JitBackend { +pub struct JitBackend { _runtime: PhantomData, _float_elem: PhantomData, _int_elem: PhantomData, + _bool_elem: PhantomData, } -impl Backend for JitBackend +impl Backend for JitBackend where R: JitRuntime, R::Server: ComputeServer, R::Device: burn_tensor::backend::DeviceOps, F: FloatElement, I: IntElement, + BT: BoolElement, { type Device = R::Device; type FloatElem = F; type IntElem = I; + type BoolElem = BT; type FloatTensorPrimitive = JitTensor; type IntTensorPrimitive = JitTensor; @@ -63,19 +67,25 @@ where } } -impl core::fmt::Debug for JitBackend { +impl core::fmt::Debug + for JitBackend +{ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.write_fmt(format_args!("JitBackend {{ runtime: {}}}", R::name())) } } -impl Clone for JitBackend { +impl Clone + for JitBackend +{ fn clone(&self) -> Self { Self::new() } } -impl Default for JitBackend { +impl Default + for JitBackend +{ fn default() -> Self { Self::new() } @@ -90,7 +100,9 @@ where } #[cfg(not(feature = "fusion"))] -impl ReprBackend for JitBackend { +impl ReprBackend + for JitBackend +{ type Handle = HandleKind; fn float_tensor(handle: TensorHandle) -> FloatTensor { diff --git a/crates/burn-jit/src/element.rs b/crates/burn-jit/src/element.rs index 939b2fb24e..f0e15352cf 100644 --- a/crates/burn-jit/src/element.rs +++ b/crates/burn-jit/src/element.rs @@ -13,6 +13,27 @@ pub trait FloatElement: JitElement + Float {} /// The int element type for the jit backend. pub trait IntElement: JitElement + Int {} +/// The element type for booleans for the jit backend. +pub trait BoolElement: JitElement + Int { + /// The true value for the boolean element. + fn true_val() -> Self { + Self::from_int(1) + } + + /// The false value for the boolean element. + fn false_val() -> Self { + Self::from_int(0) + } + + /// New bool element from Rust bool. 
+ fn new_bool(val: bool) -> Self { + match val { + true => Self::true_val(), + false => Self::false_val(), + } + } +} + impl JitElement for u64 {} impl JitElement for u32 {} impl JitElement for u16 {} @@ -36,3 +57,6 @@ impl IntElement for i64 {} impl IntElement for i32 {} impl IntElement for i16 {} impl IntElement for i8 {} + +impl BoolElement for u8 {} +impl BoolElement for u32 {} diff --git a/crates/burn-jit/src/fusion/base.rs b/crates/burn-jit/src/fusion/base.rs index 7968626e89..4572f580b5 100644 --- a/crates/burn-jit/src/fusion/base.rs +++ b/crates/burn-jit/src/fusion/base.rs @@ -1,6 +1,6 @@ use super::elemwise::optimization::{ElemwiseOptimization, ElemwiseOptimizationState}; -use crate::fusion::elemwise::builder::ElementWiseBuilder; use crate::tensor::{JitQuantizationParameters, QJitTensor}; +use crate::{element::BoolElement, fusion::elemwise::builder::ElementWiseBuilder}; use crate::{kernel, tensor::JitTensor, FloatElement, IntElement, JitBackend, JitRuntime}; use burn_fusion::{client::MutexFusionClient, FusionBackend, FusionRuntime}; use burn_tensor::quantization::QuantizationScheme; @@ -30,13 +30,14 @@ pub enum JitOptimizationState { ElementWise(ElemwiseOptimizationState), } -impl burn_fusion::Optimization> for JitOptimization +impl burn_fusion::Optimization> for JitOptimization where R: JitRuntime, + BT: BoolElement, { fn execute(&mut self, context: &mut burn_fusion::stream::Context<'_, JitFusionHandle>) { match self { - Self::ElementWise2(op) => op.execute(context), + Self::ElementWise2(op) => op.execute::(context), } } @@ -61,7 +62,9 @@ where } } -impl ReprBackend for JitBackend { +impl ReprBackend + for JitBackend +{ type Handle = JitFusionHandle; fn float_tensor(handle: TensorHandle) -> burn_tensor::ops::FloatTensor { @@ -122,30 +125,37 @@ impl ReprBackend for JitBackend FusionRuntime for FusionJitRuntime { +impl FusionRuntime for FusionJitRuntime { type OptimizationState = JitOptimizationState; type Optimization = JitOptimization; type FusionHandle = JitFusionHandle; type FusionDevice = R::JitDevice; type FusionClient = MutexFusionClient; + type BoolRepr = BT; fn optimizations( device: R::Device, ) -> Vec>> { - vec![Box::new(ElementWiseBuilder::::new(device.clone()))] + vec![Box::new(ElementWiseBuilder::::new( + device.clone(), + BT::as_elem().into(), + ))] } } /// Fusion runtime for JIT runtimes. 
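The `BoolElement` trait introduced above gives every admissible storage type (`u8`, `u32`) a uniform encoding of true/false, and later hunks rely on it (for example, `flip` passes `BT::new_bool(...)` as a scalar kernel argument). A small standalone sketch of the same default-method pattern, using `MiniBool` and `encode_mask` as hypothetical stand-ins rather than the real Burn trait:

    trait MiniBool: Copy + PartialEq + core::fmt::Debug {
        fn from_int(value: i64) -> Self;

        /// Encoding of `true` (kernels treat any non-zero value as true).
        fn true_val() -> Self {
            Self::from_int(1)
        }

        /// Encoding of `false`.
        fn false_val() -> Self {
            Self::from_int(0)
        }

        /// Convert a Rust `bool` into the storage encoding.
        fn new_bool(value: bool) -> Self {
            if value { Self::true_val() } else { Self::false_val() }
        }
    }

    impl MiniBool for u8 {
        fn from_int(value: i64) -> Self {
            value as u8
        }
    }

    impl MiniBool for u32 {
        fn from_int(value: i64) -> Self {
            value as u32
        }
    }

    /// Encode a host-side boolean mask with whichever element the backend uses.
    fn encode_mask<B: MiniBool>(mask: &[bool]) -> Vec<B> {
        mask.iter().map(|&b| B::new_bool(b)).collect()
    }

    fn main() {
        assert_eq!(encode_mask::<u8>(&[true, false]), vec![1u8, 0]);
        assert_eq!(encode_mask::<u32>(&[false, true]), vec![0u32, 1]);
    }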
#[derive(Debug)] -pub struct FusionJitRuntime { +pub struct FusionJitRuntime { _b: PhantomData, + _bool: PhantomData, } -impl FusionBackend for JitBackend { - type FusionRuntime = FusionJitRuntime; +impl FusionBackend + for JitBackend +{ + type FusionRuntime = FusionJitRuntime; - type FullPrecisionBackend = JitBackend; + type FullPrecisionBackend = JitBackend; fn cast_float( tensor: burn_tensor::ops::FloatTensor, diff --git a/crates/burn-jit/src/fusion/elemwise/builder.rs b/crates/burn-jit/src/fusion/elemwise/builder.rs index 6766e3000a..e37196bc2a 100644 --- a/crates/burn-jit/src/fusion/elemwise/builder.rs +++ b/crates/burn-jit/src/fusion/elemwise/builder.rs @@ -1,7 +1,10 @@ use burn_fusion::OptimizationBuilder; use crate::{ - fusion::{on_write::builder::FuseOnWriteBuilder, JitOptimization}, + fusion::{ + on_write::{builder::FuseOnWriteBuilder, ir::ElemwisePrecision}, + JitOptimization, + }, JitRuntime, }; @@ -14,13 +17,13 @@ pub(crate) struct ElementWiseBuilder { } impl ElementWiseBuilder { - pub fn new(device: R::Device) -> Self { + pub fn new(device: R::Device, bool_precision: ElemwisePrecision) -> Self { let client = R::client(&device); let props = client.properties(); let max_bindings = props.hardware_properties().max_bindings; Self { - builder: FuseOnWriteBuilder::new(max_bindings), + builder: FuseOnWriteBuilder::new(max_bindings, bool_precision), device, } } diff --git a/crates/burn-jit/src/fusion/elemwise/optimization.rs b/crates/burn-jit/src/fusion/elemwise/optimization.rs index f5f3000926..d3e8e35b50 100644 --- a/crates/burn-jit/src/fusion/elemwise/optimization.rs +++ b/crates/burn-jit/src/fusion/elemwise/optimization.rs @@ -1,4 +1,4 @@ -use crate::fusion::on_write::kernel::fuse_on_write; +use crate::{fusion::on_write::kernel::fuse_on_write, BoolElement}; use crate::{fusion::JitFusionHandle, JitRuntime}; use burn_fusion::stream::Context; use burn_tensor::repr::TensorDescription; @@ -28,9 +28,9 @@ pub struct ElemwiseOptimizationState { impl ElemwiseOptimization { /// Execute the optimization. - pub fn execute(&mut self, context: &mut Context<'_, JitFusionHandle>) { + pub fn execute(&mut self, context: &mut Context<'_, JitFusionHandle>) { self.trace - .run::(&self.client, &self.device, context) + .run::(&self.client, &self.device, context) } /// Number of element wise operations fused. 
diff --git a/crates/burn-jit/src/fusion/on_write/builder.rs b/crates/burn-jit/src/fusion/on_write/builder.rs index 1bd167af90..287b656274 100644 --- a/crates/burn-jit/src/fusion/on_write/builder.rs +++ b/crates/burn-jit/src/fusion/on_write/builder.rs @@ -1,5 +1,5 @@ use super::{ - ir::{Arg, BinaryElemwiseArgs, ElemwiseOp, UnaryElemwiseArgs}, + ir::{Arg, BinaryElemwiseArgs, ElemwiseOp, ElemwisePrecision, UnaryElemwiseArgs}, trace::FuseOnWriteTrace, trace_builder::FuseOnWriteTraceBuilder, }; @@ -30,9 +30,9 @@ struct TryFuseBuilder { } impl TryFuseBuilder { - fn new(max_bindings: u32) -> Self { + fn new(max_bindings: u32, bool_precision: ElemwisePrecision) -> Self { Self { - builder: FuseOnWriteTraceBuilder::new(), + builder: FuseOnWriteTraceBuilder::new(bool_precision), max_bindings, added_ops: false, } @@ -118,7 +118,7 @@ impl OptimizationBuilder for FuseOnWriteBuilder { fn reset(&mut self) { self.num_ops = 0; self.status = OptimizationStatus::Open; - self.builder = TryFuseBuilder::new(self.max_bindings); + self.builder = TryFuseBuilder::new(self.max_bindings, self.builder.builder.bool_precision); self.current_output_shape.clear(); } @@ -137,9 +137,9 @@ impl OptimizationBuilder for FuseOnWriteBuilder { } impl FuseOnWriteBuilder { - pub fn new(max_bindings: u32) -> Self { + pub fn new(max_bindings: u32, bool_precision: ElemwisePrecision) -> Self { Self { - builder: TryFuseBuilder::new(max_bindings), + builder: TryFuseBuilder::new(max_bindings, bool_precision), num_ops: 0, max_bindings, current_output_shape: Vec::new(), diff --git a/crates/burn-jit/src/fusion/on_write/trace.rs b/crates/burn-jit/src/fusion/on_write/trace.rs index 591cc9c347..d9ec09aea8 100644 --- a/crates/burn-jit/src/fusion/on_write/trace.rs +++ b/crates/burn-jit/src/fusion/on_write/trace.rs @@ -1,6 +1,6 @@ use crate::{ fusion::{on_write::ir::LayoutInfo, strides_dyn_rank, JitFusionHandle}, - JitRuntime, + BoolElement, JitRuntime, }; use super::ir::{Arg, ElemwiseConfig, ElemwiseOp, ElemwisePrecision, GlobalArgsLaunch}; @@ -90,16 +90,17 @@ struct PotentialInplace<'a> { impl FuseOnWriteTrace { /// Run a trace with the given [runner](TraceRunner). 
- pub fn run>( + pub fn run>( &self, client: &ComputeClient, device: &R::Device, context: &mut Context<'_, JitFusionHandle>, ) { - let analysis = self.analyse::(client, device, context); + let analysis = self.analyse::(client, device, context); let inputs = self.register_inputs(context, &analysis.handle_inputs, analysis.vectorization); - let outputs = self.register_outputs(&analysis.handle_outputs, analysis.vectorization); + let outputs = + self.register_outputs::<_, BT>(&analysis.handle_outputs, analysis.vectorization); let mut ops = Sequence::new(); for op in analysis.reads.into_values() { @@ -126,7 +127,7 @@ impl FuseOnWriteTrace { Runner::run(client, inputs, outputs, config) } - fn analyse<'a, 'c, R: JitRuntime, Runner: TraceRunner>( + fn analyse<'a, 'c, R: JitRuntime, BT: BoolElement, Runner: TraceRunner>( &'a self, client: &ComputeClient, device: &R::Device, @@ -146,7 +147,7 @@ impl FuseOnWriteTrace { }; self.analyse_inputs(context, &mut analysis); - self.analyse_outputs(client, device, context, &mut analysis); + self.analyse_outputs::<_, BT>(client, device, context, &mut analysis); analysis.vectorization = Runner::vectorization( analysis.handle_inputs.iter().map(|item| &item.handle), @@ -189,7 +190,7 @@ impl FuseOnWriteTrace { } } - fn analyse_outputs<'a, 'c, R: JitRuntime>( + fn analyse_outputs<'a, 'c, R: JitRuntime, BT: BoolElement>( &'a self, client: &ComputeClient, device: &R::Device, @@ -273,9 +274,9 @@ impl FuseOnWriteTrace { } } - // We encode bool tensors as u32. + // We encode bool tensors as `B`. let dtype = match tensor_global.dtype { - DType::Bool => DType::U32, + DType::Bool => BT::dtype(), _ => tensor_global.dtype, }; let size = tensor_global.shape.iter().product::() * Elem::from(dtype).size(); @@ -406,7 +407,7 @@ impl FuseOnWriteTrace { inputs } - fn register_outputs<'s, R: JitRuntime>( + fn register_outputs<'s, R: JitRuntime, BT: BoolElement>( &self, handle_outputs: &'s [HandleOutput<'_, R>], vectorization: u8, @@ -473,8 +474,11 @@ impl FuseOnWriteTrace { ElemwisePrecision::U32 => outputs.t_u32.push(arg), ElemwisePrecision::U16 => outputs.t_u16.push(arg), ElemwisePrecision::U8 => outputs.t_u8.push(arg), - // Bools are encoded as u32. - ElemwisePrecision::Bool => outputs.t_u32.push(arg), + ElemwisePrecision::Bool => match BT::dtype() { + DType::U32 => outputs.t_u32.push(arg), + DType::U8 => outputs.t_u8.push(arg), + _ => todo!(), + }, }; } } diff --git a/crates/burn-jit/src/fusion/on_write/trace_builder.rs b/crates/burn-jit/src/fusion/on_write/trace_builder.rs index 06e8d24e15..5cb427814d 100644 --- a/crates/burn-jit/src/fusion/on_write/trace_builder.rs +++ b/crates/burn-jit/src/fusion/on_write/trace_builder.rs @@ -16,10 +16,11 @@ pub struct FuseOnWriteTraceBuilder { scalars: BTreeMap, ops: Vec, reads: BTreeMap, + pub bool_precision: ElemwisePrecision, } impl FuseOnWriteTraceBuilder { - pub fn new() -> Self { + pub fn new(bool_precision: ElemwisePrecision) -> Self { Self { locals: Locals::default(), outputs: RegisteredTensors::default(), @@ -27,6 +28,7 @@ impl FuseOnWriteTraceBuilder { scalars: BTreeMap::default(), ops: Vec::new(), reads: BTreeMap::new(), + bool_precision, } } @@ -49,9 +51,9 @@ impl FuseOnWriteTraceBuilder { pub fn input(&mut self, tensor: &TensorDescription) -> Arg { let precision = tensor.dtype.into(); - // Bool tensors are encoded as u32. + // Bool tensors are encoded as bool_precision. 
let precision_input = match precision { - ElemwisePrecision::Bool => ElemwisePrecision::U32, + ElemwisePrecision::Bool => self.bool_precision, _ => precision, }; @@ -82,9 +84,9 @@ impl FuseOnWriteTraceBuilder { pub fn output(&mut self, tensor: &TensorDescription) -> Arg { let precision = tensor.dtype.into(); - // Bool tensors are encoded as u32. + // Bool tensors are encoded as bool_precision. let precision_output = match precision { - ElemwisePrecision::Bool => ElemwisePrecision::U32, + ElemwisePrecision::Bool => self.bool_precision, _ => precision, }; @@ -103,9 +105,9 @@ impl FuseOnWriteTraceBuilder { pub fn scalar(&mut self, _: &E, dtype: DType) -> Arg { let precision = dtype.into(); - // Bool scalars are encoded as u32. + // Bool scalars are encoded as bool_precision. let precision = match precision { - ElemwisePrecision::Bool => ElemwisePrecision::U32, + ElemwisePrecision::Bool => self.bool_precision, _ => precision, }; let new_index = self.scalars.get(&precision).copied().unwrap_or(0); @@ -154,9 +156,9 @@ impl FuseOnWriteTraceBuilder { let mark = |var: &Arg, list: &mut Vec<(TensorId, ElemwisePrecision)>| { if let Arg::Local(index, precision) = var { if let Some(tensor_id) = self.locals.find_tensor_id(*precision, *index) { - // Input and outputs tensors are using u32 for booleans. + // Input and outputs tensors are using bool_precision for booleans. let precision = match precision { - ElemwisePrecision::Bool => ElemwisePrecision::U32, + ElemwisePrecision::Bool => self.bool_precision, _ => *precision, }; diff --git a/crates/burn-jit/src/kernel/cast/bool_cast.rs b/crates/burn-jit/src/kernel/cast/bool_cast.rs index 07a915ee1f..74e55888e1 100644 --- a/crates/burn-jit/src/kernel/cast/bool_cast.rs +++ b/crates/burn-jit/src/kernel/cast/bool_cast.rs @@ -1,9 +1,9 @@ -use crate::{tensor::JitTensor, JitElement, JitRuntime}; +use crate::{tensor::JitTensor, BoolElement, JitElement, JitRuntime}; use cubecl::{calculate_cube_count_elemwise, prelude::*, CubeDim}; #[cube(launch)] -fn bool_cast_kernel(input: &Tensor, output: &mut Tensor) { - if input[ABSOLUTE_POS] >= 1 { +fn bool_cast_kernel(input: &Tensor, output: &mut Tensor) { + if input[ABSOLUTE_POS] >= B::from_int(1) { output[ABSOLUTE_POS] = T::from_int(1); } else { output[ABSOLUTE_POS] = T::from_int(0); @@ -12,11 +12,13 @@ fn bool_cast_kernel(input: &Tensor, output: &mut Tensor) { /// Cast a bool tensor to the given element type. /// -/// This alternative to cast is necessary because bool are represented as u32 +/// This alternative to cast is necessary because bool are represented as u32 or u8 /// where any non-zero value means true. Depending how it was created /// it may hold an uncanny bit combination. Naively casting it would not /// necessarily yield 0 or 1. 
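Because a stored "true" may be any non-zero bit pattern, the cast has to test each element rather than reinterpret its bits. A host-side sketch of the per-element normalization the kernel performs, written as plain Rust over a `u8` buffer purely for illustration:

    /// Any non-zero stored value becomes 1, zero stays 0, mirroring the
    /// non-zero test in the bool cast kernel above.
    fn normalize_bools(stored: &[u8]) -> Vec<u8> {
        stored.iter().map(|&v| u8::from(v != 0)).collect()
    }

    fn main() {
        // 0xFF is a valid "true" in storage, but naively casting it to another
        // element type would not yield 0 or 1.
        let raw = [0u8, 1, 0xFF, 0];
        assert_eq!(normalize_bools(&raw), vec![0, 1, 1, 0]);
    }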
-pub fn bool_cast(tensor: JitTensor) -> JitTensor { +pub fn bool_cast( + tensor: JitTensor, +) -> JitTensor { let num_elems = tensor.shape.num_elements(); let buffer = tensor.client.empty(num_elems * core::mem::size_of::()); let output = JitTensor::new_contiguous( @@ -30,11 +32,11 @@ pub fn bool_cast(tensor: JitTensor) -> JitTens let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(num_elems, cube_dim); - bool_cast_kernel::launch::( + bool_cast_kernel::launch::( &tensor.client, cube_count, cube_dim, - tensor.as_tensor_arg::(1), + tensor.as_tensor_arg::(1), output.as_tensor_arg::(1), ); diff --git a/crates/burn-jit/src/kernel/comparison.rs b/crates/burn-jit/src/kernel/comparison.rs index 420a74d81b..007d4200d9 100644 --- a/crates/burn-jit/src/kernel/comparison.rs +++ b/crates/burn-jit/src/kernel/comparison.rs @@ -1,5 +1,7 @@ -use crate::{element::JitElement, ops::numeric::empty_device, tensor::JitTensor, JitRuntime}; -use burn_tensor::{DType, Shape}; +use crate::{ + element::JitElement, ops::numeric::empty_device, tensor::JitTensor, BoolElement, JitRuntime, +}; +use burn_tensor::Shape; use cubecl::{ calculate_cube_count_elemwise, linalg::tensor::index_offset_with_layout, prelude::*, tensor_vectorization_factor, @@ -55,10 +57,10 @@ impl ComparisonOp for LowerOp { } #[cube(launch)] -pub(crate) fn kernel_scalar_cmp>( +pub(crate) fn kernel_scalar_cmp>( input: &Tensor>, scalar: C, - output: &mut Tensor>, + output: &mut Tensor>, ) { let offset_output = ABSOLUTE_POS; @@ -70,10 +72,10 @@ pub(crate) fn kernel_scalar_cmp>( } #[cube(launch)] -pub(crate) fn kernel_cmp>( +pub(crate) fn kernel_cmp>( lhs: &Tensor>, rhs: &Tensor>, - out: &mut Tensor>, + out: &mut Tensor>, #[comptime] rank: Option, #[comptime] to_contiguous_lhs: bool, #[comptime] to_contiguous_rhs: bool, @@ -87,7 +89,7 @@ pub(crate) fn kernel_cmp>( } if to_contiguous_lhs { - offset_lhs = index_offset_with_layout::( + offset_lhs = index_offset_with_layout::( lhs, out, offset_out, @@ -98,7 +100,7 @@ pub(crate) fn kernel_cmp>( } if to_contiguous_rhs { - offset_rhs = index_offset_with_layout::( + offset_rhs = index_offset_with_layout::( rhs, out, offset_out, @@ -111,7 +113,7 @@ pub(crate) fn kernel_cmp>( out[offset_out] = Line::cast_from(O::execute(lhs[offset_lhs], rhs[offset_rhs])); } -pub(crate) fn launch_cmp>( +pub(crate) fn launch_cmp>( lhs: JitTensor, rhs: JitTensor, ) -> JitTensor { @@ -141,9 +143,9 @@ pub(crate) fn launch_cmp>( let cube_count = calculate_cube_count_elemwise(num_elems / vectorization_factor as usize, cube_dim); - let same_tensor_type = core::any::TypeId::of::() == core::any::TypeId::of::(); + let same_tensor_type = core::any::TypeId::of::() == core::any::TypeId::of::(); if same_tensor_type && lhs.can_mut_broadcast(&rhs) { - kernel_cmp::launch::( + kernel_cmp::launch::( &client, cube_count, cube_dim, @@ -161,10 +163,10 @@ pub(crate) fn launch_cmp>( lhs.shape, lhs.device, lhs.strides, - DType::U32, + BT::dtype(), ) } else if same_tensor_type && rhs.can_mut_broadcast(&lhs) { - kernel_cmp::launch::( + kernel_cmp::launch::( &client, cube_count, CubeDim::default(), @@ -182,20 +184,20 @@ pub(crate) fn launch_cmp>( rhs.shape, rhs.device, rhs.strides, - DType::U32, + BT::dtype(), ) } else { - let output = empty_device::(lhs.client.clone(), lhs.device.clone(), shape_out); + let output = empty_device::(lhs.client.clone(), lhs.device.clone(), shape_out); let to_contiguous_lhs = lhs.strides != output.strides || lhs.shape != output.shape; let to_contiguous_rhs = rhs.strides != output.strides || rhs.shape != 
output.shape; - kernel_cmp::launch::( + kernel_cmp::launch::( &client, cube_count, CubeDim::default(), lhs.as_tensor_arg::(vectorization_factor), rhs.as_tensor_arg::(vectorization_factor), - output.as_tensor_arg::(vectorization_factor), + output.as_tensor_arg::(vectorization_factor), None, to_contiguous_lhs, to_contiguous_rhs, @@ -205,7 +207,12 @@ pub(crate) fn launch_cmp>( } } -pub(crate) fn launch_scalar_cmp>( +pub(crate) fn launch_scalar_cmp< + R: JitRuntime, + E: JitElement, + BT: BoolElement, + O: ComparisonOp, +>( mut tensor: JitTensor, scalar: E, ) -> JitTensor { @@ -224,9 +231,9 @@ pub(crate) fn launch_scalar_cmp let cube_count = calculate_cube_count_elemwise(num_elems / vectorization_factor as usize, cube_dim); - let same_tensor_type = core::any::TypeId::of::() == core::any::TypeId::of::(); + let same_tensor_type = core::any::TypeId::of::() == core::any::TypeId::of::(); if same_tensor_type && tensor.can_mut() { - kernel_scalar_cmp::launch::( + kernel_scalar_cmp::launch::( &client, cube_count, cube_dim, @@ -241,70 +248,94 @@ pub(crate) fn launch_scalar_cmp tensor.shape, tensor.device, tensor.strides, - DType::U32, + BT::dtype(), ) } else { - let output = empty_device::( + let output = empty_device::( tensor.client.clone(), tensor.device.clone(), tensor.shape.clone(), ); - kernel_scalar_cmp::launch::( + kernel_scalar_cmp::launch::( &client, cube_count, CubeDim::default(), tensor.as_tensor_arg::(vectorization_factor), ScalarArg::new(scalar), - output.as_tensor_arg::(vectorization_factor), + output.as_tensor_arg::(vectorization_factor), ); output } } -pub fn equal(lhs: JitTensor, rhs: JitTensor) -> JitTensor { - launch_cmp::(lhs, rhs) +pub fn equal( + lhs: JitTensor, + rhs: JitTensor, +) -> JitTensor { + launch_cmp::(lhs, rhs) } -pub fn greater(lhs: JitTensor, rhs: JitTensor) -> JitTensor { - launch_cmp::(lhs, rhs) +pub fn greater( + lhs: JitTensor, + rhs: JitTensor, +) -> JitTensor { + launch_cmp::(lhs, rhs) } -pub fn greater_equal( +pub fn greater_equal( lhs: JitTensor, rhs: JitTensor, ) -> JitTensor { - launch_cmp::(lhs, rhs) + launch_cmp::(lhs, rhs) } -pub fn lower(lhs: JitTensor, rhs: JitTensor) -> JitTensor { - launch_cmp::(lhs, rhs) +pub fn lower( + lhs: JitTensor, + rhs: JitTensor, +) -> JitTensor { + launch_cmp::(lhs, rhs) } -pub fn lower_equal( +pub fn lower_equal( lhs: JitTensor, rhs: JitTensor, ) -> JitTensor { - launch_cmp::(lhs, rhs) + launch_cmp::(lhs, rhs) } -pub fn equal_elem(lhs: JitTensor, rhs: E) -> JitTensor { - launch_scalar_cmp::(lhs, rhs) +pub fn equal_elem( + lhs: JitTensor, + rhs: E, +) -> JitTensor { + launch_scalar_cmp::(lhs, rhs) } -pub fn greater_elem(lhs: JitTensor, rhs: E) -> JitTensor { - launch_scalar_cmp::(lhs, rhs) +pub fn greater_elem( + lhs: JitTensor, + rhs: E, +) -> JitTensor { + launch_scalar_cmp::(lhs, rhs) } -pub fn lower_elem(lhs: JitTensor, rhs: E) -> JitTensor { - launch_scalar_cmp::(lhs, rhs) +pub fn lower_elem( + lhs: JitTensor, + rhs: E, +) -> JitTensor { + launch_scalar_cmp::(lhs, rhs) } -pub fn greater_equal_elem(lhs: JitTensor, rhs: E) -> JitTensor { - launch_scalar_cmp::(lhs, rhs) +pub fn greater_equal_elem( + lhs: JitTensor, + rhs: E, +) -> JitTensor { + launch_scalar_cmp::(lhs, rhs) } -pub fn lower_equal_elem(lhs: JitTensor, rhs: E) -> JitTensor { - launch_scalar_cmp::(lhs, rhs) +pub fn lower_equal_elem( + lhs: JitTensor, + rhs: E, +) -> JitTensor { + launch_scalar_cmp::(lhs, rhs) } diff --git a/crates/burn-jit/src/kernel/conv/conv2d/base.rs b/crates/burn-jit/src/kernel/conv/conv2d/base.rs index 1796389157..9f07d36c55 100644 --- 
a/crates/burn-jit/src/kernel/conv/conv2d/base.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/base.rs @@ -69,7 +69,7 @@ impl Default for ConvTranspose2dStrategy { /// * `options` - The options to use for the convolution /// * `strategy` - The convolution algorithm to use. Autotune will pick the fastest available option. /// -pub fn conv2d( +pub fn conv2d( input: JitTensor, weight: JitTensor, bias: Option>, @@ -77,13 +77,11 @@ pub fn conv2d( strategy: Conv2dStrategy, ) -> JitTensor { match strategy { - Conv2dStrategy::Direct => conv2d_direct::(input, weight, bias, options), + Conv2dStrategy::Direct => conv2d_direct::(input, weight, bias, options), #[cfg(feature = "autotune")] - Conv2dStrategy::Autotune => conv2d_autotune::(input, weight, bias, options), - Conv2dStrategy::Gemm => conv2d_im2col::(input, weight, bias, options), - Conv2dStrategy::ImplicitGemm => { - conv2d_implicit_gemm::(input, weight, bias, options) - } + Conv2dStrategy::Autotune => conv2d_autotune::(input, weight, bias, options), + Conv2dStrategy::Gemm => conv2d_im2col::(input, weight, bias, options), + Conv2dStrategy::ImplicitGemm => conv2d_implicit_gemm::(input, weight, bias, options), } } @@ -104,14 +102,14 @@ pub fn conv_transpose2d( ) -> JitTensor { match strategy { ConvTranspose2dStrategy::Direct => { - conv_transpose2d_direct::(input, weight, bias, options) + conv_transpose2d_direct::(input, weight, bias, options) } #[cfg(feature = "autotune")] ConvTranspose2dStrategy::Autotune => { - conv_transpose2d_autotune::(input, weight, bias, options) + conv_transpose2d_autotune::(input, weight, bias, options) } ConvTranspose2dStrategy::Gemm => { - conv_transpose2d_col2im::(input, weight, bias, options) + conv_transpose2d_col2im::(input, weight, bias, options) } } } diff --git a/crates/burn-jit/src/kernel/conv/conv2d/col2im.rs b/crates/burn-jit/src/kernel/conv/conv2d/col2im.rs index 846aa3d8dd..0659561805 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/col2im.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/col2im.rs @@ -1,14 +1,18 @@ use burn_tensor::{ - ops::{conv::calculate_conv_transpose_output_size, ConvTransposeOptions, FloatTensorOps as _}, + ops::{conv::calculate_conv_transpose_output_size, ConvTransposeOptions}, Shape, }; use cubecl::{calculate_cube_count_elemwise, prelude::*}; use crate::{ - kernel::into_contiguous, + kernel::{ + into_contiguous, + matmul::{matmul, MatmulStrategy}, + slice, + }, ops::{numeric::empty_device, reshape, swap_dims}, tensor::JitTensor, - FloatElement, IntElement, JitBackend, JitRuntime, + FloatElement, JitElement, JitRuntime, }; use super::batches_per_run; @@ -20,7 +24,7 @@ use super::batches_per_run; /// * `bias` - The bias added to each channel /// * `options` - The options to use for the convolution /// -pub fn conv_transpose2d_col2im( +pub fn conv_transpose2d_col2im( input: JitTensor, weight: JitTensor, bias: Option>, @@ -77,12 +81,12 @@ pub fn conv_transpose2d_col2im( let input_shape_run = Shape::new([batches_per_run, input_channels, input_h, input_w]); for run in 0..runs { - let input = JitBackend::::float_narrow(input.clone(), 0, run, 1); + let input = index::(input.clone(), run); let input = reshape(input, input_shape_run.clone()); let im_shape = Shape::new([batches_per_run, im_channels, im_h, im_w]); - let image_slice = JitBackend::::float_narrow(image.clone(), 0, run, 1); + let image_slice = index::(image.clone(), run); let image_slice = reshape(image_slice, im_shape); - execute::( + execute::( input, weight.clone(), bias.clone(), @@ -96,7 +100,7 @@ pub fn 
conv_transpose2d_col2im( } else { let im_shape = Shape::new([batches_per_run, im_channels, im_h, im_w]); let image = empty_device::(input.client.clone(), input.device.clone(), im_shape); - execute::( + execute::( input, weight, bias, @@ -109,8 +113,21 @@ pub fn conv_transpose2d_col2im( } } +pub(crate) fn index(tensor: JitTensor, i: usize) -> JitTensor { + #[allow(clippy::single_range_in_vec_init)] + let mut indices = vec![i..i + 1]; + for dim in tensor.shape.dims[1..].iter() { + indices.push(0..*dim); + } + let new_shape = Shape { + dims: tensor.shape.dims[1..].to_vec(), + }; + let tensor = slice::(tensor, &indices); + reshape(tensor, new_shape) +} + #[allow(clippy::too_many_arguments)] -fn execute( +fn execute( input: JitTensor, weight: JitTensor, bias: Option>, @@ -128,7 +145,7 @@ fn execute( let input_shape = Shape::new([groups, input_ch_per_group, col_shape_1]); let input = reshape(input, input_shape); - let columns = JitBackend::::float_matmul(weight, input); + let columns = matmul::(weight, input, MatmulStrategy::default()); let columns = reshape(columns, Shape::new([col_shape_0 * groups, col_shape_1])); col2im::( diff --git a/crates/burn-jit/src/kernel/conv/conv2d/direct.rs b/crates/burn-jit/src/kernel/conv/conv2d/direct.rs index 9a65b6ae51..d5154ecc4b 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/direct.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/direct.rs @@ -11,7 +11,7 @@ use crate::{ reshape, }, tensor::JitTensor, - FloatElement, IntElement, JitRuntime, + FloatElement, JitRuntime, }; #[derive(CubeLaunch)] @@ -120,8 +120,7 @@ fn direct_conv2d_kernel( /// * `bias` - The bias added to each channel /// * `options` - The options to use for the convolution /// -#[allow(clippy::extra_unused_type_parameters)] -pub fn conv2d_direct( +pub fn conv2d_direct( input: JitTensor, weight: JitTensor, bias: Option>, diff --git a/crates/burn-jit/src/kernel/conv/conv2d/im2col.rs b/crates/burn-jit/src/kernel/conv/conv2d/im2col.rs index 88125f0463..abcb8488fb 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/im2col.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/im2col.rs @@ -1,14 +1,16 @@ use burn_tensor::{ - ops::{conv::calculate_conv_output_size, ConvOptions, FloatTensorOps as _}, + ops::{conv::calculate_conv_output_size, ConvOptions}, Shape, }; use cubecl::{calculate_cube_count_elemwise, linalg::matmul, prelude::*}; use crate::{ - kernel::into_contiguous, + kernel::{ + conv::index, into_contiguous, launch_binop, matmul::matmul, matmul::MatmulStrategy, AddOp, + }, ops::{numeric::empty_device, reshape, swap_dims}, tensor::JitTensor, - FloatElement, IntElement, JitBackend, JitRuntime, + FloatElement, JitRuntime, }; #[derive(CubeLaunch)] @@ -178,7 +180,7 @@ fn im2col( /// * `bias` - The bias added to each channel /// * `options` - The options to use for the convolution /// -pub fn conv2d_im2col( +pub fn conv2d_im2col( input: JitTensor, weight: JitTensor, bias: Option>, @@ -206,7 +208,7 @@ pub fn conv2d_im2col( if kernel_h == 1 && kernel_w == 1 && in_height == out_h && in_width == out_w { // Special case for 1x1 kernels (sometimes used to scale the image by a set of weights) - return execute_1x1_kernel::(input, weight, bias, options); + return execute_1x1_kernel::(input, weight, bias, options); } let batches_per_run = batches_per_run(batch_size, out_h, out_w) @@ -221,9 +223,9 @@ pub fn conv2d_im2col( let input = reshape(input, in_shape); let in_shape_run = Shape::new([batches_per_run, in_channels, in_height, in_width]); for run in 0..runs { - let input = 
JitBackend::::float_narrow(input.clone(), 0, run, 1); + let input = index::(input.clone(), run); let input = reshape(input, in_shape_run.clone()); - let out_slice = JitBackend::::float_narrow(out.clone(), 0, run, 1); + let out_slice = index::(out.clone(), run); let out_slice = reshape(out_slice, matmul_shape.clone()); execute::( input, @@ -245,12 +247,12 @@ pub fn conv2d_im2col( if let Some(bias) = bias { let bias = reshape(bias, Shape::new([1, out_channels, 1, 1])); - out = JitBackend::::float_add(out, bias) + out = launch_binop::(out, bias) } out } -fn execute_1x1_kernel( +fn execute_1x1_kernel( input: JitTensor, weight: JitTensor, bias: Option>, @@ -266,12 +268,12 @@ fn execute_1x1_kernel( let weight = reshape(weight, Shape::new([groups, out_c_per_grp, in_c_per_grp])); let in_shape = Shape::new([groups, in_c_per_grp, batch_size * height * width]); let input = reshape(input, in_shape); - let out = JitBackend::::float_matmul(weight, input); + let out = matmul::(weight, input, MatmulStrategy::default()); let mut out = reshape(out, Shape::new([out_channels, batch_size, height, width])); if let Some(bias) = bias { let bias = reshape(bias, Shape::new([out_channels, 1, 1, 1])); - out = JitBackend::::float_add(out, bias) + out = launch_binop::(out, bias) } swap_dims(out, 0, 1) diff --git a/crates/burn-jit/src/kernel/conv/conv2d/implicit_gemm.rs b/crates/burn-jit/src/kernel/conv/conv2d/implicit_gemm.rs index 49a639ef43..6771f2c5e2 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/implicit_gemm.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/implicit_gemm.rs @@ -18,7 +18,7 @@ use crate::{ permute, }, tensor::JitTensor, - FloatElement, IntElement, JitRuntime, + FloatElement, JitRuntime, }; use super::nchw_to_nhwc; @@ -30,8 +30,7 @@ use super::nchw_to_nhwc; /// * `bias` - The bias added to each channel /// * `options` - The options to use for the convolution /// -#[allow(clippy::extra_unused_type_parameters)] -pub fn conv2d_implicit_gemm( +pub fn conv2d_implicit_gemm( input: JitTensor, weight: JitTensor, bias: Option>, diff --git a/crates/burn-jit/src/kernel/conv/conv2d/transpose_direct.rs b/crates/burn-jit/src/kernel/conv/conv2d/transpose_direct.rs index 1062241d75..6a97ab8759 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/transpose_direct.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/transpose_direct.rs @@ -8,7 +8,7 @@ use crate::{ reshape, }, tensor::JitTensor, - IntElement, JitRuntime, + JitRuntime, }; use burn_tensor::{ops::ConvTransposeOptions, Shape}; @@ -121,8 +121,7 @@ fn conv_transpose2d_direct_kernel( /// * `bias` - The bias added to each channel /// * `options` - The options to use for the convolution /// -#[allow(clippy::extra_unused_type_parameters)] -pub fn conv_transpose2d_direct( +pub fn conv_transpose2d_direct( input: JitTensor, weight: JitTensor, bias: Option>, diff --git a/crates/burn-jit/src/kernel/conv/conv2d/tune/conv2d.rs b/crates/burn-jit/src/kernel/conv/conv2d/tune/conv2d.rs index 05ec7fd960..4a8122a478 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/tune/conv2d.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/tune/conv2d.rs @@ -16,13 +16,13 @@ use crate::{ prng::random_uniform, }, tensor::JitTensor, - FloatElement, IntElement, JitAutotuneKey, JitRuntime, JitTuneId, + FloatElement, JitAutotuneKey, JitRuntime, JitTuneId, }; use super::Conv2dAutotuneKey; /// Executes autotune on conv2d operations -pub fn conv2d_autotune( +pub fn conv2d_autotune( input: JitTensor, weights: JitTensor, bias: Option>, @@ -35,9 +35,7 @@ pub fn conv2d_autotune( TUNER.execute( 
&JitTuneId::new::(&input.device), &client, - Box::new(Conv2dOperations::::new( - input, weights, bias, options, - )), + Box::new(Conv2dOperations::::new(input, weights, bias, options)), ) } @@ -46,7 +44,7 @@ pub fn conv2d_autotune( create_key = create_key::, should_run = should_run )] -pub fn conv2d_operations( +pub fn conv2d_operations( key: JitAutotuneKey, input: JitTensor, weights: JitTensor, @@ -74,8 +72,8 @@ pub fn conv2d_operations( tune_with!(input, weights, bias, options) } -fn should_run( - op: &Conv2dOperations, +fn should_run( + op: &Conv2dOperations, key: &JitAutotuneKey, index: usize, ) -> bool { diff --git a/crates/burn-jit/src/kernel/conv/conv2d/tune/conv_transpose2d.rs b/crates/burn-jit/src/kernel/conv/conv2d/tune/conv_transpose2d.rs index 3a8c1d04f2..c2d546151a 100644 --- a/crates/burn-jit/src/kernel/conv/conv2d/tune/conv_transpose2d.rs +++ b/crates/burn-jit/src/kernel/conv/conv2d/tune/conv_transpose2d.rs @@ -10,13 +10,13 @@ use crate::{ prng::random_uniform, }, tensor::JitTensor, - FloatElement, IntElement, JitAutotuneKey, JitRuntime, JitTuneId, + FloatElement, JitAutotuneKey, JitRuntime, JitTuneId, }; use super::ConvTranspose2dAutotuneKey; /// Executes autotune on conv2d operations -pub fn conv_transpose2d_autotune( +pub fn conv_transpose2d_autotune( input: JitTensor, weights: JitTensor, bias: Option>, @@ -29,14 +29,14 @@ pub fn conv_transpose2d_autotune( TUNER.execute( &JitTuneId::new::(&input.device), &client, - Box::new(ConvTranspose2dOperations::::new( + Box::new(ConvTranspose2dOperations::::new( input, weights, bias, options, )), ) } #[tune(operations(conv_transpose2d_direct, conv_transpose2d_col2im), create_key = create_key::, should_run = should_run)] -pub fn conv_transpose2d_operations( +pub fn conv_transpose2d_operations( key: JitAutotuneKey, input: JitTensor, weights: JitTensor, @@ -95,8 +95,8 @@ fn create_key( )) } -fn should_run( - _op: &ConvTranspose2dOperations, +fn should_run( + _op: &ConvTranspose2dOperations, key: &JitAutotuneKey, index: usize, ) -> bool { diff --git a/crates/burn-jit/src/kernel/conv/deform_conv2d.rs b/crates/burn-jit/src/kernel/conv/deform_conv2d.rs index b005a2384c..438850fe72 100644 --- a/crates/burn-jit/src/kernel/conv/deform_conv2d.rs +++ b/crates/burn-jit/src/kernel/conv/deform_conv2d.rs @@ -1,18 +1,22 @@ use cubecl::{calculate_cube_count_elemwise, prelude::*}; use burn_tensor::{ - ops::{conv::calculate_conv_output_size, DeformConvOptions, FloatTensorOps as _}, + ops::{conv::calculate_conv_output_size, DeformConvOptions}, Shape, }; use crate::{ - kernel::into_contiguous, + kernel::{ + into_contiguous, launch_binop, + matmul::{matmul, MatmulStrategy}, + AddOp, + }, ops::{ numeric::{ones_device, zeros_device}, reshape, swap_dims, }, tensor::JitTensor, - FloatElement, IntElement, JitBackend, JitRuntime, + FloatElement, JitRuntime, }; #[derive(CubeLaunch)] @@ -251,7 +255,7 @@ pub(crate) fn deform_im2col( output } -pub(crate) fn deform_conv2d( +pub(crate) fn deform_conv2d( input: JitTensor, offset: JitTensor, weight: JitTensor, @@ -294,24 +298,15 @@ pub(crate) fn deform_conv2d( let weight = reshape(weight, Shape::new([groups, out_c_per_group, col_size_0])); let columns = reshape(columns, Shape::new([groups, col_size_0, col_size_1])); - let out = JitBackend::::float_matmul(weight, columns); + let out = matmul::(weight, columns, MatmulStrategy::default()); let out = reshape(out, Shape::new([out_channels, batch_size, out_h, out_w])); let out = swap_dims(out, 0, 1); if let Some(bias) = bias { let bias = reshape(bias, Shape::new([1, 
out_channels, 1, 1])); - JitBackend::::float_add(out, bias) + launch_binop::(out, bias) } else { out } } - -pub(crate) fn index( - tensor: JitTensor, - index: usize, -) -> JitTensor { - let [_, shape_0, shape_1] = tensor.shape.dims(); - let tensor = JitBackend::::float_narrow(tensor, 0, index, 1); - reshape(tensor, Shape::new([shape_0, shape_1])) -} diff --git a/crates/burn-jit/src/kernel/conv/deform_conv_transpose2d.rs b/crates/burn-jit/src/kernel/conv/deform_conv_transpose2d.rs index 4022a0bbe2..907b5ef344 100644 --- a/crates/burn-jit/src/kernel/conv/deform_conv_transpose2d.rs +++ b/crates/burn-jit/src/kernel/conv/deform_conv_transpose2d.rs @@ -5,7 +5,12 @@ use burn_tensor::{ use cubecl::{calculate_cube_count_elemwise, cube, prelude::*, CubeDim, CubeLaunch}; use crate::{ - kernel::{cast, into_contiguous}, + element::BoolElement, + kernel::{ + cast, into_contiguous, + matmul::{matmul, MatmulStrategy}, + slice_assign, + }, ops::{ numeric::{empty_device, ones_device, zeros_device}, reshape, swap_dims, @@ -18,7 +23,12 @@ use super::{bilinear_interpolate, deform_im2col, index}; /// Calculate the [deformable 2D convolution](crate::ops::ModuleOps::deform_conv2d) backward pass using convolutions. #[allow(clippy::single_range_in_vec_init)] -pub(crate) fn deform_conv2d_backward( +pub(crate) fn deform_conv2d_backward< + R: JitRuntime, + E: FloatElement, + I: IntElement, + BT: BoolElement, +>( input: JitTensor, offset: JitTensor, weight: JitTensor, @@ -26,14 +36,14 @@ pub(crate) fn deform_conv2d_backward>, out_grad: JitTensor, options: DeformConvOptions<2>, -) -> DeformConv2dBackward> { +) -> DeformConv2dBackward> { let [_, _, out_h, out_w] = out_grad.shape.dims(); let [_, _, kernel_h, kernel_w] = weight.shape.dims(); let gradient_bias = bias.map(|bias| { - let grad = JitBackend::::float_sum_dim(out_grad.clone(), 0); - let grad = JitBackend::::float_sum_dim(grad, 2); - let grad = JitBackend::::float_sum_dim(grad, 3); + let grad = JitBackend::::float_sum_dim(out_grad.clone(), 0); + let grad = JitBackend::::float_sum_dim(grad, 2); + let grad = JitBackend::::float_sum_dim(grad, 3); reshape(grad, bias.shape) }); @@ -42,7 +52,7 @@ pub(crate) fn deform_conv2d_backward( + let (input_gradient, offset_gradient, mask_gradient) = backward_gradient_inputs::( input.clone(), weight.clone(), offset.clone(), @@ -52,7 +62,7 @@ pub(crate) fn deform_conv2d_backward( + let weight_grad = compute_weight_grad::( input, offset, mask, @@ -71,7 +81,7 @@ pub(crate) fn deform_conv2d_backward( +fn compute_weight_grad( input: JitTensor, offset: JitTensor, mask: Option>, @@ -98,9 +108,9 @@ fn compute_weight_grad( let columns = reshape(columns, Shape::new([groups, col_size_0, col_size_1])); let columns = swap_dims(columns, 1, 2); - let grad_weight = JitBackend::::float_matmul(out_grad, columns); + let grad_weight = matmul::(out_grad, columns, MatmulStrategy::default()); - JitBackend::::float_reshape( + reshape( grad_weight, Shape::new([out_channels, in_c_per_group, kernel_h, kernel_w]), ) @@ -108,7 +118,7 @@ fn compute_weight_grad( type InputGradients = (JitTensor, JitTensor, Option>); -fn backward_gradient_inputs( +fn backward_gradient_inputs( image: JitTensor, weight: JitTensor, offset: JitTensor, @@ -138,11 +148,11 @@ fn backward_gradient_inputs( let out_grad = reshape(out_grad, out_grad_shape); for group in 0..groups { - let weight = swap_dims(index::(weight.clone(), group), 0, 1); - let out_grad = index::(out_grad.clone(), group); - let values = JitBackend::::float_matmul(weight, out_grad); + let weight = 
swap_dims(index::(weight.clone(), group), 0, 1); + let out_grad = index::(out_grad.clone(), group); + let values = matmul::(weight, out_grad, MatmulStrategy::default()); let values = reshape(values, Shape::new([1, col_shape_0, col_shape_1])); - columns = JitBackend::::float_slice_assign( + columns = slice_assign::( columns, &[group..group + 1, 0..col_shape_0, 0..col_shape_1], values, diff --git a/crates/burn-jit/src/kernel/index/flip.rs b/crates/burn-jit/src/kernel/index/flip.rs index e35cac8b2c..583e0346d3 100644 --- a/crates/burn-jit/src/kernel/index/flip.rs +++ b/crates/burn-jit/src/kernel/index/flip.rs @@ -1,4 +1,6 @@ -use crate::{element::JitElement, ops::numeric::empty_device, tensor::JitTensor, JitRuntime}; +use crate::{ + element::JitElement, ops::numeric::empty_device, tensor::JitTensor, BoolElement, JitRuntime, +}; use cubecl::{calculate_cube_count_elemwise, prelude::*}; #[cube(launch_unchecked)] @@ -31,7 +33,7 @@ fn flip_kernel( output[ABSOLUTE_POS] = input[offset_input]; } -pub(crate) fn flip( +pub(crate) fn flip( tensor: JitTensor, indices: &[usize], ) -> JitTensor { @@ -40,26 +42,26 @@ pub(crate) fn flip( tensor.device.clone(), tensor.shape.clone(), ); - flip_on_output::(tensor, output, indices) + flip_on_output::(tensor, output, indices) } -pub(crate) fn flip_on_output( +pub(crate) fn flip_on_output( tensor: JitTensor, output: JitTensor, indices: &[usize], ) -> JitTensor { let ndims = tensor.shape.num_dims(); - let mut indices_sequence = SequenceArg::<'_, R, u32>::new(); + let mut indices_sequence = SequenceArg::<'_, R, BT>::new(); for i in 0..ndims { - indices_sequence.push(ScalarArg::new(indices.contains(&i) as u32)); + indices_sequence.push(ScalarArg::new(BT::new_bool(indices.contains(&i)))); } let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(output.shape.num_elements(), cube_dim); unsafe { - flip_kernel::launch_unchecked::( + flip_kernel::launch_unchecked::( &tensor.client, cube_count, cube_dim, diff --git a/crates/burn-jit/src/kernel/mask/base.rs b/crates/burn-jit/src/kernel/mask/base.rs index 2140972326..d37c6e05bb 100644 --- a/crates/burn-jit/src/kernel/mask/base.rs +++ b/crates/burn-jit/src/kernel/mask/base.rs @@ -1,8 +1,8 @@ use super::{mask_where::MaskWhereStrategy, MaskFillStrategy}; -use crate::{element::JitElement, tensor::JitTensor, JitRuntime}; +use crate::{element::JitElement, tensor::JitTensor, BoolElement, JitRuntime}; /// Execute the mask fill kernel. -pub(crate) fn mask_fill_auto( +pub(crate) fn mask_fill_auto( tensor: JitTensor, mask: JitTensor, value: E, @@ -13,11 +13,11 @@ pub(crate) fn mask_fill_auto( MaskFillStrategy::Readonly }; - super::mask_fill(tensor, mask, value, strategy) + super::mask_fill::(tensor, mask, value, strategy) } /// Execute the mask where kernel. 
-pub(crate) fn mask_where_auto( +pub(crate) fn mask_where_auto( tensor: JitTensor, mask: JitTensor, value: JitTensor, @@ -30,5 +30,5 @@ pub(crate) fn mask_where_auto( MaskWhereStrategy::Readonly }; - super::mask_where::(tensor, mask, value, strategy) + super::mask_where::(tensor, mask, value, strategy) } diff --git a/crates/burn-jit/src/kernel/mask/mask_fill.rs b/crates/burn-jit/src/kernel/mask/mask_fill.rs index e8b3f814d9..386e7a5039 100644 --- a/crates/burn-jit/src/kernel/mask/mask_fill.rs +++ b/crates/burn-jit/src/kernel/mask/mask_fill.rs @@ -1,11 +1,16 @@ use cubecl::{calculate_cube_count_elemwise, linalg::tensor::index_offset_with_layout, prelude::*}; -use crate::{element::JitElement, ops::numeric::empty_device, tensor::JitTensor, JitRuntime}; +use crate::{ + element::JitElement, + ops::{max_vectorization, numeric::empty_device}, + tensor::JitTensor, + BoolElement, JitRuntime, +}; #[cube(launch)] -fn mask_fill_readonly_kernel( +fn mask_fill_readonly_kernel( input: &Tensor>, - mask: &Tensor>, + mask: &Tensor>, output: &mut Tensor>, value: T, #[comptime] rank: u32, @@ -17,17 +22,15 @@ fn mask_fill_readonly_kernel( let index_input = index_offset_with_layout(input, output, ABSOLUTE_POS, 0, rank, true); let index_mask = index_offset_with_layout(mask, output, ABSOLUTE_POS, 0, rank, true); - if mask[index_mask] >= Line::new(1) { - output[ABSOLUTE_POS] = Line::new(value); - } else { - output[ABSOLUTE_POS] = input[index_input]; - } + let mask = Line::cast_from(mask[index_mask]); + + output[ABSOLUTE_POS] = select_many(mask, Line::new(value), input[index_input]); } #[cube(launch)] -fn mask_fill_inplace_kernel( +fn mask_fill_inplace_kernel( input: &mut Tensor>, - mask: &Tensor>, + mask: &Tensor>, value: T, #[comptime] rank: u32, ) { @@ -36,10 +39,9 @@ fn mask_fill_inplace_kernel( } let index_mask = index_offset_with_layout(mask, input, ABSOLUTE_POS, 0, rank, true); + let mask = Line::cast_from(mask[index_mask]); - if mask[index_mask] >= Line::new(1) { - input[ABSOLUTE_POS] = Line::new(value); - } + input[ABSOLUTE_POS] = select_many(mask, Line::new(value), input[ABSOLUTE_POS]); } #[derive(Clone, Copy, Debug)] @@ -56,19 +58,19 @@ pub enum MaskFillStrategy { } /// Execute the mask fill kernel with the given strategy. 
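The rewritten mask kernels replace the scalar `if mask >= 1` branch with a per-lane `select_many` over vectorized lines, which is what allows them to launch with the maximum supported vectorization instead of a factor of 1. A standalone, host-side model of that per-lane selection (the function and types here are illustrative, not the cubecl API):

    /// For each lane, take `on_true` where the mask is non-zero, otherwise keep
    /// `on_false` — the branchless select the mask kernels now apply per line.
    fn select_lanes(mask: &[u8], on_true: &[f32], on_false: &[f32]) -> Vec<f32> {
        mask.iter()
            .zip(on_true.iter().zip(on_false.iter()))
            .map(|(&m, (&t, &f))| if m != 0 { t } else { f })
            .collect()
    }

    fn main() {
        let mask = [1u8, 0, 1, 0];
        let fill = [9.0f32; 4];
        let input = [1.0f32, 2.0, 3.0, 4.0];
        // mask_fill: write the fill value where the mask is set, keep the input elsewhere.
        assert_eq!(select_lanes(&mask, &fill, &input), vec![9.0, 2.0, 9.0, 4.0]);
    }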
-pub fn mask_fill( +pub fn mask_fill( input: JitTensor, mask: JitTensor, value: E, strategy: MaskFillStrategy, ) -> JitTensor { match strategy { - MaskFillStrategy::Readonly => mask_fill_readonly::(input, mask, value), - MaskFillStrategy::Inplace => mask_fill_inplace::(input, mask, value), + MaskFillStrategy::Readonly => mask_fill_readonly::(input, mask, value), + MaskFillStrategy::Inplace => mask_fill_inplace::(input, mask, value), } } -fn mask_fill_readonly( +fn mask_fill_readonly( input: JitTensor, mask: JitTensor, value: EI, @@ -82,14 +84,15 @@ fn mask_fill_readonly( let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(input.shape.num_elements(), cube_dim); + let vectorization = max_vectorization(&input); - mask_fill_readonly_kernel::launch::( + mask_fill_readonly_kernel::launch::( &input.client, cube_count, cube_dim, - input.as_tensor_arg::(1), - mask.as_tensor_arg::(1), - output.as_tensor_arg::(1), + input.as_tensor_arg::(vectorization), + mask.as_tensor_arg::(vectorization), + output.as_tensor_arg::(vectorization), ScalarArg::new(value), ndims as u32, ); @@ -97,7 +100,7 @@ fn mask_fill_readonly( output } -fn mask_fill_inplace( +fn mask_fill_inplace( input: JitTensor, mask: JitTensor, value: EI, @@ -105,13 +108,14 @@ fn mask_fill_inplace( let ndims = input.shape.num_dims(); let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(input.shape.num_elements(), cube_dim); + let vectorization = max_vectorization(&input); - mask_fill_inplace_kernel::launch::( + mask_fill_inplace_kernel::launch::( &input.client, cube_count, cube_dim, - input.as_tensor_arg::(1), - mask.as_tensor_arg::(1), + input.as_tensor_arg::(vectorization), + mask.as_tensor_arg::(vectorization), ScalarArg::new(value), ndims as u32, ); diff --git a/crates/burn-jit/src/kernel/mask/mask_where.rs b/crates/burn-jit/src/kernel/mask/mask_where.rs index 73c7c8fcf1..5518e9648b 100644 --- a/crates/burn-jit/src/kernel/mask/mask_where.rs +++ b/crates/burn-jit/src/kernel/mask/mask_where.rs @@ -1,11 +1,16 @@ use cubecl::{calculate_cube_count_elemwise, linalg::tensor::index_offset_with_layout, prelude::*}; -use crate::{element::JitElement, ops::numeric::empty_device, tensor::JitTensor, JitRuntime}; +use crate::{ + element::JitElement, + ops::{max_vectorization, numeric::empty_device}, + tensor::JitTensor, + BoolElement, JitRuntime, +}; #[cube(launch)] -fn mask_where_readonly_kernel( +fn mask_where_readonly_kernel( input: &Tensor>, - mask: &Tensor>, + mask: &Tensor>, value: &Tensor>, output: &mut Tensor>, #[comptime] rank: u32, @@ -17,20 +22,17 @@ fn mask_where_readonly_kernel( let index_input = index_offset_with_layout(input, output, ABSOLUTE_POS, 0, rank, true); let index_mask = index_offset_with_layout(mask, output, ABSOLUTE_POS, 0, rank, true); let index_value = index_offset_with_layout(value, output, ABSOLUTE_POS, 0, rank, true); + let mask = Line::cast_from(mask[index_mask]); - if mask[index_mask] >= Line::new(1) { - output[ABSOLUTE_POS] = value[index_value]; - } else { - output[ABSOLUTE_POS] = input[index_input]; - } + output[ABSOLUTE_POS] = select_many(mask, value[index_value], input[index_input]); } #[cube(launch)] -fn mask_where_inplace_kernel( +fn mask_where_inplace_kernel( input: &mut Tensor>, - mask: &Tensor>, + mask: &Tensor>, value: &Tensor>, - reverse: u32, + reverse: B, #[comptime] rank: u32, ) { if ABSOLUTE_POS >= input.len() { @@ -40,9 +42,11 @@ fn mask_where_inplace_kernel( let index_mask = index_offset_with_layout(mask, input, ABSOLUTE_POS, 0, rank, true); let 
index_value = index_offset_with_layout(value, input, ABSOLUTE_POS, 0, rank, true); - if mask[index_mask] != Line::new(reverse) { - input[ABSOLUTE_POS] = value[index_value]; - } + input[ABSOLUTE_POS] = select( + mask[index_mask] != Line::new(reverse), + value[index_value], + input[ABSOLUTE_POS], + ); } #[derive(Clone, Copy, Debug)] @@ -61,20 +65,20 @@ pub enum MaskWhereStrategy { } /// Execute the mask where kernel with the given strategy. -pub fn mask_where( +pub fn mask_where( input: JitTensor, mask: JitTensor, value: JitTensor, strategy: MaskWhereStrategy, ) -> JitTensor { match strategy { - MaskWhereStrategy::Readonly => mask_where_readonly::(input, mask, value), - MaskWhereStrategy::InplaceLhs => mask_where_inplace::(input, mask, value, false), - MaskWhereStrategy::InplaceRhs => mask_where_inplace::(value, mask, input, true), + MaskWhereStrategy::Readonly => mask_where_readonly::(input, mask, value), + MaskWhereStrategy::InplaceLhs => mask_where_inplace::(input, mask, value, false), + MaskWhereStrategy::InplaceRhs => mask_where_inplace::(value, mask, input, true), } } -fn mask_where_readonly( +fn mask_where_readonly( input: JitTensor, mask: JitTensor, value: JitTensor, @@ -88,22 +92,23 @@ fn mask_where_readonly( let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(input.shape.num_elements(), cube_dim); + let vectorization = max_vectorization(&input); - mask_where_readonly_kernel::launch::( + mask_where_readonly_kernel::launch::( &input.client, cube_count, cube_dim, - input.as_tensor_arg::(1), - mask.as_tensor_arg::(1), - value.as_tensor_arg::(1), - output.as_tensor_arg::(1), + input.as_tensor_arg::(vectorization), + mask.as_tensor_arg::(vectorization), + value.as_tensor_arg::(vectorization), + output.as_tensor_arg::(vectorization), ndims as u32, ); output } -fn mask_where_inplace( +fn mask_where_inplace( input: JitTensor, mask: JitTensor, value: JitTensor, @@ -112,15 +117,16 @@ fn mask_where_inplace( let ndims = input.shape.num_dims(); let cube_dim = CubeDim::default(); let cube_count = calculate_cube_count_elemwise(input.shape.num_elements(), cube_dim); + let vectorization = max_vectorization(&input); - mask_where_inplace_kernel::launch::( + mask_where_inplace_kernel::launch::( &input.client, cube_count, cube_dim, - input.as_tensor_arg::(1), - mask.as_tensor_arg::(1), - value.as_tensor_arg::(1), - ScalarArg::new(reverse as u32), + input.as_tensor_arg::(vectorization), + mask.as_tensor_arg::(vectorization), + value.as_tensor_arg::(vectorization), + ScalarArg::new(EM::new_bool(reverse)), ndims as u32, ); diff --git a/crates/burn-jit/src/lib.rs b/crates/burn-jit/src/lib.rs index 77a67df37a..ba953ae0d0 100644 --- a/crates/burn-jit/src/lib.rs +++ b/crates/burn-jit/src/lib.rs @@ -21,7 +21,7 @@ pub mod element; use burn_tensor::backend::{DeviceId, DeviceOps}; use cubecl::{compute::CubeTask, Feature, Runtime}; -pub use element::{FloatElement, IntElement, JitElement}; +pub use element::{BoolElement, FloatElement, IntElement, JitElement}; mod backend; diff --git a/crates/burn-jit/src/ops/activation_ops.rs b/crates/burn-jit/src/ops/activation_ops.rs index 7f6b921d16..eecd6849c8 100644 --- a/crates/burn-jit/src/ops/activation_ops.rs +++ b/crates/burn-jit/src/ops/activation_ops.rs @@ -1,10 +1,11 @@ -use crate::{FloatElement, IntElement, JitBackend, JitRuntime}; +use crate::{element::BoolElement, FloatElement, IntElement, JitBackend, JitRuntime}; use burn_tensor::ops::ActivationOps; -impl ActivationOps for JitBackend +impl ActivationOps for JitBackend where R: 
JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { } diff --git a/crates/burn-jit/src/ops/base.rs b/crates/burn-jit/src/ops/base.rs index 58e3b25c0c..bce600604e 100644 --- a/crates/burn-jit/src/ops/base.rs +++ b/crates/burn-jit/src/ops/base.rs @@ -1,6 +1,6 @@ -use crate::{element::JitElement, kernel, tensor::JitTensor, JitRuntime}; +use crate::{element::JitElement, kernel, tensor::JitTensor, BoolElement, JitRuntime}; use burn_tensor::{Shape, TensorData}; -use cubecl::{tensor_vectorization_factor, CubeElement}; +use cubecl::tensor_vectorization_factor; pub(crate) fn from_data( data: TensorData, @@ -29,11 +29,16 @@ pub fn into_data_sync(tensor: JitTensor) -> Ten TensorData::new(E::from_bytes(&bytes).to_vec(), tensor.shape) } -pub(crate) async fn bool_into_data(tensor: JitTensor) -> TensorData { +pub(crate) async fn bool_into_data( + tensor: JitTensor, +) -> TensorData { let tensor = kernel::into_contiguous(tensor); let bytes = tensor.client.read_one_async(tensor.handle.binding()).await; TensorData::new( - u32::from_bytes(&bytes).iter().map(|i| *i != 0).collect(), + BT::from_bytes(&bytes) + .iter() + .map(|i| *i != BT::false_val()) + .collect(), tensor.shape, ) } diff --git a/crates/burn-jit/src/ops/bool_ops.rs b/crates/burn-jit/src/ops/bool_ops.rs index 036913e88d..017e76f2c4 100644 --- a/crates/burn-jit/src/ops/bool_ops.rs +++ b/crates/burn-jit/src/ops/bool_ops.rs @@ -1,31 +1,32 @@ -use crate::{kernel, FloatElement, IntElement, JitBackend, JitRuntime}; +use crate::{element::BoolElement, kernel, FloatElement, IntElement, JitBackend, JitRuntime}; use burn_tensor::ops::{BoolTensor, Device, FloatTensor, IntTensor}; use burn_tensor::{ops::BoolTensorOps, Shape, TensorData}; use std::ops::Range; use super::{expand, permute}; -impl BoolTensorOps for JitBackend +impl BoolTensorOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn bool_empty(shape: Shape, device: &Device) -> BoolTensor { - super::empty::(shape, device) + super::empty::(shape, device) } async fn bool_into_data(tensor: BoolTensor) -> TensorData { - super::bool_into_data(tensor).await + super::bool_into_data::(tensor).await } fn bool_from_data(data: TensorData, device: &Device) -> BoolTensor { - let data: TensorData = TensorData::new(data.iter::().collect(), data.shape); - super::from_data::(data, device) + let data: TensorData = TensorData::new(data.iter::().collect(), data.shape); + super::from_data::(data, device) } fn bool_into_int(tensor: BoolTensor) -> IntTensor { - kernel::bool_cast::(tensor) + kernel::bool_cast::(tensor) } fn bool_device(tensor: &BoolTensor) -> Device { @@ -41,7 +42,7 @@ where } fn bool_slice(tensor: BoolTensor, ranges: &[Range]) -> BoolTensor { - kernel::slice::(tensor, ranges) + kernel::slice::(tensor, ranges) } fn bool_slice_assign( @@ -49,19 +50,19 @@ where ranges: &[Range], value: BoolTensor, ) -> BoolTensor { - kernel::slice_assign::(tensor, ranges, value) + kernel::slice_assign::(tensor, ranges, value) } fn bool_equal(lhs: BoolTensor, rhs: BoolTensor) -> BoolTensor { - kernel::equal::(lhs, rhs) + kernel::equal::(lhs, rhs) } fn bool_not(tensor: BoolTensor) -> BoolTensor { - kernel::equal_elem::(tensor, 0) + kernel::equal_elem::(tensor, BT::false_val()) } fn bool_into_float(tensor: BoolTensor) -> FloatTensor { - kernel::bool_cast::(tensor) + kernel::bool_cast::(tensor) } fn bool_swap_dims(mut tensor: BoolTensor, dim1: usize, dim2: usize) -> BoolTensor { @@ -72,7 +73,7 @@ where } fn bool_repeat_dim(tensor: BoolTensor, dim: usize, times: usize) -> 
BoolTensor { - kernel::repeat_dim::(tensor, dim, times) + kernel::repeat_dim::(tensor, dim, times) } fn bool_permute(tensor: BoolTensor, axes: &[usize]) -> BoolTensor { @@ -84,6 +85,6 @@ where } fn bool_flip(tensor: BoolTensor, axes: &[usize]) -> BoolTensor { - kernel::flip::(tensor, axes) + kernel::flip::(tensor, axes) } } diff --git a/crates/burn-jit/src/ops/float_ops.rs b/crates/burn-jit/src/ops/float_ops.rs index 52b013ec0e..f97b1609ff 100644 --- a/crates/burn-jit/src/ops/float_ops.rs +++ b/crates/burn-jit/src/ops/float_ops.rs @@ -1,7 +1,10 @@ use super::{expand, numeric, permute}; -use crate::kernel::matmul::{matmul, MatmulStrategy}; use crate::kernel::prng::{random_bernoulli, random_normal, random_uniform}; use crate::kernel::{self, launch_unary, reduce, unary_op, UnaryOp}; +use crate::{ + element::BoolElement, + kernel::matmul::{matmul, MatmulStrategy}, +}; use crate::{execute_with_dtype, JitBackend}; use crate::{FloatElement, IntElement, JitRuntime}; use burn_tensor::ops::{BoolTensor, Device, FloatElem, FloatTensor, IntTensor}; @@ -11,11 +14,12 @@ use cubecl::prelude::*; use half::{bf16, f16}; use std::ops::Range; -impl FloatTensorOps for JitBackend +impl FloatTensorOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn float_from_data(data: TensorData, device: &Device) -> FloatTensor { super::from_data::(data, device) @@ -248,7 +252,7 @@ where execute_with_dtype!( float(tensor.dtype, value.dtype), E, - kernel::mask_where_auto::(tensor, mask, value) + kernel::mask_where_auto::(tensor, mask, value) ) } @@ -260,7 +264,7 @@ where execute_with_dtype!( float(tensor.dtype), E, - kernel::mask_fill_auto::(tensor, mask, value.elem()) + kernel::mask_fill_auto::(tensor, mask, value.elem()) ) } @@ -268,7 +272,7 @@ where execute_with_dtype!( float(lhs.dtype, rhs.dtype), E, - kernel::equal::(lhs, rhs) + kernel::equal::(lhs, rhs) ) } @@ -276,7 +280,7 @@ where execute_with_dtype!( float(lhs.dtype), E, - kernel::equal_elem::(lhs, rhs.elem()) + kernel::equal_elem::(lhs, rhs.elem()) ) } @@ -284,7 +288,7 @@ where execute_with_dtype!( float(lhs.dtype, rhs.dtype), E, - kernel::greater::(lhs, rhs) + kernel::greater::(lhs, rhs) ) } @@ -292,7 +296,7 @@ where execute_with_dtype!( float(lhs.dtype), E, - kernel::greater_elem::(lhs, rhs.elem()) + kernel::greater_elem::(lhs, rhs.elem()) ) } @@ -300,7 +304,7 @@ where execute_with_dtype!( float(lhs.dtype, rhs.dtype), E, - kernel::greater_equal::(lhs, rhs) + kernel::greater_equal::(lhs, rhs) ) } @@ -308,7 +312,7 @@ where execute_with_dtype!( float(lhs.dtype), E, - kernel::greater_equal_elem::(lhs, rhs.elem()) + kernel::greater_equal_elem::(lhs, rhs.elem()) ) } @@ -316,7 +320,7 @@ where execute_with_dtype!( float(lhs.dtype, rhs.dtype), E, - kernel::lower::(lhs, rhs) + kernel::lower::(lhs, rhs) ) } @@ -324,7 +328,7 @@ where execute_with_dtype!( float(lhs.dtype), E, - kernel::lower_elem::(lhs, rhs.elem()) + kernel::lower_elem::(lhs, rhs.elem()) ) } @@ -332,7 +336,7 @@ where execute_with_dtype!( float(lhs.dtype, rhs.dtype), E, - kernel::lower_equal::(lhs, rhs) + kernel::lower_equal::(lhs, rhs) ) } @@ -340,7 +344,7 @@ where execute_with_dtype!( float(lhs.dtype), E, - kernel::lower_equal_elem::(lhs, rhs.elem()) + kernel::lower_equal_elem::(lhs, rhs.elem()) ) } @@ -633,7 +637,11 @@ where } fn float_flip(tensor: FloatTensor, axes: &[usize]) -> FloatTensor { - execute_with_dtype!(float(tensor.dtype), E, kernel::flip::(tensor, axes)) + execute_with_dtype!( + float(tensor.dtype), + E, + kernel::flip::(tensor, axes) + ) } fn 
float_cast(tensor: FloatTensor, dtype: FloatDType) -> FloatTensor { diff --git a/crates/burn-jit/src/ops/int_ops.rs b/crates/burn-jit/src/ops/int_ops.rs index cb6603bf80..25bb92521f 100644 --- a/crates/burn-jit/src/ops/int_ops.rs +++ b/crates/burn-jit/src/ops/int_ops.rs @@ -1,6 +1,9 @@ use super::{expand, numeric, permute}; -use crate::kernel::prng::{random_bernoulli, random_normal, random_uniform}; use crate::kernel::{launch_unary, unary_op, UnaryOp}; +use crate::{ + element::BoolElement, + kernel::prng::{random_bernoulli, random_normal, random_uniform}, +}; use crate::{kernel, FloatElement, IntElement, JitBackend, JitRuntime}; use burn_tensor::ops::{BoolTensor, Device, FloatTensor, IntElem, IntTensor}; use burn_tensor::{ops::IntTensorOps, Distribution, ElementConversion, Shape, TensorData}; @@ -8,11 +11,12 @@ use cubecl::frontend::Numeric; use cubecl::prelude::*; use std::ops::Range; -impl IntTensorOps for JitBackend +impl IntTensorOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn int_empty(shape: Shape, device: &Device) -> IntTensor { super::empty::(shape, device) @@ -55,7 +59,7 @@ where mask: BoolTensor, value: IntTensor, ) -> IntTensor { - kernel::mask_where_auto::(tensor, mask, value) + kernel::mask_where_auto::(tensor, mask, value) } fn int_mask_fill( @@ -63,7 +67,7 @@ where mask: BoolTensor, value: IntElem, ) -> IntTensor { - kernel::mask_fill_auto(tensor, mask, value) + kernel::mask_fill_auto::(tensor, mask, value) } fn int_gather( @@ -101,43 +105,43 @@ where } fn int_equal(lhs: IntTensor, rhs: IntTensor) -> BoolTensor { - kernel::equal::(lhs, rhs) + kernel::equal::(lhs, rhs) } fn int_equal_elem(lhs: IntTensor, rhs: IntElem) -> BoolTensor { - kernel::equal_elem::(lhs, rhs) + kernel::equal_elem::(lhs, rhs) } fn int_greater(lhs: IntTensor, rhs: IntTensor) -> BoolTensor { - kernel::greater::(lhs, rhs) + kernel::greater::(lhs, rhs) } fn int_greater_elem(lhs: IntTensor, rhs: IntElem) -> BoolTensor { - kernel::greater_elem::(lhs, rhs) + kernel::greater_elem::(lhs, rhs) } fn int_greater_equal(lhs: IntTensor, rhs: IntTensor) -> BoolTensor { - kernel::greater_equal::(lhs, rhs) + kernel::greater_equal::(lhs, rhs) } fn int_greater_equal_elem(lhs: IntTensor, rhs: IntElem) -> BoolTensor { - kernel::greater_equal_elem::(lhs, rhs) + kernel::greater_equal_elem::(lhs, rhs) } fn int_lower(lhs: IntTensor, rhs: IntTensor) -> BoolTensor { - kernel::lower::(lhs, rhs) + kernel::lower::(lhs, rhs) } fn int_lower_elem(lhs: IntTensor, rhs: IntElem) -> BoolTensor { - kernel::lower_elem::(lhs, rhs) + kernel::lower_elem::(lhs, rhs) } fn int_lower_equal(lhs: IntTensor, rhs: IntTensor) -> BoolTensor { - kernel::lower_equal::(lhs, rhs) + kernel::lower_equal::(lhs, rhs) } fn int_lower_equal_elem(lhs: IntTensor, rhs: IntElem) -> BoolTensor { - kernel::lower_equal_elem::(lhs, rhs) + kernel::lower_equal_elem::(lhs, rhs) } fn int_add(lhs: IntTensor, rhs: IntTensor) -> IntTensor { @@ -277,6 +281,6 @@ where } fn int_flip(tensor: IntTensor, axes: &[usize]) -> IntTensor { - kernel::flip::(tensor, axes) + kernel::flip::(tensor, axes) } } diff --git a/crates/burn-jit/src/ops/module_ops.rs b/crates/burn-jit/src/ops/module_ops.rs index 5539dfc9f2..b5c96058f9 100644 --- a/crates/burn-jit/src/ops/module_ops.rs +++ b/crates/burn-jit/src/ops/module_ops.rs @@ -1,4 +1,5 @@ use crate::{ + element::BoolElement, kernel::{ self, conv::{Conv2dStrategy, ConvTranspose2dStrategy}, @@ -11,11 +12,12 @@ use burn_tensor::ops::{ }; use burn_tensor::ops::{FloatTensor, IntTensor}; -impl ModuleOps for 
JitBackend +impl ModuleOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn conv2d( x: FloatTensor, @@ -23,7 +25,7 @@ where bias: Option>, options: ConvOptions<2>, ) -> FloatTensor { - kernel::conv::conv2d::(x, weight, bias, options, Conv2dStrategy::default()) + kernel::conv::conv2d::(x, weight, bias, options, Conv2dStrategy::default()) } fn deform_conv2d( @@ -34,7 +36,7 @@ where bias: Option>, options: DeformConvOptions<2>, ) -> FloatTensor { - kernel::conv::deform_conv2d::(x, offset, weight, mask, bias, options) + kernel::conv::deform_conv2d::(x, offset, weight, mask, bias, options) } fn deform_conv2d_backward( @@ -46,7 +48,7 @@ where output_grad: FloatTensor, options: DeformConvOptions<2>, ) -> DeformConv2dBackward { - kernel::conv::deform_conv2d_backward::( + kernel::conv::deform_conv2d_backward::( x, offset, weight, diff --git a/crates/burn-jit/src/ops/qtensor.rs b/crates/burn-jit/src/ops/qtensor.rs index e5eb4005a6..94b1a6f2ee 100644 --- a/crates/burn-jit/src/ops/qtensor.rs +++ b/crates/burn-jit/src/ops/qtensor.rs @@ -9,6 +9,7 @@ use burn_tensor::{ }; use crate::{ + element::BoolElement, kernel, tensor::{JitQuantizationParameters, JitTensor, QJitTensor}, FloatElement, IntElement, JitBackend, JitRuntime, @@ -27,11 +28,12 @@ fn packed_tensor>( JitTensor::new_contiguous(client, device.clone(), shape.into(), buffer, DType::U32) } -impl QTensorOps for JitBackend +impl QTensorOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn q_from_data(data: TensorData, device: &Device) -> QuantizedTensor { match data.dtype { diff --git a/crates/burn-jit/src/ops/transaction.rs b/crates/burn-jit/src/ops/transaction.rs index 62477d3ce1..7320186570 100644 --- a/crates/burn-jit/src/ops/transaction.rs +++ b/crates/burn-jit/src/ops/transaction.rs @@ -3,13 +3,14 @@ use burn_tensor::{ DType, TensorData, }; -use crate::{FloatElement, IntElement, JitBackend, JitRuntime}; +use crate::{element::BoolElement, FloatElement, IntElement, JitBackend, JitRuntime}; -impl TransactionOps for JitBackend +impl TransactionOps for JitBackend where R: JitRuntime, F: FloatElement, I: IntElement, + BT: BoolElement, { fn tr_execute( transaction: burn_tensor::ops::TransactionPrimitive, @@ -51,7 +52,7 @@ where client = Some(t.client.clone()); } - kinds.push(Kind::Bool(num_bindings, t.shape.into(), DType::U32)); + kinds.push(Kind::Bool(num_bindings, t.shape.into(), BT::dtype())); num_bindings += 1; bindings.push(t.handle.binding()) }); @@ -64,7 +65,7 @@ where .await .into_iter() .map(Some) - .collect::>(); + .collect::>>(); let mut result = TransactionPrimitiveResult::default(); diff --git a/crates/burn-jit/src/tensor/base.rs b/crates/burn-jit/src/tensor/base.rs index 112260c3bf..3eb44b3e02 100644 --- a/crates/burn-jit/src/tensor/base.rs +++ b/crates/burn-jit/src/tensor/base.rs @@ -162,9 +162,9 @@ macro_rules! 
execute_with_dtype { type $element = i8; $op } - // NOTE: bool and qfloat dtypes are actually represented as u32 + // NOTE: bool and qfloat dtypes are actually represented as u32/u8 // burn_tensor::DType::Bool => { - // type $element = u32; + // type $element = u32/u8; // $op // } // burn_tensor::DType::QFloat(_) => { diff --git a/crates/burn-jit/src/tensor/qtensor.rs b/crates/burn-jit/src/tensor/qtensor.rs index fdf7068e1a..4ef5f77589 100644 --- a/crates/burn-jit/src/tensor/qtensor.rs +++ b/crates/burn-jit/src/tensor/qtensor.rs @@ -6,7 +6,9 @@ use burn_tensor::{ read_sync, DType, TensorData, TensorMetadata, }; -use crate::{ops::into_data, FloatElement, IntElement, JitBackend, JitRuntime}; +use crate::{ + element::BoolElement, ops::into_data, FloatElement, IntElement, JitBackend, JitRuntime, +}; use super::JitTensor; @@ -96,10 +98,11 @@ impl Clone for JitQuantizationParameters { } } -impl - From>> for JitQuantizationParameters +impl + From>> + for JitQuantizationParameters { - fn from(value: QuantizationParametersPrimitive>) -> Self { + fn from(value: QuantizationParametersPrimitive>) -> Self { JitQuantizationParameters { scale: value.scale, offset: value.offset, diff --git a/crates/burn-jit/src/tests/mask_fill.rs b/crates/burn-jit/src/tests/mask_fill.rs index 4542bbe3f1..c768373d13 100644 --- a/crates/burn-jit/src/tests/mask_fill.rs +++ b/crates/burn-jit/src/tests/mask_fill.rs @@ -11,6 +11,7 @@ mod tests { let actual = Tensor::::from_primitive(TensorPrimitive::Float(mask_fill::< _, ::FloatElem, + ::BoolElem, >( tensor.into_primitive().tensor(), mask.into_primitive(), @@ -31,6 +32,7 @@ mod tests { let actual = Tensor::::from_primitive(TensorPrimitive::Float(mask_fill::< _, ::FloatElem, + ::BoolElem, >( tensor.into_primitive().tensor(), mask.into_primitive(), diff --git a/crates/burn-jit/src/tests/mask_where.rs b/crates/burn-jit/src/tests/mask_where.rs index befdb76af6..a14993995c 100644 --- a/crates/burn-jit/src/tests/mask_where.rs +++ b/crates/burn-jit/src/tests/mask_where.rs @@ -23,6 +23,7 @@ mod tests { Tensor::::from_primitive(TensorPrimitive::Float(mask_where::< _, ::FloatElem, + ::BoolElem, >( tensor.into_primitive().tensor(), mask.into_primitive(), @@ -44,6 +45,7 @@ mod tests { Tensor::::from_primitive(TensorPrimitive::Float(mask_where::< _, ::FloatElem, + ::BoolElem, >( tensor.into_primitive().tensor(), mask.into_primitive(), diff --git a/crates/burn-jit/src/tests/mod.rs b/crates/burn-jit/src/tests/mod.rs index b1ee4ce26d..f60edc2a1b 100644 --- a/crates/burn-jit/src/tests/mod.rs +++ b/crates/burn-jit/src/tests/mod.rs @@ -38,12 +38,12 @@ pub use serial_test; #[macro_export] macro_rules! testgen_all { () => { - use burn_tensor::{Float, Int}; - $crate::testgen_all!([Float], [Int]); + use burn_tensor::{Float, Int, Bool}; + $crate::testgen_all!([Float], [Int], [Bool]); }; - ([$($float:ident),*], [$($int:ident),*]) => { + ([$($float:ident),*], [$($int:ident),*], [$($bool:ident),*]) => { mod jit { - burn_jit::testgen_jit!([$($float),*], [$($int),*]); + burn_jit::testgen_jit!([$($float),*], [$($int),*], [$($bool),*]); mod kernel { use super::*; @@ -84,7 +84,7 @@ macro_rules! testgen_all { } } mod jit_fusion { - burn_jit::testgen_jit_fusion!([$($float),*], [$($int),*]); + burn_jit::testgen_jit_fusion!([$($float),*], [$($int),*], [$($bool),*]); } }; } @@ -92,31 +92,31 @@ macro_rules! testgen_all { #[macro_export] macro_rules! 
testgen_jit { () => { - use burn_tensor::{Float, Int}; - $crate::testgen_jit!([Float], [Int]); + use burn_tensor::{Float, Int, Bool}; + $crate::testgen_jit!([Float], [Int], [Bool]); }; - ([$($float:ident),*], [$($int:ident),*]) => { + ([$($float:ident),*], [$($int:ident),*], [$($bool:ident),*]) => { pub use super::*; use burn_jit::tests::{burn_autodiff, burn_ndarray, burn_tensor, serial_test}; - pub type TestBackend = JitBackend; - pub type TestBackend2 = JitBackend; + pub type TestBackend = JitBackend; + pub type TestBackend2 = JitBackend; pub type ReferenceBackend = burn_ndarray::NdArray; pub type TestTensor = burn_tensor::Tensor; - pub type TestTensor2 = burn_tensor::Tensor, D>; + pub type TestTensor2 = burn_tensor::Tensor, D>; pub type TestTensorInt = burn_tensor::Tensor; - pub type TestTensorInt2 = - burn_tensor::Tensor, D, burn_tensor::Int>; + pub type TestTensorInt2 = + burn_tensor::Tensor, D, burn_tensor::Int>; pub type TestTensorBool = burn_tensor::Tensor; - pub type TestTensorBool2 = - burn_tensor::Tensor, D, burn_tensor::Bool>; + pub type TestTensorBool2 = + burn_tensor::Tensor, D, burn_tensor::Bool>; pub type ReferenceTensor = burn_tensor::Tensor; - burn_tensor::testgen_all!([$($float),*], [$($int),*]); + burn_tensor::testgen_all!([$($float),*], [$($int),*], [$($bool),*]); burn_autodiff::testgen_all!([$($float),*]); // Not all ops are implemented for quantization yet, notably missing: @@ -135,28 +135,28 @@ macro_rules! testgen_jit_fusion { use burn_tensor::{Float, Int}; $crate::testgen_jit_fusion!([Float], [Int]); }; - ([$($float:ident),*], [$($int:ident),*]) => { + ([$($float:ident),*], [$($int:ident),*], [$($bool:ident),*]) => { use super::*; use burn_jit::tests::{burn_autodiff, burn_fusion, burn_ndarray, burn_tensor}; - pub type TestBackend = burn_fusion::Fusion>; - pub type TestBackend2 = burn_fusion::Fusion>; + pub type TestBackend = burn_fusion::Fusion>; + pub type TestBackend2 = burn_fusion::Fusion>; pub type ReferenceBackend = burn_ndarray::NdArray; pub type TestTensor = burn_tensor::Tensor; - pub type TestTensor2 = burn_tensor::Tensor, D>; + pub type TestTensor2 = burn_tensor::Tensor, D>; pub type TestTensorInt = burn_tensor::Tensor; - pub type TestTensorInt2 = - burn_tensor::Tensor, D, burn_tensor::Int>; + pub type TestTensorInt2 = + burn_tensor::Tensor, D, burn_tensor::Int>; pub type TestTensorBool = burn_tensor::Tensor; - pub type TestTensorBool2 = - burn_tensor::Tensor, D, burn_tensor::Bool>; + pub type TestTensorBool2 = + burn_tensor::Tensor, D, burn_tensor::Bool>; pub type ReferenceTensor = burn_tensor::Tensor; - burn_tensor::testgen_all!([$($float),*], [$($int),*]); + burn_tensor::testgen_all!([$($float),*], [$($int),*], [$($bool),*]); burn_autodiff::testgen_all!([$($float),*]); // Not all ops are implemented for quantization yet, notably missing: diff --git a/crates/burn-ndarray/src/backend.rs b/crates/burn-ndarray/src/backend.rs index 74957f5f1f..060899b979 100644 --- a/crates/burn-ndarray/src/backend.rs +++ b/crates/burn-ndarray/src/backend.rs @@ -53,6 +53,7 @@ impl Backend for type IntElem = I; type BoolTensorPrimitive = NdArrayTensor; + type BoolElem = bool; type QuantizedTensorPrimitive = NdArrayQTensor; type QuantizedEncoding = Q; diff --git a/crates/burn-remote/src/client/channel.rs b/crates/burn-remote/src/client/channel.rs index 6c431702af..d7102dd97a 100644 --- a/crates/burn-remote/src/client/channel.rs +++ b/crates/burn-remote/src/client/channel.rs @@ -19,6 +19,8 @@ impl RunnerChannel for WsChannel { type IntElem = i32; + type BoolElem = u32; + fn 
name() -> String { "remote".into() } diff --git a/crates/burn-router/src/backend.rs b/crates/burn-router/src/backend.rs index 6fcf80ce3d..a5ada5e5fd 100644 --- a/crates/burn-router/src/backend.rs +++ b/crates/burn-router/src/backend.rs @@ -55,6 +55,8 @@ impl Backend for BackendRouter { type BoolTensorPrimitive = RouterTensor; + type BoolElem = R::BoolElem; + type QuantizedTensorPrimitive = RouterTensor; type QuantizedEncoding = u32; diff --git a/crates/burn-router/src/channel/base.rs b/crates/burn-router/src/channel/base.rs index 876d273f62..887190ecfa 100644 --- a/crates/burn-router/src/channel/base.rs +++ b/crates/burn-router/src/channel/base.rs @@ -18,6 +18,8 @@ pub trait RunnerChannel: Clone + Send + Sync + 'static + Sized { type FloatElem: Element; /// Int element type. type IntElem: Element; + /// Bool element type. + type BoolElem: Element; /// Name of the channel. fn name() -> String; diff --git a/crates/burn-router/src/types.rs b/crates/burn-router/src/types.rs index 3b694d8779..f36e436638 100644 --- a/crates/burn-router/src/types.rs +++ b/crates/burn-router/src/types.rs @@ -206,6 +206,7 @@ macro_rules! impl_multi_backend_types { type FloatElem = $DefaultBackend::FloatElem; type IntElem = $DefaultBackend::IntElem; + type BoolElem = $DefaultBackend::BoolElem; type Client = MultiRunnerClient<$DefaultBackend, $($OtherBackend),+>; diff --git a/crates/burn-tch/src/backend.rs b/crates/burn-tch/src/backend.rs index cef9a8d586..c294ae0025 100644 --- a/crates/burn-tch/src/backend.rs +++ b/crates/burn-tch/src/backend.rs @@ -101,6 +101,7 @@ impl Backend for LibTorch { type IntElem = i64; type BoolTensorPrimitive = TchTensor; + type BoolElem = bool; type QuantizedTensorPrimitive = TchQTensor; type QuantizedEncoding = Q; diff --git a/crates/burn-tensor/src/tensor/backend/base.rs b/crates/burn-tensor/src/tensor/backend/base.rs index f951a9b6f3..973f4ede65 100644 --- a/crates/burn-tensor/src/tensor/backend/base.rs +++ b/crates/burn-tensor/src/tensor/backend/base.rs @@ -83,6 +83,8 @@ pub trait Backend: /// Tensor primitive to be used for all bool operations. type BoolTensorPrimitive: TensorMetadata + 'static; + /// Tensor primitive to be used for all bool operations. + type BoolElem: Element; /// Tensor primitive to be used for all quantized operations. type QuantizedTensorPrimitive: TensorMetadata + QTensorPrimitive + 'static; diff --git a/crates/burn-tensor/src/tests/mod.rs b/crates/burn-tensor/src/tests/mod.rs index 5b15a45591..8aa41ee24d 100644 --- a/crates/burn-tensor/src/tests/mod.rs +++ b/crates/burn-tensor/src/tests/mod.rs @@ -17,28 +17,28 @@ macro_rules! testgen_all { pub type FloatType = ::FloatElem; pub type IntType = ::IntElem; - pub type BoolType = ::BoolTensorPrimitive; + pub type BoolType = ::BoolElem; $crate::testgen_with_float_param!(); $crate::testgen_no_param!(); } }; - ([$($float:ident),*], [$($int:ident),*]) => { + ([$($float:ident),*], [$($int:ident),*], [$($bool:ident),*]) => { pub mod tensor { pub use super::*; pub type FloatType = ::FloatElem; pub type IntType = ::IntElem; - pub type BoolType = ::BoolTensorPrimitive; + pub type BoolType = ::BoolElem; ::paste::paste! 
{ $(mod [<$float _ty>] { pub use super::*; - pub type TestBackend = TestBackend2<$float, IntType>; - pub type TestTensor = TestTensor2<$float, IntType, D>; - pub type TestTensorInt = TestTensorInt2<$float, IntType, D>; - pub type TestTensorBool = TestTensorBool2<$float, IntType, D>; + pub type TestBackend = TestBackend2<$float, IntType, BoolType>; + pub type TestTensor = TestTensor2<$float, IntType, BoolType, D>; + pub type TestTensorInt = TestTensorInt2<$float, IntType, BoolType, D>; + pub type TestTensorBool = TestTensorBool2<$float, IntType, BoolType, D>; pub type FloatType = $float; @@ -47,13 +47,25 @@ macro_rules! testgen_all { $(mod [<$int _ty>] { pub use super::*; - pub type TestBackend = TestBackend2; - pub type TestTensor = TestTensor2; - pub type TestTensorInt = TestTensorInt2; - pub type TestTensorBool = TestTensorBool2; + pub type TestBackend = TestBackend2; + pub type TestTensor = TestTensor2; + pub type TestTensorInt = TestTensorInt2; + pub type TestTensorBool = TestTensorBool2; pub type IntType = $int; + $crate::testgen_with_int_param!(); + })* + $(mod [<$bool _bool_ty>] { + pub use super::*; + + pub type TestBackend = TestBackend2; + pub type TestTensor = TestTensor2; + pub type TestTensorInt = TestTensorInt2; + pub type TestTensorBool = TestTensorBool2; + + pub type BoolType = $bool; + $crate::testgen_with_int_param!(); })* } @@ -307,6 +319,29 @@ macro_rules! testgen_with_int_param { }; } +#[allow(missing_docs)] +#[macro_export] +macro_rules! testgen_with_bool_param { + () => { + burn_tensor::testgen_all_op!(); + burn_tensor::testgen_any_op!(); + burn_tensor::testgen_argwhere_nonzero!(); + burn_tensor::testgen_cast!(); + burn_tensor::testgen_cat!(); + burn_tensor::testgen_expand!(); + burn_tensor::testgen_full!(); + burn_tensor::testgen_map_comparison!(); + burn_tensor::testgen_mask!(); + burn_tensor::testgen_nan!(); + burn_tensor::testgen_repeat_dim!(); + burn_tensor::testgen_repeat!(); + burn_tensor::testgen_reshape!(); + burn_tensor::testgen_stack!(); + burn_tensor::testgen_transpose!(); + burn_tensor::tri_mask!(); + }; +} + #[allow(missing_docs)] #[macro_export] macro_rules! 
testgen_no_param { diff --git a/crates/burn-tensor/src/tests/ops/remainder.rs b/crates/burn-tensor/src/tests/ops/remainder.rs index fa75630fe8..996c71a7b7 100644 --- a/crates/burn-tensor/src/tests/ops/remainder.rs +++ b/crates/burn-tensor/src/tests/ops/remainder.rs @@ -67,7 +67,7 @@ mod tests { fn should_be_zero() { let device = Default::default(); let lhs = Tensor::::from_data(TensorData::from([0.0, 0.0, 0.0]), &device); - let rhs = Tensor::::from_data(TensorData::from([3.5, -2.1, 1e-5]), &device); + let rhs = Tensor::::from_data(TensorData::from([3.5, -2.1, 1e-4]), &device); let output = lhs.remainder(rhs); let expected = TensorData::from([0.0, 0.0, 0.0]); diff --git a/crates/burn-wgpu/src/lib.rs b/crates/burn-wgpu/src/lib.rs index 7c26dcc31b..0751ad9f41 100644 --- a/crates/burn-wgpu/src/lib.rs +++ b/crates/burn-wgpu/src/lib.rs @@ -10,7 +10,7 @@ pub use burn_jit::{ }; pub use burn_jit::{tensor::JitTensor, JitBackend}; -pub use burn_jit::{FloatElement, IntElement}; +pub use burn_jit::{BoolElement, FloatElement, IntElement}; pub use cubecl::flex32; pub use cubecl::ir::CubeDim; pub use cubecl::wgpu::*; @@ -21,8 +21,12 @@ pub type SpirV = cubecl::wgpu::spirv::VkSpirvCompiler; #[cfg(feature = "spirv")] type Compiler = SpirV; +#[cfg(feature = "spirv")] +type Byte = u8; #[cfg(not(feature = "spirv"))] type Compiler = Wgsl; +#[cfg(not(feature = "spirv"))] +type Bool = u32; #[cfg(feature = "fusion")] /// Tensor backend that uses the wgpu crate for executing GPU compute shaders. @@ -56,8 +60,8 @@ type Compiler = Wgsl; /// /// You can disable the `fusion` feature flag to remove that functionality, which might be /// necessary on `wasm` for now. -pub type Wgpu = - burn_fusion::Fusion, F, I>>; +pub type Wgpu = + burn_fusion::Fusion, F, I, B>>; #[cfg(not(feature = "fusion"))] /// Tensor backend that uses the wgpu crate for executing GPU compute shaders. @@ -91,7 +95,8 @@ pub type Wgpu = /// /// You can enable the `fusion` feature flag to add that functionality, which might improve /// performance. 
-pub type Wgpu = JitBackend, F, I>; +pub type Wgpu = + JitBackend, F, I, B>; #[cfg(test)] mod tests { @@ -103,7 +108,7 @@ mod tests { // Don't test `flex32` for now, burn sees it as `f32` but is actually `f16` precision, so it // breaks a lot of tests from precision issues #[cfg(feature = "spirv")] - burn_jit::testgen_all!([f16, f32], [i8, i16, i32, i64]); + burn_jit::testgen_all!([f16, f32], [i8, i16, i32, i64], [u8, u32]); #[cfg(not(feature = "spirv"))] - burn_jit::testgen_all!([f32], [i32]); + burn_jit::testgen_all!([f32], [i32], [u32]); } diff --git a/examples/custom-cubecl-kernel/examples/custom-cubecl-kernel.rs b/examples/custom-cubecl-kernel/examples/custom-cubecl-kernel.rs index 6cb9e6a6ae..de6bfcc7d4 100644 --- a/examples/custom-cubecl-kernel/examples/custom-cubecl-kernel.rs +++ b/examples/custom-cubecl-kernel/examples/custom-cubecl-kernel.rs @@ -71,7 +71,7 @@ fn autodiff(device: &B::Device) { } fn main() { - type MyBackend = burn::backend::wgpu::JitBackend; + type MyBackend = burn::backend::wgpu::JitBackend; type MyAutodiffBackend = burn::backend::Autodiff; let device = Default::default(); inference::(&device); diff --git a/examples/custom-cubecl-kernel/src/backward.rs b/examples/custom-cubecl-kernel/src/backward.rs index 3c66ae8e0e..a894f4e446 100644 --- a/examples/custom-cubecl-kernel/src/backward.rs +++ b/examples/custom-cubecl-kernel/src/backward.rs @@ -10,10 +10,10 @@ use burn::{ }, tensor::{Shape, TensorMetadata}, }; -use burn_jit::{FloatElement, IntElement, JitBackend, JitRuntime}; +use burn_jit::{element::BoolElement, FloatElement, IntElement, JitBackend, JitRuntime}; -impl AutodiffBackend - for Autodiff> +impl AutodiffBackend + for Autodiff> { } diff --git a/examples/custom-cubecl-kernel/src/forward.rs b/examples/custom-cubecl-kernel/src/forward.rs index a8bf17fcd7..0e180e231a 100644 --- a/examples/custom-cubecl-kernel/src/forward.rs +++ b/examples/custom-cubecl-kernel/src/forward.rs @@ -3,12 +3,15 @@ use crate::{kernel::fused_matmul_add_relu_kernel, FloatTensor}; use super::Backend; use burn::tensor::Shape; use burn_jit::{ - kernel::into_contiguous, tensor::JitTensor, FloatElement, IntElement, JitBackend, JitRuntime, + element::BoolElement, kernel::into_contiguous, tensor::JitTensor, FloatElement, IntElement, + JitBackend, JitRuntime, }; use cubecl::{CubeCount, CubeDim}; /// Implement our custom backend trait for the generic `JitBackend`. 
-impl Backend for JitBackend { +impl Backend + for JitBackend +{ fn fused_matmul_add_relu( lhs: FloatTensor, rhs: FloatTensor, diff --git a/examples/custom-wgpu-kernel/examples/custom-wgpu-kernel.rs b/examples/custom-wgpu-kernel/examples/custom-wgpu-kernel.rs index a309ea5716..0c2201080e 100644 --- a/examples/custom-wgpu-kernel/examples/custom-wgpu-kernel.rs +++ b/examples/custom-wgpu-kernel/examples/custom-wgpu-kernel.rs @@ -71,7 +71,7 @@ fn autodiff(device: &B::Device) { } fn main() { - type MyBackend = burn::backend::wgpu::JitBackend; + type MyBackend = burn::backend::wgpu::JitBackend; type MyAutodiffBackend = burn::backend::Autodiff; let device = Default::default(); inference::(&device); diff --git a/examples/custom-wgpu-kernel/src/backward.rs b/examples/custom-wgpu-kernel/src/backward.rs index b9032413bc..eb374d6c10 100644 --- a/examples/custom-wgpu-kernel/src/backward.rs +++ b/examples/custom-wgpu-kernel/src/backward.rs @@ -9,12 +9,15 @@ use burn::{ ops::{broadcast_shape, Backward, Ops, OpsKind}, Autodiff, NodeID, }, - wgpu::{FloatElement, IntElement, JitBackend, WgpuRuntime}, + wgpu::{BoolElement, FloatElement, IntElement, JitBackend, WgpuRuntime}, }, tensor::{Shape, TensorMetadata}, }; -impl AutodiffBackend for Autodiff> {} +impl AutodiffBackend + for Autodiff> +{ +} // Implement our custom backend trait for any backend that also implements our custom backend trait. // diff --git a/examples/custom-wgpu-kernel/src/forward.rs b/examples/custom-wgpu-kernel/src/forward.rs index c8476230d2..e257e13bf0 100644 --- a/examples/custom-wgpu-kernel/src/forward.rs +++ b/examples/custom-wgpu-kernel/src/forward.rs @@ -3,8 +3,8 @@ use crate::FloatTensor; use super::Backend; use burn::{ backend::wgpu::{ - build_info, into_contiguous, kernel_source, FloatElement, IntElement, JitBackend, - JitTensor, KernelSource, SourceKernel, SourceTemplate, WgpuRuntime, + build_info, into_contiguous, kernel_source, BoolElement, FloatElement, IntElement, + JitBackend, JitTensor, KernelSource, SourceKernel, SourceTemplate, WgpuRuntime, }, tensor::Shape, }; @@ -41,7 +41,9 @@ impl KernelSource for FusedMatmulAddRelu { } /// Implement our custom backend trait for the existing backend `WgpuBackend`. -impl Backend for JitBackend { +impl Backend + for JitBackend +{ fn fused_matmul_add_relu( lhs: FloatTensor, rhs: FloatTensor,
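// The mask-where kernels in this diff drop the `if mask[...] != Line::new(reverse)`
// branch and write every element through `select(cond, value, input)` instead,
// which keeps the kernel branch-free and lets it operate on vectorized lines.
// Below is a minimal CPU reference for the same semantics, using illustrative
// names only (this is not burn-jit's API): `reverse = true` corresponds to
// MaskWhereStrategy::InplaceRhs, where `input` and `value` are swapped and the
// mask condition is inverted.
fn mask_where_ref<E: Copy>(input: &mut [E], mask: &[u8], value: &[E], reverse: bool) {
    assert_eq!(input.len(), mask.len());
    assert_eq!(input.len(), value.len());
    for i in 0..input.len() {
        // Equivalent of `select(mask != reverse, value[i], input[i])`:
        // the store happens unconditionally, only the selected source changes.
        let take_value = (mask[i] != 0) != reverse;
        input[i] = if take_value { value[i] } else { input[i] };
    }
}

#[test]
fn mask_where_ref_matches_expected() {
    let mut input = [1.0f32, 2.0, 3.0, 4.0];
    let value = [10.0f32, 20.0, 30.0, 40.0];
    let mask = [1u8, 0, 1, 0];
    mask_where_ref(&mut input, &mask, &value, false);
    assert_eq!(input, [10.0, 2.0, 30.0, 4.0]);
}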
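// The same launches switch from a hard-coded vectorization factor of 1 to
// `max_vectorization(&input)`. Conceptually this asks the runtime for the
// widest supported line width that evenly divides the tensor's contiguous
// inner extent; the function below is a simplified stand-in under that
// assumption, not cubecl's actual implementation.
fn max_vectorization_sketch(inner_len: usize, supported_widths: &[usize]) -> usize {
    supported_widths
        .iter()
        .copied()
        .filter(|w| inner_len % w == 0)
        .max()
        .unwrap_or(1)
}

#[test]
fn picks_widest_dividing_width() {
    assert_eq!(max_vectorization_sketch(12, &[1, 2, 4, 8]), 4);
    assert_eq!(max_vectorization_sketch(7, &[1, 2, 4, 8]), 1);
}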
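// `bool_into_data` now reinterprets the raw device bytes as the backend's bool
// element `BT` and maps `element != BT::false_val()` to a Rust `bool`, instead
// of assuming `u32`. Likewise, `bool_not` becomes `equal_elem(tensor, BT::false_val())`,
// so any non-false encoding counts as true. A minimal sketch of both steps for
// the two representations appearing in this diff (u32 and u8); helper names are
// illustrative, not the crate's API.
fn decode_bools_u32(bytes: &[u8]) -> Vec<bool> {
    bytes
        .chunks_exact(4)
        .map(|c| u32::from_ne_bytes([c[0], c[1], c[2], c[3]]) != 0)
        .collect()
}

fn decode_bools_u8(bytes: &[u8]) -> Vec<bool> {
    bytes.iter().map(|&b| b != 0).collect()
}

// Logical NOT expressed as "equals the false value", mirroring bool_not above.
fn bool_not_ref(elems: &[u32], false_val: u32) -> Vec<bool> {
    elems.iter().map(|&e| e == false_val).collect()
}

#[test]
fn decoding_is_consistent_across_representations() {
    let logical = [true, false, true, false];
    let as_u32: Vec<u8> = logical
        .iter()
        .flat_map(|&b| (b as u32).to_ne_bytes())
        .collect();
    let as_u8: Vec<u8> = logical.iter().map(|&b| b as u8).collect();
    assert_eq!(decode_bools_u32(&as_u32), logical);
    assert_eq!(decode_bools_u8(&as_u8), logical);
    assert_eq!(bool_not_ref(&[1, 0, 1, 0], 0), [false, true, false, true]);
}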
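// Consolidated view of how the bool element is picked for the wgpu backend in
// crates/burn-wgpu/src/lib.rs above: the non-SPIR-V (WGSL) path keeps u32,
// since WGSL has no 8-bit integer type, while the SPIR-V path can use u8.
// Illustrative stand-in with a hypothetical `spirv` feature flag mirroring
// that cfg-gating; the alias name here is not the crate's identifier.
#[cfg(feature = "spirv")]
type DefaultBool = u8;
#[cfg(not(feature = "spirv"))]
type DefaultBool = u32;

fn bytes_per_bool_element() -> usize {
    core::mem::size_of::<DefaultBool>()
}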
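// Finally, the testgen macros gain a third bracketed list for bool element
// types, so the shared tensor and autodiff suites are instantiated once per
// bool representation in addition to each float and int type. The invocations
// from the wgpu tests in this diff, kept here as comments because the macros
// only expand inside the crates' test modules:
//
//     #[cfg(feature = "spirv")]
//     burn_jit::testgen_all!([f16, f32], [i8, i16, i32, i64], [u8, u32]);
//     #[cfg(not(feature = "spirv"))]
//     burn_jit::testgen_all!([f32], [i32], [u32]);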