diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
index 60355b8f0db0..daef47e2ee96 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/ConvertToROCDL.cpp
@@ -75,18 +75,38 @@ static void populateConvertGPUToAMDGPUPatterns(RewritePatternSet &patterns) {
 
 } // namespace
 
+template <typename... Ts>
+static bool containsAPred(Type type) {
+  type = getElementTypeOrSelf(type);
+  return llvm::isa<Ts...>(type);
+}
+
 // Function to check valid data types on the ROCm backend.
-static LogicalResult validateDataTypes(Operation *op) {
-  auto operandTypes = llvm::to_vector(op->getOperandTypes());
-  auto resultTypes = llvm::to_vector(op->getResultTypes());
-  if (llvm::any_of(llvm::concat<Type>(operandTypes, resultTypes),
-                   llvm::IsaPred<Float8E5M2Type, Float8E4M3FNType>)) {
-    op->emitOpError()
-        << "F8E5M2 and F8E4M3FN types are not supported on "
-           "the ROCm backend; try F8E5M2FNUZ or F8E4M3FNUZ instead.";
-    return failure();
+// Note to readers: different chips take different FP8 formats but re-use the
+// same instruction and intrinsic names, so we must filter out the "wrong" FP8
+// here.
+static LogicalResult validateDataTypes(Operation *op,
+                                       const amdgpu::Chipset &chipset) {
+  constexpr amdgpu::Chipset kGfx942 = amdgpu::Chipset(9, 4, 2);
+  if (!amdgpu::hasOcpFp8(chipset)) {
+    auto pred = containsAPred<Float8E5M2Type, Float8E4M3FNType>;
+    if (llvm::any_of(op->getOperandTypes(), pred) ||
+        llvm::any_of(op->getResultTypes(), pred)) {
+      return op->emitOpError("F8E5M2 and F8E4M3FN types are not supported on "
+                             "gfx942 (MI-300) or older chipsets; try "
+                             "F8E5M2FNUZ or F8E4M3FNUZ instead.");
+    }
   }
+  if (chipset != kGfx942) {
+    auto pred = containsAPred<Float8E5M2FNUZType, Float8E4M3FNUZType>;
+    if (llvm::any_of(op->getOperandTypes(), pred) ||
+        llvm::any_of(op->getResultTypes(), pred)) {
+      return op->emitOpError(
+          "F8E5M2FNUZ and F8E4M3FNUZ types are not supported on non-gfx942 "
+          "(MI-300) chipsets; try F8E5M2 or F8E4M3FN instead.");
+    }
+  }
   return success();
 }
@@ -108,11 +128,6 @@ struct ConvertToROCDLPass final
   void runOnOperation() override {
     ModuleOp m = getOperation();
 
-    m.walk([&](Operation *op) {
-      if (failed(validateDataTypes(op)))
-        return signalPassFailure();
-    });
-
     if (clROCMIndexingBits != 32 && clROCMIndexingBits != 64) {
       m.emitOpError() << "unsupported: ROCm index bit widths must either be "
                          "64 or 32, got "
@@ -152,6 +167,16 @@ struct ConvertToROCDLPass final
       m.emitOpError() << "Invalid chipset name: " << chipset;
       return signalPassFailure();
     }
+    WalkResult allTypesValid = m.walk([&](Operation *op) {
+      if (failed(validateDataTypes(op, *maybeChipset))) {
+        return WalkResult::interrupt();
+      }
+      return WalkResult::advance();
+    });
+    if (allTypesValid.wasInterrupted()) {
+      return signalPassFailure();
+    }
+
     arith::populateArithToAMDGPUConversionPatterns(
         patterns, /*convertFP8Arithmetic=*/true, /*saturateFP8Truncf=*/false,
         /*allowPackedF16Rtz=*/false, /*chipset=*/*maybeChipset);
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
index 77b36bcc116b..6cc6f300627d 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel
@@ -26,6 +26,8 @@ iree_lit_test_suite(
             "config_vector_distribute_reduction_gfx942.mlir",
             "config_user_vector_distribute.mlir",
             "lowering_scalar_dispatch.mlir",
+            "pipeline_elementwise_f8fnuz.mlir",
+            "pipeline_elementwise_f8ocp.mlir",
            "pipeline_igemm_tile_and_fuse.mlir",
"pipeline_tile_and_fuse.mlir", "pipeline_vector_distribute_gfx942.mlir", @@ -39,5 +41,6 @@ iree_lit_test_suite( tools = [ "//tools:iree-opt", "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", ], ) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt index 56891c8f5d93..6732627e1cd8 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt @@ -22,6 +22,8 @@ iree_lit_test_suite( "config_vector_distribute_gfx942.mlir" "config_vector_distribute_reduction_gfx942.mlir" "lowering_scalar_dispatch.mlir" + "pipeline_elementwise_f8fnuz.mlir" + "pipeline_elementwise_f8ocp.mlir" "pipeline_igemm_tile_and_fuse.mlir" "pipeline_tile_and_fuse.mlir" "pipeline_vector_distribute_gfx1100.mlir" @@ -31,6 +33,7 @@ iree_lit_test_suite( TOOLS FileCheck iree-opt + not ) ### BAZEL_TO_CMAKE_PRESERVES_ALL_CONTENT_BELOW_THIS_LINE ### diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8fnuz.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8fnuz.mlir new file mode 100644 index 000000000000..363fdc9ca00d --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8fnuz.mlir @@ -0,0 +1,53 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=CDNA3 +// RUN: not iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" -o /dev/null 2>&1 %s | FileCheck %s --check-prefix=ERRORS +// RUN: not iree-opt --split-input-file --iree-gpu-test-target=gfx1201 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" -o /dev/null 2>&1 %s | FileCheck %s --check-prefix=ERRORS + +#map = affine_map<(d0) -> (d0)> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable @ext_fp8_dispatch { + hal.executable.variant @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export @ext_fp8_dispatch layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @ext_fp8_dispatch() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf8E4M3FNUZ> + %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf8E5M2FNUZ> + %5 = 
tensor.empty() : tensor<4096xf32> + %6 = linalg.generic {indexing_maps = [#map, #map, #map], + iterator_types = ["parallel"]} + ins(%3, %4 : tensor<4096xf8E4M3FNUZ>, tensor<4096xf8E5M2FNUZ>) + outs(%5 : tensor<4096xf32>) { + ^bb0(%in0: f8E4M3FNUZ, %in1: f8E5M2FNUZ, %out: f32): + %7 = arith.extf %in0 : f8E4M3FNUZ to f32 + %8 = arith.extf %in1 : f8E5M2FNUZ to f32 + %9 = arith.addf %7, %8 : f32 + linalg.yield %9 : f32 + } -> tensor<4096xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// ERRORS: F8E5M2FNUZ and F8E4M3FNUZ types are not supported on non-gfx942 (MI-300) chipsets; try F8E5M2 or F8E4M3FN instead. + +// CDNA3-LABEL: hal.executable public @ext_fp8_dispatch +// CDNA3: hal.executable.variant public @rocm +// CDNA3-COUNT-16: rocdl.cvt.f32.fp8 %{{.*}} : f32 +// CDNA3-COUNT-16: rocdl.cvt.f32.bf8 %{{.*}} : f32 +// CDNA3: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<16xf32> +// CDNA3: llvm.store %[[ADD]], %{{.*}} : vector<16xf32>, !llvm.ptr<1> + +// ----- diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8ocp.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8ocp.mlir new file mode 100644 index 000000000000..89a4b9bd7c92 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8ocp.mlir @@ -0,0 +1,53 @@ +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx1201 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" %s | FileCheck %s --check-prefix=RDNA4 +// RUN: not iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" -o /dev/null 2>&1 %s | FileCheck %s --check-prefix=ERRORS +// RUN: not iree-opt --split-input-file --iree-gpu-test-target=gfx908 --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline), iree-codegen-linalg-to-rocdl-pipeline)))" -o /dev/null 2>&1 %s | FileCheck %s --check-prefix=ERRORS + +#map = affine_map<(d0) -> (d0)> +#pipeline_layout = #hal.pipeline.layout, + #hal.pipeline.binding, + #hal.pipeline.binding +]> +hal.executable @ext_fp8_dispatch { + hal.executable.variant @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { + hal.executable.export @ext_fp8_dispatch layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device, %arg1: index, %arg2 : index, %arg3 : index): + %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3 + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @ext_fp8_dispatch() { + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0], sizes = [4096], strides = [1] : !flow.dispatch.tensor> -> tensor<4096xf8E4M3FN> + %4 = flow.dispatch.tensor.load %1, offsets = [0], sizes = [4096], strides = [1] : 
!flow.dispatch.tensor> -> tensor<4096xf8E5M2> + %5 = tensor.empty() : tensor<4096xf32> + %6 = linalg.generic {indexing_maps = [#map, #map, #map], + iterator_types = ["parallel"]} + ins(%3, %4 : tensor<4096xf8E4M3FN>, tensor<4096xf8E5M2>) + outs(%5 : tensor<4096xf32>) { + ^bb0(%in0: f8E4M3FN, %in1: f8E5M2, %out: f32): + %7 = arith.extf %in0 : f8E4M3FN to f32 + %8 = arith.extf %in1 : f8E5M2 to f32 + %9 = arith.addf %7, %8 : f32 + linalg.yield %9 : f32 + } -> tensor<4096xf32> + flow.dispatch.tensor.store %6, %2, offsets = [0], sizes = [4096], strides = [1] : tensor<4096xf32> -> !flow.dispatch.tensor> + return + } + } + } +} + +// ERRORS: F8E5M2 and F8E4M3FN types are not supported on gfx942 (MI-300) or older chipsets; try F8E5M2FNUZ or F8E4M3FNUZ instead. + +// RDNA4-LABEL: hal.executable public @ext_fp8_dispatch +// RDNA4: hal.executable.variant public @rocm +// RDNA4-COUNT-16: rocdl.cvt.f32.fp8 %{{.*}} : f32 +// RDNA4-COUNT-16: rocdl.cvt.f32.bf8 %{{.*}} : f32 +// RDNA4: %[[ADD:.+]] = llvm.fadd %{{.*}}, %{{.*}} : vector<16xf32> +// RDNA4: llvm.store %[[ADD]], %{{.*}} : vector<16xf32>, !llvm.ptr<1> + +// -----
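
The RUN configurations across the two new tests exercise the same decision table that validateDataTypes now encodes: OCP FP8 (f8E4M3FN/f8E5M2) is accepted only where amdgpu::hasOcpFp8 holds, and the FNUZ variants only on gfx942. A minimal standalone sketch of that table, not part of the patch: it reuses the MLIR amdgpu::Chipset utilities the pass already calls (Chipset::parse, hasOcpFp8, the gfx942 equality check), while classify(), main(), and the printed strings are hypothetical scaffolding for illustration.

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include "mlir/Support/LLVM.h"
#include "llvm/Support/raw_ostream.h"

using mlir::amdgpu::Chipset;

// Hypothetical helper mirroring the chipset gating in validateDataTypes:
// OCP FP8 requires hasOcpFp8(chipset); FNUZ FP8 is gfx942-only.
static void classify(llvm::StringRef gfxArch) {
  mlir::FailureOr<Chipset> maybeChipset = Chipset::parse(gfxArch);
  if (mlir::failed(maybeChipset)) {
    llvm::errs() << gfxArch << ": invalid chipset name\n";
    return;
  }
  constexpr Chipset kGfx942 = Chipset(9, 4, 2);
  llvm::outs() << gfxArch << ": ocp-fp8="
               << (mlir::amdgpu::hasOcpFp8(*maybeChipset) ? "ok" : "rejected")
               << ", fnuz-fp8="
               << (*maybeChipset == kGfx942 ? "ok" : "rejected") << "\n";
}

int main() {
  // Per the validation rules above: gfx908 rejects both FP8 families,
  // gfx942 accepts FNUZ only, gfx1201 accepts OCP only.
  for (llvm::StringRef arch : {"gfx908", "gfx942", "gfx1201"})
    classify(arch);
}

Keying the check on hasOcpFp8 rather than an explicit list of architectures means newer targets get classified by the shared MLIR predicate instead of requiring another version comparison in this pass.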