add config test for erf
nirvedhmeshram committed Feb 14, 2025
1 parent bedca73 commit 56e6f82
Showing 2 changed files with 22 additions and 82 deletions.
@@ -219,25 +219,6 @@ module {

// -----

module {
  func.func @elementwise_dynamic_dim_large(%11: tensor<?x512xf16>, %12: tensor<?x512xf16>) -> tensor<?x512xf16> {
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %8 = tensor.dim %11, %c0 : tensor<?x512xf16>
    %13 = tensor.empty(%8) : tensor<?x512xf16>
    %15 = linalg.add ins(%11, %12 : tensor<?x512xf16>, tensor<?x512xf16>) outs(%13 : tensor<?x512xf16>) -> tensor<?x512xf16>
    return %15 : tensor<?x512xf16>
  }
}

// CHECK-LABEL: func.func @elementwise_dynamic_dim_large
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: linalg.add {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: thread = [1, 8]
// CHECK-SAME: workgroup = [1, 512]

// -----

module @elementwise_unaligned {
func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
%cst = arith.constant 0.000000e+00 : f32
@@ -673,3 +654,25 @@ func.func @pack_dynamic_tile(%arg0: tensor<32x32xi8>, %d0: index, %d1: index, %t
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: thread = [1, 4]
// CHECK-SAME: workgroup = [8, 32]

// -----

module {
  func.func @erf(%13 : tensor<2x1024x5120xf16>, %12 : tensor<2x1024x5120xf16>, %9 : tensor<5120xf16>, %10 : tensor<f32>) -> tensor<2x1024x5120xi8> {
    %cst = arith.constant 0.000000e+00 : f16
    %11 = tensor.empty() : tensor<2x1024x5120xi8>
    %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
    ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
      %17 = math.erf %in : f16
      %30 = arith.fptosi %17 : f16 to i8
      linalg.yield %30 : i8
    } -> tensor<2x1024x5120xi8>
    return %14 : tensor<2x1024x5120xi8>
  }
}

// CHECK-LABEL: func.func @erf
// CHECK-SAME: #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64>
// CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
// CHECK-SAME: thread = [1, 1, 8]
// CHECK-SAME: workgroup = [1, 1, 512]
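
For context, config tests like the one above are FileCheck-based lit tests: a RUN line at the top of the file pipes the split inputs through iree-opt and checks the chosen translation_info and lowering_config attributes. The sketch below is illustrative only; the pass pipeline and the gfx942 test target are assumptions, not taken from this commit:

// Hypothetical RUN line for a lowering-config test of this kind
// (pass name and GPU target are assumptions, not part of this commit):
// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \
// RUN:   --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s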
@@ -191,66 +191,3 @@ hal.executable @ceildiv_expand_dispatch {
// CDNA3-COUNT-1: llvm.select {{.*}} : vector<1xi1>, vector<1xi32>

// -----
#pipeline_layout = #hal.pipeline.layout<bindings = [
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>,
  #hal.pipeline.binding<storage_buffer>
]>
hal.executable @erf_dispatch {
  hal.executable.variant @rocm target(<"rocm", "rocm-hsaco-fb">) {
    hal.executable.export @erf layout(#pipeline_layout) {
    ^bb0(%arg0: !hal.device, %arg1: index):
      %x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @erf() {
        %cst = arith.constant 1.270000e+02 : f16
        %cst_0 = arith.constant -1.280000e+02 : f16
        %cst_1 = arith.constant 5.000000e-01 : f16
        %cst_2 = arith.constant 1.000000e+00 : f16
        %cst_3 = arith.constant 2.000000e+00 : f16
        %c0 = arith.constant 0 : index
        %5 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>>
        %6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) : !flow.dispatch.tensor<readonly:tensor<5120xf16>>
        %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) : !flow.dispatch.tensor<readonly:tensor<f32>>
        %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) : !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
        %9 = flow.dispatch.tensor.load %6, offsets = [0], sizes = [5120], strides = [1] : !flow.dispatch.tensor<readonly:tensor<5120xf16>> -> tensor<5120xf16>
        %10 = flow.dispatch.tensor.load %7, offsets = [], sizes = [], strides = [] : !flow.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
        %11 = tensor.empty() : tensor<2x1024x5120xi8>
        %12 = flow.dispatch.tensor.load %5, offsets = [0, 0, 5120], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
        %13 = flow.dispatch.tensor.load %5, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x1024x10240xf16>> -> tensor<2x1024x5120xf16>
        %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%13, %12, %9, %10 : tensor<2x1024x5120xf16>, tensor<2x1024x5120xf16>, tensor<5120xf16>, tensor<f32>) outs(%11 : tensor<2x1024x5120xi8>) {
        ^bb0(%in: f16, %in_4: f16, %in_5: f16, %in_6: f32, %out: i8):
          %15 = math.sqrt %cst_3 : f16
          %16 = arith.divf %in_4, %15 : f16
          %17 = math.erf %16 : f16
          %18 = arith.addf %17, %cst_2 : f16
          %19 = arith.mulf %18, %cst_1 : f16
          %20 = arith.mulf %in_4, %19 : f16
          %21 = arith.mulf %in, %20 : f16
          %22 = arith.mulf %21, %in_5 : f16
          %23 = arith.truncf %in_6 : f32 to f16
          %24 = arith.divf %22, %23 : f16
          %25 = math.roundeven %24 : f16
          %26 = arith.cmpf ult, %25, %cst_0 : f16
          %27 = arith.select %26, %cst_0, %25 : f16
          %28 = arith.cmpf ugt, %27, %cst : f16
          %29 = arith.select %28, %cst, %27 : f16
          %30 = arith.fptosi %29 : f16 to i8
          linalg.yield %30 : i8
        } -> tensor<2x1024x5120xi8>
        flow.dispatch.tensor.store %14, %8, offsets = [0, 0, 0], sizes = [2, 1024, 5120], strides = [1, 1, 1] : tensor<2x1024x5120xi8> -> !flow.dispatch.tensor<writeonly:tensor<2x1024x5120xi8>>
        return
      }
    }
  }
}

// CDNA3-LABEL: hal.executable public @erf_dispatch
// CDNA3: hal.executable.variant public @rocm
// CDNA3-COUNT-19: llvm.select {{.*}} : vector<8xi1>, vector<8xf32>
// CDNA3-COUNT-8: llvm.intr.fma{{.*}} : (vector<8xf32>, vector<8xf32>, vector<8xf32>) -> vector<8xf32>
// CDNA3: %[[RESULT:.+]] = llvm.fptosi {{.*}} : vector<8xf16> to vector<8xi8>
// CDNA3: llvm.store %[[RESULT]], %{{.*}} : vector<8xi8>, !llvm.ptr<1>
