-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
401 additions
and
2 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
31 changes: 31 additions & 0 deletions
31
test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc | ||
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc | ||
// RUN: mkdir -p data | ||
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout | ||
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s | ||
// CHECK: TEST PASSED | ||
// Softmax over a 1024-element bf16 buffer, written as three scalar affine loops. | ||
// %arg0 is the input and is overwritten with the exponentials; %arg1 receives | ||
// the normalized result. This variant does not subtract the maximum first. | ||
module { | ||
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { | ||
%cst = arith.constant 1.000000e+00 : f32 | ||
%cst_0 = arith.constant 0.000000e+00 : f32 | ||
// Pass 1: replace every element of %arg0 with its exponential (in place). | ||
affine.for %arg2 = 0 to 1024 { | ||
%3 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%4 = math.exp %3 : bf16 | ||
affine.store %4, %arg0[%arg2] : memref<1024xbf16> | ||
} | ||
// Pass 2: sum the exponentials, extending each to f32 and accumulating in f32 | ||
// starting from 0.0. | ||
%0 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst_0) -> (f32) { | ||
%3 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%4 = arith.extf %3 : bf16 to f32 | ||
%5 = arith.addf %arg3, %4 : f32 | ||
affine.yield %5 : f32 | ||
} | ||
// Reciprocal of the sum, computed in f32 and then truncated to bf16. | ||
%1 = arith.divf %cst, %0 : f32 | ||
%2 = arith.truncf %1 : f32 to bf16 | ||
// Pass 3: scale each exponential by the reciprocal and store it to %arg1. | ||
affine.for %arg2 = 0 to 1024 { | ||
%3 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%4 = arith.mulf %3, %2 : bf16 | ||
affine.store %4, %arg1[%arg2] : memref<1024xbf16> | ||
} | ||
return | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Buffer sizes, in elements, shared with the testbench. | ||
#pragma once | ||
// Element count of the testbench input buffer (g_in0). | ||
constexpr unsigned const IN0_SIZE = 1024; | ||
// Element count of the testbench output buffers (g_out0, g_out0Ref). | ||
constexpr unsigned const OUT0_SIZE = 1024; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
// Cycle count: 3245 | ||
#include "lut_based_ops.h" | ||
|
||
// Vectorized bf16 softmax kernel over 1024 elements, processed 16 lanes at a time. | ||
// v1: input buffer; it is overwritten in the first loop with the getExpBf16 results. | ||
// v2: output buffer; receives v1[i] scaled by the value returned from getInvBf16. | ||
// NOTE(review): getExpBf16/getInvBf16 come from lut_based_ops.h and are presumably | ||
// LUT-based exp and reciprocal -- confirm against that header. | ||
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { | ||
int32_t v3 = 0; | ||
int32_t v4 = 4; | ||
int32_t v5 = 8; | ||
int32_t v6 = 16; | ||
int32_t v7 = 32; | ||
v16float v8 = broadcast_zero_float(); | ||
bfloat16 v9 = 0.0e+00; | ||
size_t v10 = 0; | ||
size_t v11 = 1024; | ||
size_t v12 = 16; | ||
// Pass 1: load each 16-lane chunk of v1, apply getExpBf16, convert back to bf16 | ||
// and store the result over the same chunk of v1. | ||
for (size_t v13 = v10; v13 < v11; v13 += v12) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v14 = *(v16bfloat16 *)(v1 + v13); | ||
v16accfloat v15 = getExpBf16(v14); | ||
v16bfloat16 v16 = to_v16bfloat16(v15); | ||
*(v16bfloat16 *)(v1 + v13) = v16; | ||
} | ||
size_t v17 = 0; | ||
size_t v18 = 1024; | ||
size_t v19 = 16; | ||
v16float v20; | ||
v16float v21 = v8; | ||
// Pass 2: accumulate all 16-lane chunks of v1 lane-wise into the float vector v21, | ||
// which starts as the zero vector v8. | ||
for (size_t v22 = v17; v22 < v18; v22 += v19) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v23 = *(v16bfloat16 *)(v1 + v22); | ||
v16accfloat v24 = ups_to_v16accfloat(v23); | ||
v16accfloat v25 = v16accfloat(v21); | ||
v16accfloat v26 = add(v24, v25); | ||
v16float v27 = v16float(v26); | ||
v21 = v27; | ||
} | ||
v20 = v21; | ||
// Horizontal reduction of the 16 float lanes: the vector is added to a copy of | ||
// itself produced by shift_bytes with offsets 32, 16, 8 and 4, and element 0 is | ||
// then extracted as the total. | ||
// NOTE(review): assumes shift_bytes moves lanes toward index 0 -- TODO confirm. | ||
v16float v28 = shift_bytes(v20, v20, v7); | ||
v16accfloat v29 = v16accfloat(v20); | ||
v16accfloat v30 = v16accfloat(v28); | ||
v16accfloat v31 = add(v29, v30); | ||
v16float v32 = v16float(v31); | ||
v16float v33 = shift_bytes(v32, v32, v6); | ||
v16accfloat v34 = v16accfloat(v32); | ||
v16accfloat v35 = v16accfloat(v33); | ||
v16accfloat v36 = add(v34, v35); | ||
v16float v37 = v16float(v36); | ||
v16float v38 = shift_bytes(v37, v37, v5); | ||
v16accfloat v39 = v16accfloat(v37); | ||
v16accfloat v40 = v16accfloat(v38); | ||
v16accfloat v41 = add(v39, v40); | ||
v16float v42 = v16float(v41); | ||
v16float v43 = shift_bytes(v42, v42, v4); | ||
v16accfloat v44 = v16accfloat(v42); | ||
v16accfloat v45 = v16accfloat(v43); | ||
v16accfloat v46 = add(v44, v45); | ||
v16float v47 = v16float(v46); | ||
float v48 = extract_elem(v47, v3); | ||
// Build the 32-lane scale operand: 16 lanes of the getInvBf16 result (v51) | ||
// concatenated with 16 lanes of zero (v53). | ||
bfloat16 v49 = getInvBf16(v48); | ||
v32bfloat16 v50 = broadcast_to_v32bfloat16(v49); | ||
v16bfloat16 v51 = extract_v16bfloat16(v50, 0); | ||
v32bfloat16 v52 = broadcast_to_v32bfloat16(v9); | ||
v16bfloat16 v53 = extract_v16bfloat16(v52, 0); | ||
v32bfloat16 v54 = concat(v51, v53); | ||
size_t v55 = 0; | ||
size_t v56 = 1024; | ||
size_t v57 = 16; | ||
// Pass 3: pad each 16-lane chunk of v1 with the zero half, multiply it by the | ||
// scale operand and store the bf16 result to the matching chunk of v2. | ||
for (size_t v58 = v55; v58 < v56; v58 += v57) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v59 = *(v16bfloat16 *)(v1 + v58); | ||
v32bfloat16 v60 = concat(v59, v53); | ||
v16accfloat v61 = mul_elem_16_2(v54, v60); | ||
v16bfloat16 v62 = to_v16bfloat16(v61); | ||
*(v16bfloat16 *)(v2 + v58) = v62; | ||
} | ||
return; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#include "../common/testbench.h" | ||
#include "defines.h" | ||
#include <algorithm> | ||
#include <cstdint> | ||
#include <cstdio> | ||
#include <cstdlib> | ||
|
||
// Kernel under test; defined in the separately compiled dut.cc. | ||
void dut(bfloat16 *restrict in0, bfloat16 *restrict out0); | ||
// Scalar reference implementation, defined at the bottom of this file. | ||
void dut_ref(bfloat16 *in0, bfloat16 *out0); | ||
|
||
// 32-byte-aligned buffers: kernel input, kernel output, and reference output. | ||
alignas(32) bfloat16 g_in0[IN0_SIZE]; | ||
alignas(32) bfloat16 g_out0[OUT0_SIZE]; | ||
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE]; | ||
|
||
int main(int argc, char *argv[]) { | ||
std::string dataDir(TO_STR(DATA_DIR)); | ||
srand(10); | ||
std::generate(g_in0, g_in0 + IN0_SIZE, | ||
[&]() { return random_bfloat16(-2, 0, 2); }); | ||
|
||
writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt"); | ||
|
||
chess_memory_fence(); | ||
auto cyclesBegin = chess_cycle_count(); | ||
dut(g_in0, g_out0); | ||
auto cyclesEnd = chess_cycle_count(); | ||
chess_memory_fence(); | ||
|
||
auto cycleCount = (int)(cyclesEnd - cyclesBegin); | ||
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt"); | ||
|
||
writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt"); | ||
|
||
dut_ref(g_in0, g_out0Ref); | ||
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt"); | ||
|
||
bool ok = true; | ||
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2); | ||
|
||
if (ok) | ||
printf("TEST PASSED\n"); | ||
else | ||
printf("TEST FAILED\n"); | ||
|
||
return ok ? 0 : 1; | ||
} | ||
|
||
// Scalar reference softmax over IN0_SIZE elements. | ||
// in0: input values; each element is overwritten with its exponential rounded | ||
// to bfloat16. | ||
// out0: receives in0[k] multiplied by the bfloat16 reciprocal of the float sum | ||
// of those rounded exponentials. | ||
void dut_ref(bfloat16 *in0, bfloat16 *out0) { | ||
float total = 0.0f; | ||
|
||
unsigned idx = 0; | ||
while (idx < IN0_SIZE) { | ||
float x = in0[idx]; | ||
// Round through float first, then to bfloat16, before accumulating. | ||
float e = exp(x); | ||
in0[idx] = (bfloat16)e; | ||
total += in0[idx]; | ||
++idx; | ||
} | ||
|
||
bfloat16 scale = (bfloat16)(1.0f / total); | ||
for (idx = 0; idx != IN0_SIZE; ++idx) { | ||
out0[idx] = in0[idx] * scale; | ||
} | ||
} |
40 changes: 40 additions & 0 deletions
40
test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc | ||
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc | ||
// RUN: mkdir -p data | ||
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout | ||
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s | ||
// CHECK: TEST PASSED | ||
// Softmax over a 1024-element bf16 buffer with the maximum subtracted before the | ||
// exponential. %arg0 is the input and is overwritten with exp(x - max); %arg1 | ||
// receives the normalized result. | ||
module { | ||
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { | ||
%cst = arith.constant 0.000000e+00 : f32 | ||
%cst_0 = arith.constant 1.000000e+00 : f32 | ||
%cst_1 = arith.constant 0.000000e+00 : bf16 | ||
// Initial running maximum: every lane holds bit pattern 0xFF80 (bf16 -inf). | ||
%cst_2 = arith.constant dense<0xFF80> : vector<32xbf16> | ||
// Pass 0: lane-wise running maximum over 32-element reads of %arg0; this loop | ||
// is already expressed on vectors (step 32). | ||
%0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) { | ||
%5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16> | ||
%6 = arith.maxf %arg3, %5 : vector<32xbf16> | ||
affine.yield %6 : vector<32xbf16> | ||
} | ||
// Collapse the 32 lane maxima into the scalar maximum %1. | ||
%1 = vector.reduction <maxf>, %0 : vector<32xbf16> into bf16 | ||
// Pass 1: replace every element of %arg0 with exp(x - max) (in place). | ||
affine.for %arg2 = 0 to 1024 { | ||
%5 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%6 = arith.subf %5, %1 : bf16 | ||
%7 = math.exp %6 : bf16 | ||
affine.store %7, %arg0[%arg2] : memref<1024xbf16> | ||
} | ||
// Pass 2: sum the exponentials, extending each to f32 and accumulating in f32 | ||
// starting from 0.0. | ||
%2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) { | ||
%5 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%6 = arith.extf %5 : bf16 to f32 | ||
%7 = arith.addf %arg3, %6 : f32 | ||
affine.yield %7 : f32 | ||
} | ||
// Reciprocal of the sum, computed in f32 and then truncated to bf16. | ||
%3 = arith.divf %cst_0, %2 : f32 | ||
%4 = arith.truncf %3 : f32 to bf16 | ||
// Pass 3: scale each exponential by the reciprocal and store it to %arg1. | ||
affine.for %arg2 = 0 to 1024 { | ||
%5 = affine.load %arg0[%arg2] : memref<1024xbf16> | ||
%6 = arith.mulf %5, %4 : bf16 | ||
affine.store %6, %arg1[%arg2] : memref<1024xbf16> | ||
} | ||
return | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
// Buffer sizes, in elements, for the bf16_softmax_2 test. | ||
// NOTE(review): presumably consumed by this test's testbench, which is not | ||
// visible here; the value matches the memref<1024xbf16> arguments of @dut. | ||
#pragma once | ||
constexpr unsigned const IN0_SIZE = 1024; | ||
constexpr unsigned const OUT0_SIZE = 1024; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
// Cycle count: 3712 | ||
#include "lut_based_ops.h" | ||
|
||
// Vectorized bf16 softmax kernel over 1024 elements with max subtraction. | ||
// v1: input buffer; it is overwritten in the second loop with the getExpBf16 | ||
// results of (element - max). | ||
// v2: output buffer; receives v1[i] scaled by the value returned from getInvBf16. | ||
// NOTE(review): getExpBf16/getInvBf16 come from lut_based_ops.h and are presumably | ||
// LUT-based exp and reciprocal -- confirm against that header. | ||
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { | ||
int32_t v3 = 0; | ||
int32_t v4 = 2; | ||
int32_t v5 = 4; | ||
int32_t v6 = 8; | ||
int32_t v7 = 16; | ||
int32_t v8 = 32; | ||
v16float v9 = broadcast_zero_float(); | ||
bfloat16 v10 = 0.0e+00; | ||
// Initial running maximum: a very large negative value in every lane. | ||
v32bfloat16 v11 = broadcast_to_v32bfloat16( | ||
(bfloat16)-338953138925153547590470800371487866880.000000); | ||
size_t v12 = 0; | ||
size_t v13 = 1024; | ||
size_t v14 = 32; | ||
v32bfloat16 v15; | ||
v32bfloat16 v16 = v11; | ||
// Pass 0: lane-wise running maximum over 32-lane chunks of v1. | ||
for (size_t v17 = v12; v17 < v13; v17 += v14) | ||
chess_prepare_for_pipelining chess_loop_range(32, 32) { | ||
v32bfloat16 v18 = *(v32bfloat16 *)(v1 + v17); | ||
v32bfloat16 v19 = max(v16, v18); | ||
v16 = v19; | ||
} | ||
v15 = v16; | ||
// Horizontal max over the 32 bf16 lanes: max of the vector with a copy of itself | ||
// produced by shift_bytes with offsets 32, 16, 8, 4 and 2; element 0 is then | ||
// extracted as the overall maximum. | ||
// NOTE(review): assumes shift_bytes moves lanes toward index 0 -- TODO confirm. | ||
v32bfloat16 v20 = shift_bytes(v15, v15, v8); | ||
v32bfloat16 v21 = max(v15, v20); | ||
v32bfloat16 v22 = shift_bytes(v21, v21, v7); | ||
v32bfloat16 v23 = max(v21, v22); | ||
v32bfloat16 v24 = shift_bytes(v23, v23, v6); | ||
v32bfloat16 v25 = max(v23, v24); | ||
v32bfloat16 v26 = shift_bytes(v25, v25, v5); | ||
v32bfloat16 v27 = max(v25, v26); | ||
v32bfloat16 v28 = shift_bytes(v27, v27, v4); | ||
v32bfloat16 v29 = max(v27, v28); | ||
bfloat16 v30 = extract_elem(v29, v3); | ||
// Broadcast the maximum and widen 16 lanes of it to the accumulator type (v33) | ||
// so it can be subtracted inside the loop below. | ||
v32bfloat16 v31 = broadcast_to_v32bfloat16(v30); | ||
v16bfloat16 v32 = extract_v16bfloat16(v31, 0); | ||
v16accfloat v33 = ups_to_v16accfloat(v32); | ||
size_t v34 = 0; | ||
size_t v35 = 1024; | ||
size_t v36 = 16; | ||
// Pass 1: for each 16-lane chunk, subtract the maximum, apply getExpBf16 and | ||
// store the bf16 result over the same chunk of v1. | ||
for (size_t v37 = v34; v37 < v35; v37 += v36) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v38 = *(v16bfloat16 *)(v1 + v37); | ||
v16accfloat v39 = ups_to_v16accfloat(v38); | ||
v16accfloat v40 = sub(v39, v33); | ||
v16bfloat16 v41 = to_v16bfloat16(v40); | ||
v16accfloat v42 = getExpBf16(v41); | ||
v16bfloat16 v43 = to_v16bfloat16(v42); | ||
*(v16bfloat16 *)(v1 + v37) = v43; | ||
} | ||
size_t v44 = 0; | ||
size_t v45 = 1024; | ||
size_t v46 = 16; | ||
v16float v47; | ||
v16float v48 = v9; | ||
// Pass 2: accumulate all 16-lane chunks of v1 lane-wise into the float vector v48, | ||
// which starts as the zero vector v9. | ||
for (size_t v49 = v44; v49 < v45; v49 += v46) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v50 = *(v16bfloat16 *)(v1 + v49); | ||
v16accfloat v51 = ups_to_v16accfloat(v50); | ||
v16accfloat v52 = v16accfloat(v48); | ||
v16accfloat v53 = add(v51, v52); | ||
v16float v54 = v16float(v53); | ||
v48 = v54; | ||
} | ||
v47 = v48; | ||
// Horizontal sum of the 16 float lanes using shift_bytes offsets 32, 16, 8 and 4; | ||
// element 0 is then extracted as the total. | ||
v16float v55 = shift_bytes(v47, v47, v8); | ||
v16accfloat v56 = v16accfloat(v47); | ||
v16accfloat v57 = v16accfloat(v55); | ||
v16accfloat v58 = add(v56, v57); | ||
v16float v59 = v16float(v58); | ||
v16float v60 = shift_bytes(v59, v59, v7); | ||
v16accfloat v61 = v16accfloat(v59); | ||
v16accfloat v62 = v16accfloat(v60); | ||
v16accfloat v63 = add(v61, v62); | ||
v16float v64 = v16float(v63); | ||
v16float v65 = shift_bytes(v64, v64, v6); | ||
v16accfloat v66 = v16accfloat(v64); | ||
v16accfloat v67 = v16accfloat(v65); | ||
v16accfloat v68 = add(v66, v67); | ||
v16float v69 = v16float(v68); | ||
v16float v70 = shift_bytes(v69, v69, v5); | ||
v16accfloat v71 = v16accfloat(v69); | ||
v16accfloat v72 = v16accfloat(v70); | ||
v16accfloat v73 = add(v71, v72); | ||
v16float v74 = v16float(v73); | ||
float v75 = extract_elem(v74, v3); | ||
// Build the 32-lane scale operand: 16 lanes of the getInvBf16 result (v78) | ||
// concatenated with 16 lanes of zero (v80). | ||
bfloat16 v76 = getInvBf16(v75); | ||
v32bfloat16 v77 = broadcast_to_v32bfloat16(v76); | ||
v16bfloat16 v78 = extract_v16bfloat16(v77, 0); | ||
v32bfloat16 v79 = broadcast_to_v32bfloat16(v10); | ||
v16bfloat16 v80 = extract_v16bfloat16(v79, 0); | ||
v32bfloat16 v81 = concat(v78, v80); | ||
size_t v82 = 0; | ||
size_t v83 = 1024; | ||
size_t v84 = 16; | ||
// Pass 3: pad each 16-lane chunk of v1 with the zero half, multiply it by the | ||
// scale operand and store the bf16 result to the matching chunk of v2. | ||
for (size_t v85 = v82; v85 < v83; v85 += v84) | ||
chess_prepare_for_pipelining chess_loop_range(64, 64) { | ||
v16bfloat16 v86 = *(v16bfloat16 *)(v1 + v85); | ||
v32bfloat16 v87 = concat(v86, v80); | ||
v16accfloat v88 = mul_elem_16_2(v81, v87); | ||
v16bfloat16 v89 = to_v16bfloat16(v88); | ||
*(v16bfloat16 *)(v2 + v85) = v89; | ||
} | ||
return; | ||
} |
Oops, something went wrong.