Skip to content

Commit

Permalink
Add softmax test cases (#635)
Browse files Browse the repository at this point in the history
  • Loading branch information
linay-xsj authored Sep 12, 2023
1 parent 47ff7d3 commit df94633
Show file tree
Hide file tree
Showing 10 changed files with 401 additions and 2 deletions.
Binary file modified aie_runtime_lib/AIE2/liblut_based_ops.a
Binary file not shown.
15 changes: 13 additions & 2 deletions lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#include "mlir/Support/IndentedOstream.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
Expand Down Expand Up @@ -213,6 +215,8 @@ struct CppEmitter {
/// names of values in a scope.
std::stack<int64_t> valueInScopeCount;
std::stack<int64_t> labelInScopeCount;

llvm::SmallSet<StringRef, 16> includeNames;
};
} // namespace

Expand Down Expand Up @@ -2867,9 +2871,16 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
LogicalResult status =
llvm::TypeSwitch<Operation *, LogicalResult>(&op)
// EmitC ops.
.Case<emitc::ApplyOp, emitc::CallOp, emitc::ConstantOp,
emitc::IncludeOp>(
.Case<emitc::ApplyOp, emitc::CallOp, emitc::ConstantOp>(
[&](auto op) { return printOperation(*this, op); })
.Case<emitc::IncludeOp>([&](auto op) {
StringRef name = op.getInclude();
if (!includeNames.count(name)) {
includeNames.insert(name);
return printOperation(*this, op);
}
return success();
})
// SCF ops.
.Case<scf::ForOp, scf::IfOp, scf::YieldOp>(
[&](auto op) { return printOperation(*this, op); })
Expand Down
31 changes: 31 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED
// Scalar softmax-style kernel over 1024 bf16 elements, written as three
// affine loops. Note that the first loop overwrites %arg0 in place, so the
// caller's input buffer holds exp(input) after the call.
module {
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
// %cst is the numerator of the reciprocal below; %cst_0 seeds the sum.
%cst = arith.constant 1.000000e+00 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
// Pass 1: %arg0[i] = exp(%arg0[i]) (in-place update of the input buffer).
affine.for %arg2 = 0 to 1024 {
%3 = affine.load %arg0[%arg2] : memref<1024xbf16>
%4 = math.exp %3 : bf16
affine.store %4, %arg0[%arg2] : memref<1024xbf16>
}
// Pass 2: sum the exponentials, widening each bf16 element to f32 first.
%0 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst_0) -> (f32) {
%3 = affine.load %arg0[%arg2] : memref<1024xbf16>
%4 = arith.extf %3 : bf16 to f32
%5 = arith.addf %arg3, %4 : f32
affine.yield %5 : f32
}
// Reciprocal of the sum, computed in f32 and then narrowed to bf16.
%1 = arith.divf %cst, %0 : f32
%2 = arith.truncf %1 : f32 to bf16
// Pass 3: %arg1[i] = %arg0[i] * (1 / sum), multiplied in bf16.
affine.for %arg2 = 0 to 1024 {
%3 = affine.load %arg0[%arg2] : memref<1024xbf16>
%4 = arith.mulf %3, %2 : bf16
affine.store %4, %arg1[%arg2] : memref<1024xbf16>
}
return
}
}
3 changes: 3 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
// Number of elements in the testbench input buffer.
constexpr unsigned IN0_SIZE = 1024;
// Number of elements in the testbench output buffers.
constexpr unsigned OUT0_SIZE = 1024;
76 changes: 76 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Cycle count: 3245
#include "lut_based_ops.h"

// Generated vector kernel for the bf16 softmax test: v1 is the input buffer
// (overwritten in place with the exponentials) and v2 receives the scaled
// result. Each loop handles 16 bf16 elements per iteration over 1024 elements.
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) {
// v3 is the lane index passed to extract_elem; v4..v7 are the byte amounts
// passed to shift_bytes in the reduction after the second loop.
int32_t v3 = 0;
int32_t v4 = 4;
int32_t v5 = 8;
int32_t v6 = 16;
int32_t v7 = 32;
// Zero vector used as the initial value of the running sum.
v16float v8 = broadcast_zero_float();
bfloat16 v9 = 0.0e+00;
size_t v10 = 0;
size_t v11 = 1024;
size_t v12 = 16;
// Loop 1: load 16 bf16 values from v1, apply getExpBf16 (declared in
// lut_based_ops.h; presumably a LUT-based exp -- confirm), convert the
// accumulator back to bf16 and store it to the same location in v1.
for (size_t v13 = v10; v13 < v11; v13 += v12)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v14 = *(v16bfloat16 *)(v1 + v13);
v16accfloat v15 = getExpBf16(v14);
v16bfloat16 v16 = to_v16bfloat16(v15);
*(v16bfloat16 *)(v1 + v13) = v16;
}
size_t v17 = 0;
size_t v18 = 1024;
size_t v19 = 16;
v16float v20;
v16float v21 = v8;
// Loop 2: accumulate the values now stored in v1 into the 16-lane float
// vector v21, widening each bf16 chunk via ups_to_v16accfloat before adding.
for (size_t v22 = v17; v22 < v18; v22 += v19)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v23 = *(v16bfloat16 *)(v1 + v22);
v16accfloat v24 = ups_to_v16accfloat(v23);
v16accfloat v25 = v16accfloat(v21);
v16accfloat v26 = add(v24, v25);
v16float v27 = v16float(v26);
v21 = v27;
}
v20 = v21;
// Combine the 16 partial sums: four shift_bytes + add steps with byte
// amounts 32, 16, 8 and 4, after which lane v3 (0) is read out as the total.
v16float v28 = shift_bytes(v20, v20, v7);
v16accfloat v29 = v16accfloat(v20);
v16accfloat v30 = v16accfloat(v28);
v16accfloat v31 = add(v29, v30);
v16float v32 = v16float(v31);
v16float v33 = shift_bytes(v32, v32, v6);
v16accfloat v34 = v16accfloat(v32);
v16accfloat v35 = v16accfloat(v33);
v16accfloat v36 = add(v34, v35);
v16float v37 = v16float(v36);
v16float v38 = shift_bytes(v37, v37, v5);
v16accfloat v39 = v16accfloat(v37);
v16accfloat v40 = v16accfloat(v38);
v16accfloat v41 = add(v39, v40);
v16float v42 = v16float(v41);
v16float v43 = shift_bytes(v42, v42, v4);
v16accfloat v44 = v16accfloat(v42);
v16accfloat v45 = v16accfloat(v43);
v16accfloat v46 = add(v44, v45);
v16float v47 = v16float(v46);
float v48 = extract_elem(v47, v3);
// getInvBf16 comes from lut_based_ops.h; presumably it returns 1 / v48 as
// bf16 (the MLIR source divides 1.0 by the sum here) -- confirm.
bfloat16 v49 = getInvBf16(v48);
// Build the 32-lane multiplier operand: v51 holds the broadcast scale factor
// and v53 the broadcast zero (v9); concat joins them into v54.
v32bfloat16 v50 = broadcast_to_v32bfloat16(v49);
v16bfloat16 v51 = extract_v16bfloat16(v50, 0);
v32bfloat16 v52 = broadcast_to_v32bfloat16(v9);
v16bfloat16 v53 = extract_v16bfloat16(v52, 0);
v32bfloat16 v54 = concat(v51, v53);
size_t v55 = 0;
size_t v56 = 1024;
size_t v57 = 16;
// Loop 3: load 16 values from v1, pad them with the zero half v53 to 32
// lanes, multiply with v54 via mul_elem_16_2 and store the bf16 result to v2.
for (size_t v58 = v55; v58 < v56; v58 += v57)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v59 = *(v16bfloat16 *)(v1 + v58);
v32bfloat16 v60 = concat(v59, v53);
v16accfloat v61 = mul_elem_16_2(v54, v60);
v16bfloat16 v62 = to_v16bfloat16(v61);
*(v16bfloat16 *)(v2 + v58) = v62;
}
return;
}
62 changes: 62 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax/testbench.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include "../common/testbench.h"
#include "defines.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Kernel under test; only declared here, the definition is linked in from a
// separate translation unit (dut.cc on the test's compile line).
void dut(bfloat16 *restrict in0, bfloat16 *restrict out0);
// Scalar reference implementation, defined at the bottom of this file.
void dut_ref(bfloat16 *in0, bfloat16 *out0);

// Input buffer, DUT output buffer and reference output buffer. All are
// 32-byte aligned (presumably to satisfy the vector loads/stores in dut).
alignas(32) bfloat16 g_in0[IN0_SIZE];
alignas(32) bfloat16 g_out0[OUT0_SIZE];
alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];

int main(int argc, char *argv[]) {
std::string dataDir(TO_STR(DATA_DIR));
srand(10);
std::generate(g_in0, g_in0 + IN0_SIZE,
[&]() { return random_bfloat16(-2, 0, 2); });

writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");

chess_memory_fence();
auto cyclesBegin = chess_cycle_count();
dut(g_in0, g_out0);
auto cyclesEnd = chess_cycle_count();
chess_memory_fence();

auto cycleCount = (int)(cyclesEnd - cyclesBegin);
reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");

writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");

dut_ref(g_in0, g_out0Ref);
writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");

bool ok = true;
ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);

if (ok)
printf("TEST PASSED\n");
else
printf("TEST FAILED\n");

return ok ? 0 : 1;
}

// Scalar reference for the kernel under test.
//
// Replaces every element of in0 with its exponential (rounded to bfloat16),
// sums those rounded values in float, and writes each exponential multiplied
// by the bfloat16-rounded reciprocal of the sum to out0.
//
// Note: in0 is modified in place.
void dut_ref(bfloat16 *in0, bfloat16 *out0) {
  float expTotal = 0.0f;

  // Pass 1: exponentiate in place and accumulate the rounded results.
  for (unsigned idx = 0; idx < IN0_SIZE; ++idx) {
    float value = in0[idx];
    float expValue = exp(value);
    in0[idx] = (bfloat16)expValue;
    expTotal += in0[idx];
  }

  // Pass 2: scale every exponential by 1 / sum, with the factor held in bf16.
  bfloat16 scale = (bfloat16)(1.0f / expTotal);
  for (unsigned idx = 0; idx < IN0_SIZE; ++idx) {
    out0[idx] = in0[idx] * scale;
  }
}
40 changes: 40 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
// RUN: mkdir -p data
// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
// CHECK: TEST PASSED
// Softmax-style kernel over 1024 bf16 elements that first subtracts the
// maximum element before exponentiating. As written, %arg0 is overwritten in
// place with exp(input - max).
module {
func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) {
// %cst seeds the f32 sum, %cst_0 is the numerator of the reciprocal and
// %cst_1 is the padding value for vector.transfer_read.
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.000000e+00 : f32
%cst_1 = arith.constant 0.000000e+00 : bf16
// Initial value of the running maximum. 0xFF80 is the bf16 bit pattern with
// the sign bit set, all exponent bits set and a zero mantissa (-infinity).
%cst_2 = arith.constant dense<0xFF80> : vector<32xbf16>
// Pass 1 (already vectorized, 32 lanes per step): lane-wise running maximum.
%0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) {
%5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
%6 = arith.maxf %arg3, %5 : vector<32xbf16>
affine.yield %6 : vector<32xbf16>
}
// Collapse the 32 lane maxima into a single scalar maximum.
%1 = vector.reduction <maxf>, %0 : vector<32xbf16> into bf16
// Pass 2: %arg0[i] = exp(%arg0[i] - max), stored back in place.
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.subf %5, %1 : bf16
%7 = math.exp %6 : bf16
affine.store %7, %arg0[%arg2] : memref<1024xbf16>
}
// Pass 3: sum the exponentials, widening each bf16 element to f32 first.
%2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.extf %5 : bf16 to f32
%7 = arith.addf %arg3, %6 : f32
affine.yield %7 : f32
}
// Reciprocal of the sum, computed in f32 and then narrowed to bf16.
%3 = arith.divf %cst_0, %2 : f32
%4 = arith.truncf %3 : f32 to bf16
// Pass 4: %arg1[i] = %arg0[i] * (1 / sum), multiplied in bf16.
affine.for %arg2 = 0 to 1024 {
%5 = affine.load %arg0[%arg2] : memref<1024xbf16>
%6 = arith.mulf %5, %4 : bf16
affine.store %6, %arg1[%arg2] : memref<1024xbf16>
}
return
}
}
3 changes: 3 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax_2/defines.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#pragma once
// Input element count (presumably sizes the testbench input buffer, as in the
// sibling bf16_softmax test -- confirm against this test's testbench.cc).
constexpr unsigned IN0_SIZE = 1024;
// Output element count (same assumption as above).
constexpr unsigned OUT0_SIZE = 1024;
108 changes: 108 additions & 0 deletions test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Cycle count: 3712
#include "lut_based_ops.h"

// Generated vector kernel for the max-subtracting bf16 softmax test: v1 is
// the input buffer (overwritten in place by the second loop) and v2 receives
// the scaled result. Both buffers are walked over 1024 elements.
void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) {
// v3 is the lane index passed to extract_elem; v4..v8 are the byte amounts
// passed to shift_bytes in the two reductions below.
int32_t v3 = 0;
int32_t v4 = 2;
int32_t v5 = 4;
int32_t v6 = 8;
int32_t v7 = 16;
int32_t v8 = 32;
// Zero vector used as the initial value of the running sum.
v16float v9 = broadcast_zero_float();
bfloat16 v10 = 0.0e+00;
// Initial value of the running maximum: a very large negative literal
// (presumably the emitter's rendering of the -infinity seed in the MLIR
// source -- confirm).
v32bfloat16 v11 = broadcast_to_v32bfloat16(
(bfloat16)-338953138925153547590470800371487866880.000000);
size_t v12 = 0;
size_t v13 = 1024;
size_t v14 = 32;
v32bfloat16 v15;
v32bfloat16 v16 = v11;
// Loop 1: lane-wise running maximum over v1, 32 bf16 values per iteration.
for (size_t v17 = v12; v17 < v13; v17 += v14)
chess_prepare_for_pipelining chess_loop_range(32, 32) {
v32bfloat16 v18 = *(v32bfloat16 *)(v1 + v17);
v32bfloat16 v19 = max(v16, v18);
v16 = v19;
}
v15 = v16;
// Combine the 32 lane maxima: five shift_bytes + max steps with byte amounts
// 32, 16, 8, 4 and 2, after which lane v3 (0) is read out as the maximum.
v32bfloat16 v20 = shift_bytes(v15, v15, v8);
v32bfloat16 v21 = max(v15, v20);
v32bfloat16 v22 = shift_bytes(v21, v21, v7);
v32bfloat16 v23 = max(v21, v22);
v32bfloat16 v24 = shift_bytes(v23, v23, v6);
v32bfloat16 v25 = max(v23, v24);
v32bfloat16 v26 = shift_bytes(v25, v25, v5);
v32bfloat16 v27 = max(v25, v26);
v32bfloat16 v28 = shift_bytes(v27, v27, v4);
v32bfloat16 v29 = max(v27, v28);
bfloat16 v30 = extract_elem(v29, v3);
// Broadcast the scalar maximum and widen 16 lanes of it to an accumulator so
// it can be subtracted inside the next loop.
v32bfloat16 v31 = broadcast_to_v32bfloat16(v30);
v16bfloat16 v32 = extract_v16bfloat16(v31, 0);
v16accfloat v33 = ups_to_v16accfloat(v32);
size_t v34 = 0;
size_t v35 = 1024;
size_t v36 = 16;
// Loop 2: for each 16-element chunk, subtract the maximum, apply getExpBf16
// (declared in lut_based_ops.h; presumably a LUT-based exp -- confirm) and
// store the bf16 result back to the same location in v1.
for (size_t v37 = v34; v37 < v35; v37 += v36)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v38 = *(v16bfloat16 *)(v1 + v37);
v16accfloat v39 = ups_to_v16accfloat(v38);
v16accfloat v40 = sub(v39, v33);
v16bfloat16 v41 = to_v16bfloat16(v40);
v16accfloat v42 = getExpBf16(v41);
v16bfloat16 v43 = to_v16bfloat16(v42);
*(v16bfloat16 *)(v1 + v37) = v43;
}
size_t v44 = 0;
size_t v45 = 1024;
size_t v46 = 16;
v16float v47;
v16float v48 = v9;
// Loop 3: accumulate the values now stored in v1 into the 16-lane float
// vector v48, widening each bf16 chunk via ups_to_v16accfloat before adding.
for (size_t v49 = v44; v49 < v45; v49 += v46)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v50 = *(v16bfloat16 *)(v1 + v49);
v16accfloat v51 = ups_to_v16accfloat(v50);
v16accfloat v52 = v16accfloat(v48);
v16accfloat v53 = add(v51, v52);
v16float v54 = v16float(v53);
v48 = v54;
}
v47 = v48;
// Combine the 16 partial sums: four shift_bytes + add steps with byte
// amounts 32, 16, 8 and 4, after which lane v3 (0) is read out as the total.
v16float v55 = shift_bytes(v47, v47, v8);
v16accfloat v56 = v16accfloat(v47);
v16accfloat v57 = v16accfloat(v55);
v16accfloat v58 = add(v56, v57);
v16float v59 = v16float(v58);
v16float v60 = shift_bytes(v59, v59, v7);
v16accfloat v61 = v16accfloat(v59);
v16accfloat v62 = v16accfloat(v60);
v16accfloat v63 = add(v61, v62);
v16float v64 = v16float(v63);
v16float v65 = shift_bytes(v64, v64, v6);
v16accfloat v66 = v16accfloat(v64);
v16accfloat v67 = v16accfloat(v65);
v16accfloat v68 = add(v66, v67);
v16float v69 = v16float(v68);
v16float v70 = shift_bytes(v69, v69, v5);
v16accfloat v71 = v16accfloat(v69);
v16accfloat v72 = v16accfloat(v70);
v16accfloat v73 = add(v71, v72);
v16float v74 = v16float(v73);
float v75 = extract_elem(v74, v3);
// getInvBf16 comes from lut_based_ops.h; presumably it returns 1 / v75 as
// bf16 (the MLIR source divides 1.0 by the sum here) -- confirm.
bfloat16 v76 = getInvBf16(v75);
// Build the 32-lane multiplier operand: v78 holds the broadcast scale factor
// and v80 the broadcast zero (v10); concat joins them into v81.
v32bfloat16 v77 = broadcast_to_v32bfloat16(v76);
v16bfloat16 v78 = extract_v16bfloat16(v77, 0);
v32bfloat16 v79 = broadcast_to_v32bfloat16(v10);
v16bfloat16 v80 = extract_v16bfloat16(v79, 0);
v32bfloat16 v81 = concat(v78, v80);
size_t v82 = 0;
size_t v83 = 1024;
size_t v84 = 16;
// Loop 4: load 16 values from v1, pad them with the zero half v80 to 32
// lanes, multiply with v81 via mul_elem_16_2 and store the bf16 result to v2.
for (size_t v85 = v82; v85 < v83; v85 += v84)
chess_prepare_for_pipelining chess_loop_range(64, 64) {
v16bfloat16 v86 = *(v16bfloat16 *)(v1 + v85);
v32bfloat16 v87 = concat(v86, v80);
v16accfloat v88 = mul_elem_16_2(v81, v87);
v16bfloat16 v89 = to_v16bfloat16(v88);
*(v16bfloat16 *)(v2 + v85) = v89;
}
return;
}
Loading

0 comments on commit df94633

Please sign in to comment.