Add softmax test cases (#635)

Xilinx · Sep 12, 2023 · df94633 · df94633
1 parent 47ff7d3
commit df94633
Show file tree

Hide file tree

Showing 10 changed files with 401 additions and 2 deletions.
diff --git a/aie_runtime_lib/AIE2/liblut_based_ops.a b/aie_runtime_lib/AIE2/liblut_based_ops.a
diff --git a/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp b/lib/Targets/AIEVecToCpp/TranslateAIEVecToCpp.cpp
@@ -27,8 +27,10 @@
 #include "mlir/Support/IndentedOstream.h"
 #include "mlir/Support/MathExtras.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -213,6 +215,8 @@ struct CppEmitter {
   /// names of values in a scope.
   std::stack<int64_t> valueInScopeCount;
   std::stack<int64_t> labelInScopeCount;
+
+  llvm::SmallSet<StringRef, 16> includeNames;
 };
 } // namespace
 
@@ -2867,9 +2871,16 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
   LogicalResult status =
       llvm::TypeSwitch<Operation *, LogicalResult>(&op)
           // EmitC ops.
-          .Case<emitc::ApplyOp, emitc::CallOp, emitc::ConstantOp,
-                emitc::IncludeOp>(
+          .Case<emitc::ApplyOp, emitc::CallOp, emitc::ConstantOp>(
               [&](auto op) { return printOperation(*this, op); })
+          .Case<emitc::IncludeOp>([&](auto op) {
+            // Emit each distinct include only once. insert() reports via
+            // `.second` whether the name was newly added, so one set
+            // operation both records the name and detects duplicates.
+            StringRef name = op.getInclude();
+            if (!includeNames.insert(name).second)
+              return success();
+            return printOperation(*this, op);
+          })
           // SCF ops.
           .Case<scf::ForOp, scf::IfOp, scf::YieldOp>(
               [&](auto op) { return printOperation(*this, op); })

diff --git a/test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir b/test/unit_tests/aievec_tests/bf16_softmax/bf16_softmax.mlir
@@ -0,0 +1,31 @@
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+module {
+  func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { // softmax without max subtraction: %arg0 is the input (overwritten in place below), %arg1 receives the result
+    %cst = arith.constant 1.000000e+00 : f32 // numerator of the reciprocal 1 / sum
+    %cst_0 = arith.constant 0.000000e+00 : f32 // initial value of the sum accumulator
+    affine.for %arg2 = 0 to 1024 { // pass 1: %arg0[i] = exp(%arg0[i])
+      %3 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %4 = math.exp %3 : bf16
+      affine.store %4, %arg0[%arg2] : memref<1024xbf16>
+    }
+    %0 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst_0) -> (f32) { // pass 2: sum the exponentials, accumulating in f32
+      %3 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %4 = arith.extf %3 : bf16 to f32
+      %5 = arith.addf %arg3, %4 : f32
+      affine.yield %5 : f32
+    }
+    %1 = arith.divf %cst, %0 : f32 // 1 / sum, computed once outside the loops
+    %2 = arith.truncf %1 : f32 to bf16 // the reciprocal is narrowed to bf16 before the final multiply
+    affine.for %arg2 = 0 to 1024 { // pass 3: %arg1[i] = %arg0[i] * (1 / sum)
+      %3 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %4 = arith.mulf %3, %2 : bf16
+      affine.store %4, %arg1[%arg2] : memref<1024xbf16>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/bf16_softmax/defines.h b/test/unit_tests/aievec_tests/bf16_softmax/defines.h
@@ -0,0 +1,3 @@
+#pragma once
+// Element count of the kernel's input buffer (in0).
+constexpr unsigned IN0_SIZE = 1024;
+// Element count of the kernel's output buffer (out0).
+constexpr unsigned OUT0_SIZE = 1024;
diff --git a/test/unit_tests/aievec_tests/bf16_softmax/dut.cc b/test/unit_tests/aievec_tests/bf16_softmax/dut.cc
@@ -0,0 +1,76 @@
+// Cycle count: 3245
+#include "lut_based_ops.h"
+
+void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { // softmax kernel: v1 is read and overwritten in place with exponentials, v2 receives the normalized result
+  int32_t v3 = 0; // lane index handed to extract_elem after the reduction
+  int32_t v4 = 4; // v4..v7: byte amounts for the halving shift-and-add reduction below
+  int32_t v5 = 8;
+  int32_t v6 = 16;
+  int32_t v7 = 32;
+  v16float v8 = broadcast_zero_float(); // initial value of the running-sum vector
+  bfloat16 v9 = 0.0e+00; // scalar zero, broadcast later to build the padding half v53
+  size_t v10 = 0;
+  size_t v11 = 1024;
+  size_t v12 = 16;
+  for (size_t v13 = v10; v13 < v11; v13 += v12) // pass 1: 1024 / 16 = 64 iterations, matching chess_loop_range(64, 64)
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v14 = *(v16bfloat16 *)(v1 + v13);
+      v16accfloat v15 = getExpBf16(v14); // presumably element-wise exp provided by lut_based_ops.h -- confirm
+      v16bfloat16 v16 = to_v16bfloat16(v15);
+      *(v16bfloat16 *)(v1 + v13) = v16; // result is stored back over the input buffer
+    }
+  size_t v17 = 0;
+  size_t v18 = 1024;
+  size_t v19 = 16;
+  v16float v20; // receives the final accumulator value after the loop
+  v16float v21 = v8; // loop-carried accumulator, starts at zero
+  for (size_t v22 = v17; v22 < v18; v22 += v19) // pass 2: accumulate the exponentials written by pass 1
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v23 = *(v16bfloat16 *)(v1 + v22);
+      v16accfloat v24 = ups_to_v16accfloat(v23);
+      v16accfloat v25 = v16accfloat(v21);
+      v16accfloat v26 = add(v24, v25);
+      v16float v27 = v16float(v26);
+      v21 = v27;
+    }
+  v20 = v21;
+  v16float v28 = shift_bytes(v20, v20, v7); // reduction step 1 of 4: shift by 32 bytes, then add
+  v16accfloat v29 = v16accfloat(v20);
+  v16accfloat v30 = v16accfloat(v28);
+  v16accfloat v31 = add(v29, v30);
+  v16float v32 = v16float(v31);
+  v16float v33 = shift_bytes(v32, v32, v6); // step 2: shift by 16 bytes, then add
+  v16accfloat v34 = v16accfloat(v32);
+  v16accfloat v35 = v16accfloat(v33);
+  v16accfloat v36 = add(v34, v35);
+  v16float v37 = v16float(v36);
+  v16float v38 = shift_bytes(v37, v37, v5); // step 3: shift by 8 bytes, then add
+  v16accfloat v39 = v16accfloat(v37);
+  v16accfloat v40 = v16accfloat(v38);
+  v16accfloat v41 = add(v39, v40);
+  v16float v42 = v16float(v41);
+  v16float v43 = shift_bytes(v42, v42, v4); // step 4: shift by 4 bytes, then add
+  v16accfloat v44 = v16accfloat(v42);
+  v16accfloat v45 = v16accfloat(v43);
+  v16accfloat v46 = add(v44, v45);
+  v16float v47 = v16float(v46);
+  float v48 = extract_elem(v47, v3); // lane 0; presumably the sum over all 16 lanes -- confirm shift_bytes semantics
+  bfloat16 v49 = getInvBf16(v48); // presumably 1 / v48 as bfloat16 (lut_based_ops.h) -- confirm
+  v32bfloat16 v50 = broadcast_to_v32bfloat16(v49);
+  v16bfloat16 v51 = extract_v16bfloat16(v50, 0);
+  v32bfloat16 v52 = broadcast_to_v32bfloat16(v9);
+  v16bfloat16 v53 = extract_v16bfloat16(v52, 0); // 16 zeros, used to pad 16-lane values to 32 lanes
+  v32bfloat16 v54 = concat(v51, v53); // broadcast reciprocal in one half, zeros in the other
+  size_t v55 = 0;
+  size_t v56 = 1024;
+  size_t v57 = 16;
+  for (size_t v58 = v55; v58 < v56; v58 += v57) // pass 3: scale each exponential by the reciprocal and store to v2
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v59 = *(v16bfloat16 *)(v1 + v58);
+      v32bfloat16 v60 = concat(v59, v53); // data padded with the zero half to match v54's width
+      v16accfloat v61 = mul_elem_16_2(v54, v60); // NOTE(review): exact lane pairing of mul_elem_16_2 is not visible here -- confirm
+      v16bfloat16 v62 = to_v16bfloat16(v61);
+      *(v16bfloat16 *)(v2 + v58) = v62;
+    }
+  return;
+}
diff --git a/test/unit_tests/aievec_tests/bf16_softmax/testbench.cc b/test/unit_tests/aievec_tests/bf16_softmax/testbench.cc
@@ -0,0 +1,62 @@
+#include "../common/testbench.h"
+#include "defines.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+void dut(bfloat16 *restrict in0, bfloat16 *restrict out0);
+void dut_ref(bfloat16 *in0, bfloat16 *out0);
+
+alignas(32) bfloat16 g_in0[IN0_SIZE];
+alignas(32) bfloat16 g_out0[OUT0_SIZE];
+alignas(32) bfloat16 g_out0Ref[OUT0_SIZE];
+
+// Test driver: generates a deterministic random input, runs and times the
+// kernel under test, computes the scalar reference, and compares the two.
+// Prints "TEST PASSED" or "TEST FAILED"; returns 0 on success, 1 on failure.
+int main(int argc, char *argv[]) {
+  std::string dataDir(TO_STR(DATA_DIR));
+  srand(10); // fixed seed so every run sees the same input
+  std::generate(g_in0, g_in0 + IN0_SIZE,
+                [&]() { return random_bfloat16(-2, 0, 2); });
+
+  writeData(g_in0, IN0_SIZE, dataDir + "/in0.txt");
+
+  // dut() stores its exponentials back into its first argument, so g_in0 no
+  // longer holds the generated input once the kernel has run. Snapshot the
+  // input now so the reference is computed from the same data the kernel
+  // started with. `static` keeps the buffer off the stack.
+  alignas(32) static bfloat16 refIn0[IN0_SIZE];
+  std::copy(g_in0, g_in0 + IN0_SIZE, refIn0);
+
+  // Only the kernel call sits between the two cycle-counter reads.
+  chess_memory_fence();
+  auto cyclesBegin = chess_cycle_count();
+  dut(g_in0, g_out0);
+  auto cyclesEnd = chess_cycle_count();
+  chess_memory_fence();
+
+  auto cycleCount = (int)(cyclesEnd - cyclesBegin);
+  reportCycleCount(cycleCount, dataDir + "/cycle_count.txt");
+
+  writeData(g_out0, OUT0_SIZE, dataDir + "/out0.txt");
+
+  // Reference runs on the pristine snapshot, not on the clobbered g_in0.
+  dut_ref(refIn0, g_out0Ref);
+  writeData(g_out0Ref, OUT0_SIZE, dataDir + "/out0_ref.txt");
+
+  // NOTE(review): softmax outputs average 1 / OUT0_SIZE (about 1e-3), so a
+  // 1e-2 tolerance may be too loose to catch real mismatches -- confirm how
+  // checkData interprets its last two arguments.
+  bool ok = true;
+  ok &= checkData(g_out0, g_out0Ref, OUT0_SIZE, 0, 1e-2, 1e-2);
+
+  if (ok)
+    printf("TEST PASSED\n");
+  else
+    printf("TEST FAILED\n");
+
+  return ok ? 0 : 1;
+}
+
+// Scalar reference softmax (no max subtraction):
+//   out0[k] = exp(in0[k]) / sum_j exp(in0[j])
+// Each exponential is rounded to bfloat16 before it is summed, and the
+// reciprocal of the sum is rounded to bfloat16 before the final multiply.
+// The exponentials are staged in out0, so in0 is only read and the caller's
+// input buffer is left untouched.
+void dut_ref(bfloat16 *in0, bfloat16 *out0) {
+  float sum = 0.0f;
+
+  for (unsigned k = 0; k < IN0_SIZE; ++k) {
+    float in = in0[k];
+    float out = exp(in);
+    out0[k] = (bfloat16)out;
+    // Accumulate the bfloat16-rounded value, not the full-precision one.
+    sum += out0[k];
+  }
+
+  bfloat16 sum_inv = (bfloat16)(1.0f / sum);
+  for (unsigned k = 0; k < IN0_SIZE; ++k) {
+    out0[k] = out0[k] * sum_inv;
+  }
+}
diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir b/test/unit_tests/aievec_tests/bf16_softmax_2/bf16_softmax.mlir
@@ -0,0 +1,40 @@
+// RUN: aie-opt %s -affine-super-vectorize="virtual-vector-size=16 test-fastest-varying=0 vectorize-reductions=true" --convert-vector-to-aievec="aie-target=aieml" -lower-affine | aie-translate -aieml=true --aievec-to-cpp -o dut.cc
+// RUN: xchesscc_wrapper aie2 -f -g +s +w work +o work -I%S -I%aie_runtime_lib%/AIE2 -L%aie_runtime_lib%/AIE2 -llut_based_ops -I %aietools/include -D__AIEARCH__=20 -D__AIENGINE__ -I. %S/testbench.cc dut.cc
+// RUN: mkdir -p data
+// RUN: xca_udm_dbg --aiearch aie-ml -qf -T -P %aietools/data/aie_ml/lib/ -t "%S/../profiling.tcl ./work/a.out" >& xca_udm_dbg.stdout
+// RUN: FileCheck --input-file=./xca_udm_dbg.stdout %s
+// CHECK: TEST PASSED
+module {
+  func.func @dut(%arg0: memref<1024xbf16>, %arg1: memref<1024xbf16>) { // softmax with max subtraction: %arg0 is the input (overwritten in place below), %arg1 receives the result
+    %cst = arith.constant 0.000000e+00 : f32 // initial value of the sum accumulator
+    %cst_0 = arith.constant 1.000000e+00 : f32 // numerator of the reciprocal 1 / sum
+    %cst_1 = arith.constant 0.000000e+00 : bf16 // padding value for vector.transfer_read
+    %cst_2 = arith.constant dense<0xFF80> : vector<32xbf16> // seed of the running max; bit pattern 0xFF80 is negative infinity in bf16
+    %0 = affine.for %arg2 = 0 to 1024 step 32 iter_args(%arg3 = %cst_2) -> (vector<32xbf16>) { // pass 1: element-wise running max over 32-wide chunks
+      %5 = vector.transfer_read %arg0[%arg2], %cst_1 : memref<1024xbf16>, vector<32xbf16>
+      %6 = arith.maxf %arg3, %5 : vector<32xbf16>
+      affine.yield %6 : vector<32xbf16>
+    }
+    %1 = vector.reduction <maxf>, %0 : vector<32xbf16> into bf16 // collapse the 32 lanes into the scalar maximum
+    affine.for %arg2 = 0 to 1024 { // pass 2: %arg0[i] = exp(%arg0[i] - max)
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.subf %5, %1 : bf16
+      %7 = math.exp %6 : bf16
+      affine.store %7, %arg0[%arg2] : memref<1024xbf16>
+    }
+    %2 = affine.for %arg2 = 0 to 1024 iter_args(%arg3 = %cst) -> (f32) { // pass 3: sum the exponentials, accumulating in f32
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.extf %5 : bf16 to f32
+      %7 = arith.addf %arg3, %6 : f32
+      affine.yield %7 : f32
+    }
+    %3 = arith.divf %cst_0, %2 : f32 // 1 / sum, computed once outside the loops
+    %4 = arith.truncf %3 : f32 to bf16 // the reciprocal is narrowed to bf16 before the final multiply
+    affine.for %arg2 = 0 to 1024 { // pass 4: %arg1[i] = %arg0[i] * (1 / sum)
+      %5 = affine.load %arg0[%arg2] : memref<1024xbf16>
+      %6 = arith.mulf %5, %4 : bf16
+      affine.store %6, %arg1[%arg2] : memref<1024xbf16>
+    }
+    return
+  }
+}
diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/defines.h b/test/unit_tests/aievec_tests/bf16_softmax_2/defines.h
@@ -0,0 +1,3 @@
+#pragma once
+// Element count of the kernel's input buffer (in0).
+constexpr unsigned IN0_SIZE = 1024;
+// Element count of the kernel's output buffer (out0).
+constexpr unsigned OUT0_SIZE = 1024;
diff --git a/test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc b/test/unit_tests/aievec_tests/bf16_softmax_2/dut.cc
@@ -0,0 +1,108 @@
+// Cycle count: 3712
+#include "lut_based_ops.h"
+
+void dut(bfloat16 *restrict v1, bfloat16 *restrict v2) { // softmax kernel with max subtraction: v1 is read and overwritten in place, v2 receives the normalized result
+  int32_t v3 = 0; // lane index handed to extract_elem after each reduction
+  int32_t v4 = 2; // v4..v8: byte amounts for the halving shift-based reductions below
+  int32_t v5 = 4;
+  int32_t v6 = 8;
+  int32_t v7 = 16;
+  int32_t v8 = 32;
+  v16float v9 = broadcast_zero_float(); // initial value of the running-sum vector
+  bfloat16 v10 = 0.0e+00; // scalar zero, broadcast later to build the padding half v80
+  v32bfloat16 v11 = broadcast_to_v32bfloat16(
+      (bfloat16)-338953138925153547590470800371487866880.000000); // very large negative seed (about -3.39e38) for the running max
+  size_t v12 = 0;
+  size_t v13 = 1024;
+  size_t v14 = 32;
+  v32bfloat16 v15; // receives the final running-max vector after the loop
+  v32bfloat16 v16 = v11; // loop-carried running max
+  for (size_t v17 = v12; v17 < v13; v17 += v14) // pass 1: 1024 / 32 = 32 iterations, matching chess_loop_range(32, 32)
+    chess_prepare_for_pipelining chess_loop_range(32, 32) {
+      v32bfloat16 v18 = *(v32bfloat16 *)(v1 + v17);
+      v32bfloat16 v19 = max(v16, v18);
+      v16 = v19;
+    }
+  v15 = v16;
+  v32bfloat16 v20 = shift_bytes(v15, v15, v8); // max reduction, 5 steps: shift by 32, 16, 8, 4, 2 bytes, taking max after each
+  v32bfloat16 v21 = max(v15, v20);
+  v32bfloat16 v22 = shift_bytes(v21, v21, v7);
+  v32bfloat16 v23 = max(v21, v22);
+  v32bfloat16 v24 = shift_bytes(v23, v23, v6);
+  v32bfloat16 v25 = max(v23, v24);
+  v32bfloat16 v26 = shift_bytes(v25, v25, v5);
+  v32bfloat16 v27 = max(v25, v26);
+  v32bfloat16 v28 = shift_bytes(v27, v27, v4);
+  v32bfloat16 v29 = max(v27, v28);
+  bfloat16 v30 = extract_elem(v29, v3); // lane 0; presumably the maximum over all 32 lanes -- confirm shift_bytes semantics
+  v32bfloat16 v31 = broadcast_to_v32bfloat16(v30);
+  v16bfloat16 v32 = extract_v16bfloat16(v31, 0);
+  v16accfloat v33 = ups_to_v16accfloat(v32); // broadcast maximum converted to accumulator form for the subtraction below
+  size_t v34 = 0;
+  size_t v35 = 1024;
+  size_t v36 = 16;
+  for (size_t v37 = v34; v37 < v35; v37 += v36) // pass 2: subtract the maximum, exponentiate, store back into v1
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v38 = *(v16bfloat16 *)(v1 + v37);
+      v16accfloat v39 = ups_to_v16accfloat(v38);
+      v16accfloat v40 = sub(v39, v33);
+      v16bfloat16 v41 = to_v16bfloat16(v40);
+      v16accfloat v42 = getExpBf16(v41); // presumably element-wise exp provided by lut_based_ops.h -- confirm
+      v16bfloat16 v43 = to_v16bfloat16(v42);
+      *(v16bfloat16 *)(v1 + v37) = v43; // result is stored back over the input buffer
+    }
+  size_t v44 = 0;
+  size_t v45 = 1024;
+  size_t v46 = 16;
+  v16float v47; // receives the final accumulator value after the loop
+  v16float v48 = v9; // loop-carried accumulator, starts at zero
+  for (size_t v49 = v44; v49 < v45; v49 += v46) // pass 3: accumulate the exponentials written by pass 2
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v50 = *(v16bfloat16 *)(v1 + v49);
+      v16accfloat v51 = ups_to_v16accfloat(v50);
+      v16accfloat v52 = v16accfloat(v48);
+      v16accfloat v53 = add(v51, v52);
+      v16float v54 = v16float(v53);
+      v48 = v54;
+    }
+  v47 = v48;
+  v16float v55 = shift_bytes(v47, v47, v8); // sum reduction step 1 of 4: shift by 32 bytes, then add
+  v16accfloat v56 = v16accfloat(v47);
+  v16accfloat v57 = v16accfloat(v55);
+  v16accfloat v58 = add(v56, v57);
+  v16float v59 = v16float(v58);
+  v16float v60 = shift_bytes(v59, v59, v7); // step 2: shift by 16 bytes, then add
+  v16accfloat v61 = v16accfloat(v59);
+  v16accfloat v62 = v16accfloat(v60);
+  v16accfloat v63 = add(v61, v62);
+  v16float v64 = v16float(v63);
+  v16float v65 = shift_bytes(v64, v64, v6); // step 3: shift by 8 bytes, then add
+  v16accfloat v66 = v16accfloat(v64);
+  v16accfloat v67 = v16accfloat(v65);
+  v16accfloat v68 = add(v66, v67);
+  v16float v69 = v16float(v68);
+  v16float v70 = shift_bytes(v69, v69, v5); // step 4: shift by 4 bytes, then add
+  v16accfloat v71 = v16accfloat(v69);
+  v16accfloat v72 = v16accfloat(v70);
+  v16accfloat v73 = add(v71, v72);
+  v16float v74 = v16float(v73);
+  float v75 = extract_elem(v74, v3); // lane 0; presumably the sum over all 16 lanes -- confirm shift_bytes semantics
+  bfloat16 v76 = getInvBf16(v75); // presumably 1 / v75 as bfloat16 (lut_based_ops.h) -- confirm
+  v32bfloat16 v77 = broadcast_to_v32bfloat16(v76);
+  v16bfloat16 v78 = extract_v16bfloat16(v77, 0);
+  v32bfloat16 v79 = broadcast_to_v32bfloat16(v10);
+  v16bfloat16 v80 = extract_v16bfloat16(v79, 0); // 16 zeros, used to pad 16-lane values to 32 lanes
+  v32bfloat16 v81 = concat(v78, v80); // broadcast reciprocal in one half, zeros in the other
+  size_t v82 = 0;
+  size_t v83 = 1024;
+  size_t v84 = 16;
+  for (size_t v85 = v82; v85 < v83; v85 += v84) // pass 4: scale each exponential by the reciprocal and store to v2
+    chess_prepare_for_pipelining chess_loop_range(64, 64) {
+      v16bfloat16 v86 = *(v16bfloat16 *)(v1 + v85);
+      v32bfloat16 v87 = concat(v86, v80); // data padded with the zero half to match v81's width
+      v16accfloat v88 = mul_elem_16_2(v81, v87); // NOTE(review): exact lane pairing of mul_elem_16_2 is not visible here -- confirm
+      v16bfloat16 v89 = to_v16bfloat16(v88);
+      *(v16bfloat16 *)(v2 + v85) = v89;
+    }
+  return;
+}