From 96adfab914679c28b00950d4d41a210980a5c8a5 Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <nirvedh@gmail.com>
Date: Wed, 17 Jul 2024 10:56:40 -0600
Subject: [PATCH] Add e2e tests that model switching costs

---
 build_tools/ci/cpu_comparison/run_test.sh     |  9 +++--
 .../test_files/matmul_f32_8_4_8.mlir          | 26 ++++++++++++++
 .../test_files/matmul_f32_8_8_4.mlir          | 32 +++++++++++++++++
 .../test_files/two_matmul_switching.mlir      | 35 +++++++++++++++++++
 4 files changed, 99 insertions(+), 3 deletions(-)
 create mode 100644 build_tools/ci/cpu_comparison/test_files/matmul_f32_8_4_8.mlir
 create mode 100644 build_tools/ci/cpu_comparison/test_files/matmul_f32_8_8_4.mlir
 create mode 100644 build_tools/ci/cpu_comparison/test_files/two_matmul_switching.mlir

diff --git a/build_tools/ci/cpu_comparison/run_test.sh b/build_tools/ci/cpu_comparison/run_test.sh
index 9e255375b..85620c6f3 100755
--- a/build_tools/ci/cpu_comparison/run_test.sh
+++ b/build_tools/ci/cpu_comparison/run_test.sh
@@ -148,8 +148,6 @@ fi
 
 
 source $XRT_DIR/setup.sh
-# Circumvent xclbin security (no longer needed as of April 2024 XDNA driver)
-export XRT_HACK_UNSECURE_LOADING_XCLBIN=1
 
 cd ${OUTPUT_DIR}
 
@@ -329,7 +327,7 @@ function run_test() {
       --iree-amd-aie-vitis-install-dir=${vitis_path} \
       --iree-hal-dump-executable-files-to=$PWD \
       --iree-amd-aie-show-invoked-commands \
-      --mlir-disable-threading -o ${aie_vmfb}"
+      --iree-scheduling-optimize-bindings=false -o ${aie_vmfb}"
 
 
   # TODO(newling) The following logic is copied from run_matmul_test.sh,
@@ -406,6 +404,11 @@ run_test --test_file ${THIS_DIR}/test_files/matmul_int32.mlir
 # An example of an arbitrary graph with three matmuls which form three dispatches.
 run_test --test_file ${THIS_DIR}/test_files/three_matmuls.mlir --function 'three_$mm$'
 
+# Tests that model kernel switching costs.
+run_test --test_file ${THIS_DIR}/test_files/two_matmul_switching.mlir
+run_test --test_file ${THIS_DIR}/test_files/matmul_f32_8_8_4.mlir
+run_test --test_file ${THIS_DIR}/test_files/matmul_f32_8_4_8.mlir
+
 # Example of generating a matmul test from a template, and then running it.
 test_name=${OUTPUT_DIR}/test_from_template.mlir
 matmul_template_dir=${THIS_DIR}/matmul_template
diff --git a/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_4_8.mlir b/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_4_8.mlir
new file mode 100644
index 000000000..1f0ad5109
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_4_8.mlir
@@ -0,0 +1,26 @@
+// This test is a useful baseline for `two_matmul_switching`: here no switching happens,
+// and we successively call the same matmul.
+
+// These 2 lines are required by the script which generates input data:
+//
+// input 8x8xf32
+// input 8x4xf32
+
+!A_TYPE = tensor<8x8xf32>
+!B_TYPE = tensor<8x4xf32>
+!C_TYPE = tensor<8x4xf32>
+func.func @matmul_8_4_8(%lhs : !A_TYPE,
+    %rhs : !B_TYPE) -> !C_TYPE {  // Chains four matmuls of identical shape; %lhs is the left operand of each.
+  %empty = tensor.empty() : !C_TYPE
+  %cst = arith.constant 0.0 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : !C_TYPE) -> !C_TYPE  // Zero-filled init reused as `outs` by all four matmuls.
+  %1 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %1 = %lhs x %rhs (the only use of %rhs).
+  %2 = linalg.matmul ins(%lhs, %1 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %2 = %lhs x %1: the previous result becomes the right operand.
+  %3 = linalg.matmul ins(%lhs, %2 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %3 = %lhs x %2.
+  %4 = linalg.matmul ins(%lhs, %3 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %4 = %lhs x %3.
+  return %4 : !C_TYPE
+}
diff --git a/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_8_4.mlir b/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_8_4.mlir
new file mode 100644
index 000000000..9427515c7
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/test_files/matmul_f32_8_8_4.mlir
@@ -0,0 +1,32 @@
+// This test is a useful baseline for `two_matmul_switching`: here no switching happens,
+// and we successively call the same matmul.
+
+// These 2 lines are required by the script which generates input data:
+//
+// input 8x4xf32
+// input 4x8xf32
+
+!A_TYPE = tensor<8x4xf32>
+!B_TYPE = tensor<4x8xf32>
+!C_TYPE = tensor<8x8xf32>
+func.func @matmul_8_8_4(%lhs : !A_TYPE,
+    %rhs : !B_TYPE) -> !C_TYPE {  // Chains four matmuls of identical shape; %lhs is the left operand of each.
+  %empty = tensor.empty() : !C_TYPE
+  %cst = arith.constant 0.0 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : !C_TYPE) -> !C_TYPE  // Zero-filled init reused as `outs` by all four matmuls.
+  %1 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %1 = %lhs x %rhs (the only use of %rhs).
+  %slice1 = tensor.extract_slice %1[0, 0][4, 8][1, 1] :
+  !C_TYPE to !B_TYPE  // Offsets [0, 0], sizes [4, 8], unit strides: cuts %1 down to the right-operand type.
+  %2 = linalg.matmul ins(%lhs, %slice1 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %2 = %lhs x %slice1.
+  %slice2 = tensor.extract_slice %2[0, 0][4, 8][1, 1] :
+  !C_TYPE to !B_TYPE  // Same slicing applied to %2.
+  %3 = linalg.matmul ins(%lhs, %slice2 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %3 = %lhs x %slice2.
+  %slice3 = tensor.extract_slice %3[0, 0][4, 8][1, 1] :
+  !C_TYPE to !B_TYPE  // Same slicing applied to %3.
+  %4 = linalg.matmul ins(%lhs, %slice3 : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // %4 = %lhs x %slice3.
+  return %4 : !C_TYPE
+}
diff --git a/build_tools/ci/cpu_comparison/test_files/two_matmul_switching.mlir b/build_tools/ci/cpu_comparison/test_files/two_matmul_switching.mlir
new file mode 100644
index 000000000..3bec22115
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/test_files/two_matmul_switching.mlir
@@ -0,0 +1,35 @@
+// This test alternates between two differently shaped matmuls and is useful for modeling the switching cost.
+
+// These 2 lines are required by the script which generates input data:
+//
+// input 8x4xf32
+// input 4x8xf32
+
+!A_TYPE = tensor<8x4xf32>
+!B_TYPE = tensor<4x8xf32>
+!C_TYPE = tensor<8x8xf32>
+func.func @matmul_small(%lhs : !A_TYPE,
+    %rhs : !B_TYPE) -> !A_TYPE {  // Eight chained matmuls that alternate between two operand/result shapes.
+  %empty = tensor.empty() : !C_TYPE
+  %empty2 = tensor.empty() : !A_TYPE
+  %cst = arith.constant 0.0 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : !C_TYPE) -> !C_TYPE  // Zero-filled init for the !C_TYPE-producing matmuls.
+  %fill2 = linalg.fill ins(%cst : f32) outs(%empty2 : !A_TYPE) -> !A_TYPE  // Zero-filled init for the !A_TYPE-producing matmuls.
+  %1 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // First shape: (!A_TYPE, !B_TYPE) -> !C_TYPE.
+  %2 = linalg.matmul ins(%1, %lhs : !C_TYPE, !A_TYPE)
+      outs(%fill2 : !A_TYPE) -> !A_TYPE  // Second shape: (!C_TYPE, !A_TYPE) -> !A_TYPE, consuming %1.
+  %3 = linalg.matmul ins(%2, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // Back to the first shape, consuming %2.
+  %4 = linalg.matmul ins(%3, %lhs : !C_TYPE, !A_TYPE)
+      outs(%fill2 : !A_TYPE) -> !A_TYPE  // Second shape, consuming %3.
+  %5 = linalg.matmul ins(%4, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // First shape, consuming %4.
+  %6 = linalg.matmul ins(%5, %lhs : !C_TYPE, !A_TYPE)
+      outs(%fill2 : !A_TYPE) -> !A_TYPE  // Second shape, consuming %5.
+  %7 = linalg.matmul ins(%6, %rhs : !A_TYPE, !B_TYPE)
+      outs(%fill : !C_TYPE) -> !C_TYPE  // First shape, consuming %6.
+  %8 = linalg.matmul ins(%7, %lhs : !C_TYPE, !A_TYPE)
+      outs(%fill2 : !A_TYPE) -> !A_TYPE  // Second shape, consuming %7; this is the returned value.
+  return %8 : !A_TYPE
+}