From 0a91fd62b17cfcc2c5895985cab928d56f239b54 Mon Sep 17 00:00:00 2001
From: Sasha Lopoukhine <superlopuh@gmail.com>
Date: Sat, 4 Nov 2023 12:23:07 +0000
Subject: [PATCH 1/2] add linalg.fill

---
 kernels/fill/16x16xf64/Makefile     |   9 +
 kernels/fill/16x16xf64/baseline.c   |  14 ++
 kernels/fill/16x16xf64/baseline.csv |   1 +
 kernels/fill/16x16xf64/cycles.csv   |   2 +
 kernels/fill/16x16xf64/data.c       | 264 ++++++++++++++++++++++++++++
 kernels/fill/16x16xf64/data.h       |   7 +
 kernels/fill/16x16xf64/linalg.csv   |   1 +
 kernels/fill/16x16xf64/linalg.mlir  |   6 +
 kernels/fill/16x16xf64/main.c       |  36 ++++
 kernels/fill/gendata.py             | 121 +++++++++++++
 snitch/Makefile.rules               |   1 +
 xdsl                                |   2 +-
 12 files changed, 463 insertions(+), 1 deletion(-)
 create mode 100644 kernels/fill/16x16xf64/Makefile
 create mode 100644 kernels/fill/16x16xf64/baseline.c
 create mode 100644 kernels/fill/16x16xf64/baseline.csv
 create mode 100644 kernels/fill/16x16xf64/cycles.csv
 create mode 100644 kernels/fill/16x16xf64/data.c
 create mode 100644 kernels/fill/16x16xf64/data.h
 create mode 100644 kernels/fill/16x16xf64/linalg.csv
 create mode 100644 kernels/fill/16x16xf64/linalg.mlir
 create mode 100644 kernels/fill/16x16xf64/main.c
 create mode 100644 kernels/fill/gendata.py

diff --git a/kernels/fill/16x16xf64/Makefile b/kernels/fill/16x16xf64/Makefile
new file mode 100644
index 00000000..26ef0923
--- /dev/null
+++ b/kernels/fill/16x16xf64/Makefile
@@ -0,0 +1,9 @@
+.DEFAULT_GOAL := all
+
+include ../../../snitch/Makefile.rules
+
+TESTS =
+TESTS += baseline.x
+TESTS += linalg.x
+
+include ../../Makefile.kernels
diff --git a/kernels/fill/16x16xf64/baseline.c b/kernels/fill/16x16xf64/baseline.c
new file mode 100644
index 00000000..3be19b6f
--- /dev/null
+++ b/kernels/fill/16x16xf64/baseline.c
@@ -0,0 +1,14 @@
+#include "data.h"
+
+#include <snrt.h>
+
+#include <stdint.h>
+
+void fill(const double x, double* y) {  // Baseline kernel: store x into all M * N elements of y.
+    for (uint32_t i = 0; i < M; ++i) {
+        for (uint32_t j = 0; j < N; ++j) {
+            // Row-major indexing: element (i, j) lives at offset i * N + j.
+            y[i * N + j] = x;
+        }
+    }
+}
diff --git a/kernels/fill/16x16xf64/baseline.csv b/kernels/fill/16x16xf64/baseline.csv
new file mode 100644
index 00000000..5b0cffbc
--- /dev/null
+++ b/kernels/fill/16x16xf64/baseline.csv
@@ -0,0 +1 @@
+370
diff --git a/kernels/fill/16x16xf64/cycles.csv b/kernels/fill/16x16xf64/cycles.csv
new file mode 100644
index 00000000..9b648834
--- /dev/null
+++ b/kernels/fill/16x16xf64/cycles.csv
@@ -0,0 +1,2 @@
+baseline,370
+linalg,37
diff --git a/kernels/fill/16x16xf64/data.c b/kernels/fill/16x16xf64/data.c
new file mode 100644
index 00000000..7f20063f
--- /dev/null
+++ b/kernels/fill/16x16xf64/data.c
@@ -0,0 +1,264 @@
+#define M 16
+#define N 16
+
+const double X = 4.0;
+
+const double Y[M * N] = {
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+4.,
+ 4.,
+4.,
+4.,
+4.
+};
+
diff --git a/kernels/fill/16x16xf64/data.h b/kernels/fill/16x16xf64/data.h
new file mode 100644
index 00000000..d7fe28a2
--- /dev/null
+++ b/kernels/fill/16x16xf64/data.h
@@ -0,0 +1,7 @@
+#pragma once
+
+#define M 16
+#define N 16
+
+extern const double X;         // value main.c passes to fill()
+extern const double Y[M * N];  // expected contents of the output after fill()
diff --git a/kernels/fill/16x16xf64/linalg.csv b/kernels/fill/16x16xf64/linalg.csv
new file mode 100644
index 00000000..81b5c5d0
--- /dev/null
+++ b/kernels/fill/16x16xf64/linalg.csv
@@ -0,0 +1 @@
+37
diff --git a/kernels/fill/16x16xf64/linalg.mlir b/kernels/fill/16x16xf64/linalg.mlir
new file mode 100644
index 00000000..0af39d07
--- /dev/null
+++ b/kernels/fill/16x16xf64/linalg.mlir
@@ -0,0 +1,6 @@
+
+func.func public @fill(%X: f64,
+                       %Y: tensor<16x16xf64>) -> () {
+  %res = linalg.fill ins(%X : f64) outs(%Y : tensor<16x16xf64>) -> tensor<16x16xf64>
+  return
+}
diff --git a/kernels/fill/16x16xf64/main.c b/kernels/fill/16x16xf64/main.c
new file mode 100644
index 00000000..bf5023c2
--- /dev/null
+++ b/kernels/fill/16x16xf64/main.c
@@ -0,0 +1,36 @@
+#include "data.h"
+
+#include <snrt.h>
+
+#include <math.h>
+
+// Kernel provided via external definition
+void fill(double x, double *y);
+
+int main() {  // Test harness: run fill() once on core 0 and verify against Y.
+    // Set up the kernel arguments.
+    // No allocator is involved: every core reads the same base pointer from
+    // snrt_l1_next(), so all cores in this cluster refer to the same memory
+    // region for the output buffer.
+    double local_x = X;
+    double *local_y = (double *)snrt_l1_next();
+
+    snrt_cluster_hw_barrier();
+
+    // Launch kernel: from this point on only core 0 is required to be alive.
+    int thiscore = snrt_cluster_core_idx();
+    if (thiscore != 0) return 0;
+
+    (void)snrt_mcycle();  // presumably marks the start of the timed region
+    fill(local_x, local_y);
+    (void)snrt_mcycle();  // presumably marks the end of the timed region
+
+    // Correctness check: count elements differing from Y by more than 1e-2.
+    int nerr = 0;
+    for (int i = 0; i < M * N; i++) {
+        double d = fabs(local_y[i] - Y[i]);
+        nerr += !(d <= 1E-2f);  // Negated <= so that a NaN difference also counts
+                                // as an error (NaN <= t is false).
+    }
+    return nerr;  // 0 when every element matched
+}
diff --git a/kernels/fill/gendata.py b/kernels/fill/gendata.py
new file mode 100644
index 00000000..2846452f
--- /dev/null
+++ b/kernels/fill/gendata.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import numpy as np
+import argparse
+import sys
+
+
+C_TYPES = {  # str(--precision) -> C element type
+    "32": "float",
+    "64": "double",
+}
+
+NUMPY_TYPES = {  # NOTE(review): not referenced anywhere in this script
+    "32": np.single,
+    "64": np.double,
+}
+
+MLIR_TYPES = {  # str(--precision) -> MLIR element type
+    "32": "f32",
+    "64": "f64",
+}
+
+MEMREF_GLOBAL = """
+memref.global constant @{symbol} : memref<{shape}x{type}> = dense<[
+{initializer}
+]>
+"""  # str.format fields: symbol, shape, type, initializer
+
+
+ARRAY_GLOBAL = """
+const {type} {symbol}[{shape}] = {{
+{initializer}
+}};
+"""  # str.format fields: type, symbol, shape, initializer
+
+
+def array_to_memref_initializer(array: np.array):  # one bracketed row per line
+    return ",\n".join(f"  {np.array2string(row, separator=', ')}" for row in array)
+
+
+def array_to_memref(array: np.array, precision: int, shape=None, symbol=None):
+    return MEMREF_GLOBAL.format(  # render a memref.global constant for array
+        symbol=symbol or "array",  # falls back to the name "array"
+        type=MLIR_TYPES[str(precision)],
+        shape=shape or "x".join(str(dim) for dim in array.shape),  # e.g. "16x16"
+        initializer=array_to_memref_initializer(array),
+    )
+
+
+def array_to_c_initializer(array: np.array):  # flattened, one element per line
+    return np.array2string(array.flatten(), separator=",\n").strip(" []")
+
+
+def array_to_c(array: np.array, *, precision: int, shape=None, symbol=None):
+    return ARRAY_GLOBAL.format(  # render a const C array definition for array
+        symbol=symbol or "array",  # falls back to the name "array"
+        type=C_TYPES[str(precision)],
+        shape=shape or "*".join(str(dim) for dim in array.shape),  # e.g. "16*16"
+        initializer=array_to_c_initializer(array),
+    )
+
+
+if __name__ == "__main__":  # CLI entry point: print the generated data to stdout
+    parser = argparse.ArgumentParser(
+        prog="gendata.py",
+        description="Generate the fill value X and the expected output Y "
+        "(an M x N array with every element equal to X) for the fill kernel",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "-r",
+        "--range",  # not used by fill; kept so existing invocations still parse
+        type=float,
+        nargs=2,
+        default=(-1000.0, 1000.0),
+        help="uniform distribution range",
+    )
+    parser.add_argument("-m", "--rows", type=int, default=16, help="number of rows")
+    parser.add_argument(
+        "-n", "--columns", type=int, default=16, help="number of columns"
+    )
+    parser.add_argument(
+        "-k",
+        "--inner-dimension",  # not used by fill; kept so existing invocations parse
+        type=int,
+        default=16,
+        help="size of inner dimension",
+    )
+    parser.add_argument(
+        "--format", default="c", choices=["mlir", "c"], help="output format"
+    )
+    parser.add_argument(
+        "--precision",
+        type=int,
+        default=64,
+        choices=[32, 64],
+        help="floating-point precision to use",
+    )
+    args = parser.parse_args()
+
+    m = args.rows
+    n = args.columns
+
+    val = 4.0  # chosen by fair dice roll. guaranteed to be random
+
+    y = np.full((m, n), val)  # expected result: every element equals val
+
+    printopts = {"linewidth": None, "threshold": sys.maxsize}
+    if args.format == "c":
+        fmt = array_to_c
+        print(f"#define M {m}")
+        print(f"#define N {n}")
+        # data.h declares X and main.c passes it to the kernel, so define it here.
+        print(f"\nconst {C_TYPES[str(args.precision)]} X = {val};")
+        printopts["formatter"] = {"double ": lambda x: f"{x:+}f"}
+    else:
+        assert args.format == "mlir"
+        fmt = array_to_memref
+        printopts["sign"] = "+"
+    np.set_printoptions(**printopts)
+    print(fmt(y, shape="M * N", precision=args.precision, symbol="Y"))
diff --git a/snitch/Makefile.rules b/snitch/Makefile.rules
index 997f94ee..546350f0 100644
--- a/snitch/Makefile.rules
+++ b/snitch/Makefile.rules
@@ -137,6 +137,7 @@ XDSLOPTFLAGS += -t riscv-asm
 
 MLIROPTFLAGS =
 MLIROPTFLAGS += -opaque-pointers=0
+MLIROPTFLAGS += --func-bufferize
 MLIROPTFLAGS += --convert-linalg-to-loops
 MLIROPTFLAGS += --convert-scf-to-cf
 MLIROPTFLAGS += --canonicalize
diff --git a/xdsl b/xdsl
index bd0de928..6a9d83ca 160000
--- a/xdsl
+++ b/xdsl
@@ -1 +1 @@
-Subproject commit bd0de9285a21d5ca081c79710fc1d5eda0a289d1
+Subproject commit 6a9d83ca561182afb9135daa5cca5e2bf8397158

From a0d46eb3437efcb25dd325c6c45846f485db44a3 Mon Sep 17 00:00:00 2001
From: Sasha Lopoukhine <superlopuh@gmail.com>
Date: Sat, 4 Nov 2023 16:29:22 +0000
Subject: [PATCH 2/2] bufferization is a lie

---
 kernels/fill/16x16xf64/linalg.csv  | 2 +-
 kernels/fill/16x16xf64/linalg.mlir | 4 ++--
 snitch/Makefile.rules              | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernels/fill/16x16xf64/linalg.csv b/kernels/fill/16x16xf64/linalg.csv
index 81b5c5d0..5b0cffbc 100644
--- a/kernels/fill/16x16xf64/linalg.csv
+++ b/kernels/fill/16x16xf64/linalg.csv
@@ -1 +1 @@
-37
+370
diff --git a/kernels/fill/16x16xf64/linalg.mlir b/kernels/fill/16x16xf64/linalg.mlir
index 0af39d07..f3586edc 100644
--- a/kernels/fill/16x16xf64/linalg.mlir
+++ b/kernels/fill/16x16xf64/linalg.mlir
@@ -1,6 +1,6 @@
 
 func.func public @fill(%X: f64,
-                       %Y: tensor<16x16xf64>) -> () {
-  %res = linalg.fill ins(%X : f64) outs(%Y : tensor<16x16xf64>) -> tensor<16x16xf64>
+                       %Y: memref<16x16xf64>) -> () {
+  linalg.fill ins(%X : f64) outs(%Y : memref<16x16xf64>) -> ()
   return
 }
diff --git a/snitch/Makefile.rules b/snitch/Makefile.rules
index 546350f0..997f94ee 100644
--- a/snitch/Makefile.rules
+++ b/snitch/Makefile.rules
@@ -137,7 +137,6 @@ XDSLOPTFLAGS += -t riscv-asm
 
 MLIROPTFLAGS =
 MLIROPTFLAGS += -opaque-pointers=0
-MLIROPTFLAGS += --func-bufferize
 MLIROPTFLAGS += --convert-linalg-to-loops
 MLIROPTFLAGS += --convert-scf-to-cf
 MLIROPTFLAGS += --canonicalize