cornell-zhang · zzzDavid · Mar 7, 2023 · Mar 7, 2023 · Mar 7, 2023 · Mar 13, 2023
diff --git a/heterocl/ast/ir_builder.py b/heterocl/ast/ir_builder.py
@@ -7,8 +7,6 @@
 
 # Import MLIR dialects
 # Naming rule: import dialect as dialect_d
-import numpy as np
-
 from hcl_mlir.dialects import (
     func as func_d,
     hcl as hcl_d,
@@ -52,7 +50,7 @@
 
 from . import ast
 from ..context import get_context, get_location
-from ..utils import hcl_dtype_to_mlir, get_extra_type_hints
+from ..utils import hcl_dtype_to_mlir, get_extra_type_hints, make_anywidth_numpy_array
 from .. import types as htypes
 from . import build_cleaner
 
@@ -1408,63 +1406,9 @@ def build_bit_reverse_op(self, op: ast.BitReverseOp, ip):
     def build_constant_tensor_op(self, op: ast.ConstantTensorOp, ip):
         loc = Location.file(op.loc.filename, op.loc.lineno, 0)
         dtype = hcl_dtype_to_mlir(op.dtype, signless=True)
-        shape = op.values.shape
         if isinstance(op.dtype, (htypes.Int, htypes.UInt)):
-            # The following code has several steps to convert the numpy array to have
-            # the correct data type in order to create an MLIR constant tensor.
-            # Since MLIR-NumPy Python interface only supports byte-addressable data types,
-            # we need to change the data type of the array to have the minimum number of bytes
-            # that can represent the target bitwidth.
-            # e.g., hcl.const_tensor(arr, dtype=hcl.Int(20)) (6*6 array)
-            #       which requires 20 bits (3 bytes) to represent each element
-            # declaration: 6*6*i20
-            # numpy input: 6*6*i64
-            # 1. Decompose the original i32 or i64 array into a structured array of uint8
-            #  -> decompose: 6*6*8*i8
-            if op.dtype.bits == 1:
-                val = op.values
-                array = np.packbits(val, axis=None, bitorder="little")
-                value_attr = DenseElementsAttr.get(array, shape=val.shape, type=dtype)
-            else:
-                # Here we construct a customized NumPy dtype, "f0", "f1", "f2", etc.
-                # are the field names, and the entire data type is `op.values.dtype`.
-                # This can be viewed as a `union` type in C/C++.
-                # Please refer to the documentation for more details:
-                # https://numpy.org/doc/stable/reference/arrays.dtypes.html#specifying-and-constructing-data-types
-                decomposed_np_dtype = np.dtype(
-                    (
-                        op.values.dtype,
-                        {
-                            f"f{i}": (np.uint8, i)
-                            for i in range(op.values.dtype.itemsize)
-                        },
-                    )
-                )
-                val = op.values.view(decomposed_np_dtype)
-                # 2. Compose the uint8 array into a structured array of target bitwidth
-                # This is done by taking the first several bytes of the uint8 array
-                # "u1" means one unsigned byte, and "i1" means one signed byte
-                n_bytes = int(np.ceil(dtype.width / 8))
-                new_dtype = np.dtype(
-                    {
-                        "names": [f"f{i}" for i in range(n_bytes)],
-                        "formats": (["i1"] if isinstance(dtype, htypes.Int) else ["u1"])
-                        + ["u1"] * (n_bytes - 1),
-                        "offsets": list(range(n_bytes)),
-                        "itemize": n_bytes,
-                    }
-                )
-                # -> compose: 6*6*3*i8
-                val = np.stack([val[f"f{i}"] for i in range(n_bytes)], axis=-1)
-                # -> flatten: 108*i8
-                val = val.flatten()
-                # -> view: 36*i24
-                val = val.view(np.dtype(new_dtype))
-                # -> reshape: 6*6*i24
-                val = val.reshape(shape)
-                # Pass in the numpy array to get the MLIR attribute
-                # -> result: 6*6*i20
-                value_attr = DenseElementsAttr.get(val, shape=val.shape, type=dtype)
+            val = make_anywidth_numpy_array(op.values, op.dtype.bits)
+            value_attr = DenseElementsAttr.get(val, shape=op.values.shape, type=dtype)
         else:
             val = op.values
             value_attr = DenseElementsAttr.get(val)

diff --git a/heterocl/build_module.py b/heterocl/build_module.py
@@ -337,13 +337,12 @@ def attach_llvm_attrs(module):
         hcl_d.lower_composite_type(module)
         hcl_d.lower_fixed_to_int(module)
         hcl_d.lower_print_ops(module)
-        hcl_d.lower_anywidth_int(module)
+        # hcl_d.lower_anywidth_int(module)  # FIXME: disabled; note below is stale while off
         # Note: lower_any_width_int should precede
         # move_return_to_input, because it uses input/output
         # type hints.
         hcl_d.move_return_to_input(module)
         hcl_d.lower_bit_ops(module)
-        # print(module)
         hcl_d.legalize_cast(module)
         hcl_d.remove_stride_map(module)
         pipeline = "lower-affine,func.func(buffer-loop-hoisting)"

diff --git a/heterocl/module.py b/heterocl/module.py
@@ -110,7 +110,7 @@ def __call__(self, *argv):
                                 argv[len(op.arguments) + i].np_array = np.pad(
                                     argv[len(op.arguments) + i].np_array, pad_shape
                                 )
-            execute_llvm_backend(self.src, self.name, self.return_num, *argv)
+            execute_llvm_backend(self.src, self.name, *argv)
             for res, shape in original_results:
                 slicing = []
                 for s in shape:

diff --git a/heterocl/runtime.py b/heterocl/runtime.py
@@ -7,11 +7,20 @@
 import subprocess
 import ctypes
 import time
-import numpy as np
+import warnings
 
 from hcl_mlir import runtime as rt
 from .report import parse_xml
 
+# Filter out the warning from numpy when using ctypes array as numpy array.
+# This is a Python bug, see:
+# https://stackoverflow.com/questions/4964101/pep-3118-warning-when-using-ctypes-array-as-numpy-array
+warnings.filterwarnings(
+    "ignore",
+    category=RuntimeWarning,
+    message="A builtin ctypes object gave a PEP3118 format string that does not match its itemsize*",
+)
+
 
 def run_process(cmd, pattern=None):
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
@@ -111,32 +120,28 @@ def execute_fpga_backend(target, shell=True):
         raise RuntimeError("Not implemented")
 
 
-def execute_llvm_backend(execution_engine, name, return_num, *argv):
+def execute_llvm_backend(execution_engine, name, *argv):
     """
-    - execution_engine: mlir.ExecutionEngine object, created in hcl.build
-    - name: str, device top-level function name
-    - return_num: int, the number of return values
-    - argv: list-like object, a list of input and output variables
+    Execute LLVM backend. Assume all return args have been moved to
+    input args.
+
+    Parameters
+    ----------
+    execution_engine: mlir.ExecutionEngine
+        JIT object, created in hcl.build
+    name: str
+        device top-level function name
+    argv: list-like object
+        a list of input and output variables
     """
     if not isinstance(argv, list):
         argv = list(argv)
+
     # Unwrap hcl Array to get numpy arrays
     argv_np = [arg.unwrap() for arg in argv]
-    # Extract output arrays
-    return_args = argv_np[-return_num:]
-    # Convert output variables from numpy arrays to memref pointers
-    return_pointers = []
-    for arg in return_args:
-        memref = rt.get_ranked_memref_descriptor(arg)
-        return_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
-    # Convert input variables from numpy arrays to memref pointers
     arg_pointers = []
-    for arg in argv_np[0:-return_num]:
+    for arg in argv_np:
         memref = rt.get_ranked_memref_descriptor(arg)
         arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
     # Invoke device top-level function
-    execution_engine.invoke(name, *return_pointers, *arg_pointers)
-    # Copy output arrays back
-    for i, return_p in enumerate(return_pointers):
-        out_array = rt.ranked_memref_to_numpy(return_p[0])
-        np.copyto(argv[-(len(return_args) - i)].np_array, out_array)
+    execution_engine.invoke(name, *arg_pointers)