From c64cfaec49afaf748f0343f89654ea85e98b6715 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Tue, 7 Mar 2023 02:07:00 -0500
Subject: [PATCH 01/15] [Util] Move make_anywidth_numpy_array to utils

---
 heterocl/ast/ir_builder.py | 60 +++----------------------------
 heterocl/utils.py          | 72 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/heterocl/ast/ir_builder.py b/heterocl/ast/ir_builder.py
index 2abbca30..b43e24c5 100644
--- a/heterocl/ast/ir_builder.py
+++ b/heterocl/ast/ir_builder.py
@@ -52,7 +52,7 @@
 
 from . import ast
 from ..context import get_context, get_location
-from ..utils import hcl_dtype_to_mlir, get_extra_type_hints
+from ..utils import hcl_dtype_to_mlir, get_extra_type_hints, make_anywidth_numpy_array
 from .. import types as htypes
 from . import build_cleaner
 
@@ -1410,61 +1410,9 @@ def build_constant_tensor_op(self, op: ast.ConstantTensorOp, ip):
         dtype = hcl_dtype_to_mlir(op.dtype, signless=True)
         shape = op.values.shape
         if isinstance(op.dtype, (htypes.Int, htypes.UInt)):
-            # The following code has several steps to convert the numpy array to have
-            # the correct data type in order to create an MLIR constant tensor.
-            # Since MLIR-NumPy Python interface only supports byte-addressable data types,
-            # we need to change the data type of the array to have the minimum number of bytes
-            # that can represent the target bitwidth.
-            # e.g., hcl.const_tensor(arr, dtype=hcl.Int(20)) (6*6 array)
-            #       which requires 20 bits (3 bytes) to represent each element
-            # declaration: 6*6*i20
-            # numpy input: 6*6*i64
-            # 1. Decompose the original i32 or i64 array into a structured array of uint8
-            #  -> decompose: 6*6*8*i8
-            if op.dtype.bits == 1:
-                val = op.values
-                array = np.packbits(val, axis=None, bitorder="little")
-                value_attr = DenseElementsAttr.get(array, shape=val.shape, type=dtype)
-            else:
-                # Here we construct a customized NumPy dtype, "f0", "f1", "f2", etc.
-                # are the field names, and the entire data type is `op.values.dtype`.
-                # This can be viewed as a `union` type in C/C++.
-                # Please refer to the documentation for more details:
-                # https://numpy.org/doc/stable/reference/arrays.dtypes.html#specifying-and-constructing-data-types
-                decomposed_np_dtype = np.dtype(
-                    (
-                        op.values.dtype,
-                        {
-                            f"f{i}": (np.uint8, i)
-                            for i in range(op.values.dtype.itemsize)
-                        },
-                    )
-                )
-                val = op.values.view(decomposed_np_dtype)
-                # 2. Compose the uint8 array into a structured array of target bitwidth
-                # This is done by taking the first several bytes of the uint8 array
-                # "u1" means one unsigned byte, and "i1" means one signed byte
-                n_bytes = int(np.ceil(dtype.width / 8))
-                new_dtype = np.dtype(
-                    {
-                        "names": [f"f{i}" for i in range(n_bytes)],
-                        "formats": (["i1"] if isinstance(dtype, htypes.Int) else ["u1"])
-                        + ["u1"] * (n_bytes - 1),
-                        "offsets": list(range(n_bytes)),
-                        "itemize": n_bytes,
-                    }
-                )
-                # -> compose: 6*6*3*i8
-                val = np.stack([val[f"f{i}"] for i in range(n_bytes)], axis=-1)
-                # -> flatten: 108*i8
-                val = val.flatten()
-                # -> view: 36*i24
-                val = val.view(np.dtype(new_dtype))
-                # -> reshape: 6*6*i24
-                val = val.reshape(shape)
-                # Pass in the numpy array to get the MLIR attribute
-                # -> result: 6*6*i20
-                value_attr = DenseElementsAttr.get(val, shape=val.shape, type=dtype)
+            signed = isinstance(op.dtype, htypes.Int)
+            val = make_anywidth_numpy_array(op.values, op.dtype.bits, signed)
+            value_attr = DenseElementsAttr.get(val, shape=val.shape, type=dtype)
         else:
             val = op.values
             value_attr = DenseElementsAttr.get(val)
diff --git a/heterocl/utils.py b/heterocl/utils.py
index 68116a74..1d416529 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -219,3 +219,75 @@ def get_max_value(dtype):
     if isinstance(dtype, UFixed):
         return (1 << dtype.bits) - 1
     raise DTypeError(f"Unrecognized data type: {dtype}")
+
+
+def make_anywidth_numpy_array(val, bitwidth, signed):
+    """
+    Converts a numpy array to any target bitwidth.
+    ----------------
+    Parameters:
+    val: numpy.ndarray
+        numpy array, can be any numpy native bitwidth, e.g. np.int64
+    bitwidth: int
+        target bitwidth e.g. 9, 31, 198
+    signed: True or False
+        whether the values in the array are signed or unsigned
+    ----------------
+    Returns:
+    numpy.ndarray
+        numpy array with the target bitwidth
+    """
+    shape = val.shape
+    # The following code has several steps to convert the numpy array to have
+    # the correct data type in order to create an MLIR constant tensor.
+    # Since MLIR-NumPy Python interface only supports byte-addressable data types,
+    # we need to change the data type of the array to have the minimum number of bytes
+    # that can represent the target bitwidth.
+    # e.g., hcl.const_tensor(arr, dtype=hcl.Int(20)) (6*6 array)
+    #       which requires 20 bits (3 bytes) to represent each element
+    # declaration: 6*6*i20
+    # numpy input: 6*6*i64
+    # 1. Decompose the original i32 or i64 array into a structured array of uint8
+    #  -> decompose: 6*6*8*i8
+    if bitwidth == 1:
+        return np.packbits(val, axis=None, bitorder="little")
+    else:
+        # Here we construct a customized NumPy dtype, "f0", "f1", "f2", etc.
+        # are the field names, and the entire data type is `op.values.dtype`.
+        # This can be viewed as a `union` type in C/C++.
+        # Please refer to the documentation for more details:
+        # https://numpy.org/doc/stable/reference/arrays.dtypes.html#specifying-and-constructing-data-types
+        decomposed_np_dtype = np.dtype(
+            (
+                val.dtype,
+                {
+                    f"f{i}": (np.uint8, i)
+                    for i in range(val.dtype.itemsize)
+                },
+            )
+        )
+        val = val.view(decomposed_np_dtype)
+        # 2. Compose the uint8 array into a structured array of target bitwidth
+        # This is done by taking the first several bytes of the uint8 array
+        # "u1" means one unsigned byte, and "i1" means one signed byte
+        n_bytes = int(np.ceil(bitwidth / 8))
+        new_dtype = np.dtype(
+            {
+                "names": [f"f{i}" for i in range(n_bytes)],
+                "formats": (["i1"] if signed else ["u1"])
+                + ["u1"] * (n_bytes - 1),
+                "offsets": list(range(n_bytes)),
+                "itemize": n_bytes,
+            }
+        )
+        # -> compose: 6*6*3*i8
+        val = np.stack([val[f"f{i}"] for i in range(n_bytes)], axis=-1)
+        # -> flatten: 108*i8
+        val = val.flatten()
+        # -> view: 36*i24
+        val = val.view(np.dtype(new_dtype))
+        # -> reshape: 6*6*i24
+        val = val.reshape(shape)
+        # Pass in the numpy array to get the MLIR attribute
+        # -> result: 6*6*i20
+        return val
\ No newline at end of file

From 07e430c41116e894ce0da5baa49b8165a392f019 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Tue, 7 Mar 2023 15:22:56 -0500
Subject: [PATCH 02/15] [IRBuilder] Fix shape issue with DenseElementsAttr
 creation

---
 heterocl/ast/ir_builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/heterocl/ast/ir_builder.py b/heterocl/ast/ir_builder.py
index b43e24c5..ffa820e7 100644
--- a/heterocl/ast/ir_builder.py
+++ b/heterocl/ast/ir_builder.py
@@ -1412,7 +1412,7 @@ def build_constant_tensor_op(self, op: ast.ConstantTensorOp, ip):
         if isinstance(op.dtype, (htypes.Int, htypes.UInt)):
             signed = isinstance(op.dtype, htypes.Int)
             val = make_anywidth_numpy_array(op.values, op.dtype.bits, signed)
-            value_attr = DenseElementsAttr.get(val, shape=val.shape, type=dtype)
+            value_attr = DenseElementsAttr.get(val, shape=op.values.shape, type=dtype)
         else:
             val = op.values
             value_attr = DenseElementsAttr.get(val)

From c5fee64344a65fca34e0e0241362386bc76dc882 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Tue, 7 Mar 2023 16:07:48 -0500
Subject: [PATCH 03/15] Reconstructing LLVM backend runtime

---
 heterocl/build_module.py |  3 +--
 heterocl/runtime.py      | 38 ++++++++++++++++++++++++++++++++++----
 heterocl/tensor.py       |  5 +++++
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/heterocl/build_module.py b/heterocl/build_module.py
index 8a66bd68..91124927 100644
--- a/heterocl/build_module.py
+++ b/heterocl/build_module.py
@@ -337,13 +337,12 @@ def attach_llvm_attrs(module):
         hcl_d.lower_composite_type(module)
         hcl_d.lower_fixed_to_int(module)
         hcl_d.lower_print_ops(module)
-        hcl_d.lower_anywidth_int(module)
+        # hcl_d.lower_anywidth_int(module)
         # Note: lower_any_width_int should precede
         # move_return_to_input, because it uses input/output
         # type hints.
         hcl_d.move_return_to_input(module)
         hcl_d.lower_bit_ops(module)
-        # print(module)
         hcl_d.legalize_cast(module)
         hcl_d.remove_stride_map(module)
         pipeline = "lower-affine,func.func(buffer-loop-hoisting)"
diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index f4ef694e..ea2c4ff4 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -111,7 +111,7 @@ def execute_fpga_backend(target, shell=True):
         raise RuntimeError("Not implemented")
 
 
-def execute_llvm_backend(execution_engine, name, return_num, *argv):
+def execute_llvm_backend_obsolete(execution_engine, name, return_num, *argv):
     """
     - execution_engine: mlir.ExecutionEngine object, created in hcl.build
     - name: str, device top-level function name
@@ -137,6 +137,36 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
     # Invoke device top-level function
     execution_engine.invoke(name, *return_pointers, *arg_pointers)
     # Copy output arrays back
-    for i, return_p in enumerate(return_pointers):
-        out_array = rt.ranked_memref_to_numpy(return_p[0])
-        np.copyto(argv[-(len(return_args) - i)].np_array, out_array)
+    # might be unnecessary
+    # for i, return_p in enumerate(return_pointers):
+        # out_array = rt.ranked_memref_to_numpy(return_p[0])
+        # np.copyto(argv[-(len(return_args) - i)].np_array, out_array) # problem here
+
+def execute_llvm_backend(execution_engine, name, return_num, *argv):
+    """
+    Execute LLVM backend. Assume all return args have been moved to 
+    input args. 
+    ----------
+    execution_engine: mlir.ExecutionEngine 
+        JIT object, created in hcl.build
+    name: str
+        device top-level function name
+    argv: list-like object
+        a list of input and output variables
+    """
+    # TODO: remove return_num
+    if not isinstance(argv, list):
+        argv = list(argv)
+    
+    # Unwrap hcl Array to get numpy arrays
+    argv_np = [arg.unwrap() for arg in argv]
+    arg_pointers = []
+    for arg in argv_np:
+        memref = rt.get_ranked_memref_descriptor(arg)
+        arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
+    # Invoke device top-level function
+    execution_engine.invoke(name, *arg_pointers)
+    # this part is still necessary
+    # comment out for now
+    # for i, arg_p in enumerate(arg_pointers):
+    #     out_array = rt.ranked_memref_to_numpy(arg_p[0])
\ No newline at end of file
diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index e161ea43..6efe8a5a 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -5,6 +5,7 @@
 from hcl_mlir.exceptions import DTypeError
 
 from .types import dtype_to_str, Int, UInt, Float, Fixed, UFixed
+from .utils import make_anywidth_numpy_array
 
 
 class Array:
@@ -81,6 +82,10 @@ def asnumpy(self):
         return self.np_array
 
     def unwrap(self):
+        if isinstance(self.dtype, (Int, Fixed)):
+            return make_anywidth_numpy_array(self.np_array, self.dtype.bits, True)
+        elif isinstance(self.dtype, (UInt, UFixed)):
+            return make_anywidth_numpy_array(self.np_array, self.dtype.bits, False)
         return self.np_array
 
     def __repr__(self) -> str:

From 7dcf4df91aeca3ac44502b6489a5b53c1f7dd6ab Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Mon, 13 Mar 2023 15:48:23 -0400
Subject: [PATCH 04/15] [Util] Remove np.int128, np.int256, since they don't
 exist

---
 heterocl/utils.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/heterocl/utils.py b/heterocl/utils.py
index 1d416529..be518479 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -143,10 +143,6 @@ def make_const_tensor(val, dtype):
             np_dtype = np.int32
         elif dtype.bits <= 64:
             np_dtype = np.int64
-        elif dtype.bits <= 128:
-            np_dtype = np.int128
-        elif dtype.bits <= 256:
-            np_dtype = np.int256
         else:
             raise DTypeError(
                 f"Integer width ({dtype}) too large, not supported by numpy"
@@ -277,7 +273,7 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
                 "formats": (["i1"] if signed else ["u1"])
                 + ["u1"] * (n_bytes - 1),
                 "offsets": list(range(n_bytes)),
-                "itemize": n_bytes,
+                "itemize": n_bytes, # should this be itemsize?
             }
         )
         # -> compose: 6*6*3*i8

From fddd444fbdcd377ad900a8f0ae53b679f93863aa Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 04:11:28 -0400
Subject: [PATCH 05/15] [Array] Extend hcl.array to support any bitwidth

---
 heterocl/runtime.py |  23 ++++-
 heterocl/tensor.py  | 204 +++++++++++++++++++++++++++++++-------------
 heterocl/utils.py   |   4 +-
 3 files changed, 167 insertions(+), 64 deletions(-)

diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index ea2c4ff4..e4d7a0c6 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -12,6 +12,12 @@
 from hcl_mlir import runtime as rt
 from .report import parse_xml
 
+# Filter out the warning from numpy when using ctypes array as numpy array.
+# This is a Python bug, see:
+# https://stackoverflow.com/questions/4964101/pep-3118-warning-when-using-ctypes-array-as-numpy-array
+import warnings
+warnings.filterwarnings("ignore", category=RuntimeWarning, message="A builtin ctypes object gave a PEP3118 format string that does not match its itemsize*")
+
 
 def run_process(cmd, pattern=None):
     p = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
@@ -168,5 +174,18 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
     execution_engine.invoke(name, *arg_pointers)
     # this part is still necessary
     # comment out for now
-    # for i, arg_p in enumerate(arg_pointers):
-    #     out_array = rt.ranked_memref_to_numpy(arg_p[0])
\ No newline at end of file
+    # print(arg_pointers[0][0][0].aligned)
+    # print(f"is ctypes._Pointer: {isinstance(arg_pointers[0][0][0].aligned, ctypes._Pointer)}")
+    # print(arg_pointers[1][0][0].aligned)
+    # print(f"is ctypes._Pointer: {isinstance(arg_pointers[1][0][0].aligned, ctypes._Pointer)}")
+    for i, arg_p in enumerate(arg_pointers):
+        np_arr = np.ctypeslib.as_array(
+            arg_p[0][0].aligned, shape=arg_p[0][0].shape)
+        strided_arr = np.lib.stride_tricks.as_strided(
+            np_arr,
+            np.ctypeslib.as_array(arg_p[0][0].shape),
+            np.ctypeslib.as_array(arg_p[0][0].strides) * np_arr.itemsize,
+        )
+        out_array = strided_arr
+        # out_array = rt.ranked_memref_to_numpy(arg_p[0]) # can confirm that it works with any bitwidth array
+        np.copyto(argv[i].np_array, out_array) # target, source
\ No newline at end of file
diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index 6efe8a5a..b326932b 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -2,70 +2,69 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import numpy as np
-from hcl_mlir.exceptions import DTypeError
+import math
+from hcl_mlir.exceptions import DTypeError, APIError, DTypeWarning
 
 from .types import dtype_to_str, Int, UInt, Float, Fixed, UFixed
 from .utils import make_anywidth_numpy_array
 
 
 class Array:
-    """A wrapper class for numpy array
-    Differences between array and tensor:
-    tensor is only a placeholder while array holds actual values
+    """
+    Represents an input tensor in HeteroCL.
+    This class is a wrapper of numpy.ndarray, but it also
+    supports a wider range of data types, including any-width
+    integer and fixed-point data types.
     """
 
-    def __init__(self, np_array, dtype):
-        self.dtype = dtype  # should specify the type of `dtype`
-        if isinstance(np_array, list):
-            np_array = np.array(np_array)
-        if dtype is not None:
-            # Data type check
-            if isinstance(dtype, Float):
-                hcl_dtype_str = dtype_to_str(dtype)
-                correct_dtype = np.dtype(hcl_dtype_str)
-                if np_array.dtype != correct_dtype:
-                    np_array = np_array.astype(correct_dtype)
-            elif isinstance(dtype, Int):
-                # Handle overflow
-                sb = 1 << self.dtype.bits
-                sb_limit = 1 << (self.dtype.bits - 1)
-                np_array = np_array % sb
-
-                def cast_func(x):
-                    return x if x < sb_limit else x - sb
-
-                vec_np_array = np.vectorize(cast_func)(np_array)
-                np_array = vec_np_array.astype(np.uint64)
-            elif isinstance(dtype, UInt):
-                # Handle overflow
-                sb = 1 << self.dtype.bits
-                np_array = np_array % sb
-                np_array = np_array.astype(np.uint64)
-            elif isinstance(dtype, Fixed):
-                # Handle overflow
-                sb = 1 << self.dtype.bits
-                sb_limit = 1 << (self.dtype.bits - 1)
-                np_array = np_array * (2**dtype.fracs)
-                np_array = np.fix(np_array) % sb
-
-                def cast_func(x):
-                    return x if x < sb_limit else x - sb
-
-                vec_np_array = np.vectorize(cast_func)(np_array)
-                np_array = vec_np_array.astype(np.uint64)
-            elif isinstance(dtype, UFixed):
-                # Handle overflow
-                sb = 1 << self.dtype.bits
-                np_array = np_array * (2**dtype.fracs)
-                np_array = np.fix(np_array) % sb
-                np_array = np_array.astype(np.uint64)
-            else:
-                raise DTypeError("Type error: unrecognized type: " + str(self.dtype))
-        else:
-            raise RuntimeError("Should provide type info")
-        self.np_array = np_array
-
+    def __init__(self, array, dtype):
+        """
+        Parameters
+        ----------
+        array : numpy.ndarray or a python list
+            The array to be wrapped.
+            If the bitwidth of the data type is wider than 64,
+            the array should be a python list.
+        dtype : HeteroCL data type
+        """
+        self.dtype = dtype
+        if dtype is None:
+            raise APIError("Should provide type info")
+        # self.np_array: a numpy array that holds the data
+        # For float type, self.np_array is a float type numpy array
+        # For int, uint, fixed, ufixed, self.np_array is a struct type numpy array
+        # with each field being a byte.
+        self.np_array = self._handle_overflow(array, dtype)
+        if not isinstance(dtype, Float):
+            signed = isinstance(dtype, Int) or isinstance(dtype, Fixed)
+            # closest power of 2
+            bitwidth = 1 << (self.dtype.bits - 1).bit_length()
+            if bitwidth < 8: bitwidth = 8
+            # this is to be compliant with MLIR's anywidth type representation
+            # e.g. i1-i8 -> int8
+            #      i9-i16 -> int16
+            #      i17-i32 -> int32
+            #      i33-i64 -> int64
+            #      i65-i128 -> int128
+            #      i129-i256 -> int256
+            self.np_array = make_anywidth_numpy_array(self.np_array, bitwidth, signed)
+    
     def asnumpy(self):
+        """
+        Convert HeteroCL array to numpy array / python list.
+        If the bitwidth is wider than 64, the result will be a python list.
+        Otherwise, return a numpy array.
+        """
+        if isinstance(self.dtype, Float):
+            hcl_dtype_str = dtype_to_str(self.dtype)
+            np_dtype = np.dtype(hcl_dtype_str)
+            res_array = self.np_array.astype(np_dtype)
+            return res_array
+        elif isinstance(self.dtype, Int):
+            if self.dtype.bits > 64:
+                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+            return self._struct_np_array_to_int()
+
         if isinstance(self.dtype, (Fixed, UFixed)):
             if isinstance(self.dtype, Fixed):
                 res_array = self.np_array.astype(np.int64)
@@ -82,11 +81,98 @@ def asnumpy(self):
         return self.np_array
 
     def unwrap(self):
-        if isinstance(self.dtype, (Int, Fixed)):
-            return make_anywidth_numpy_array(self.np_array, self.dtype.bits, True)
-        elif isinstance(self.dtype, (UInt, UFixed)):
-            return make_anywidth_numpy_array(self.np_array, self.dtype.bits, False)
         return self.np_array
 
+
+    def _handle_overflow(self, array, dtype):
+        """
+        If the dtype is wider than 64 bits,
+        array should be a list of numpy numbers.
+        """
+        # Data type check
+        if isinstance(dtype, Float):
+            if isinstance(array, list):
+                array = np.array(array)
+            hcl_dtype_str = dtype_to_str(dtype)
+            correct_dtype = np.dtype(hcl_dtype_str)
+            if array.dtype != correct_dtype:
+                array = array.astype(correct_dtype)
+        elif isinstance(dtype, Int):
+            sb = 1 << self.dtype.bits
+            sb_limit = 1 << (self.dtype.bits - 1)
+            array = array % sb # cap the value to the max value of the bitwidth
+            def cast_func(x):
+                # recursive
+                if isinstance(x, list):
+                    return [cast_func(y) for y in x]
+                # signed integer overflow function: wrap mode
+                return x if x < sb_limit else x - sb
+            if isinstance(array, list):
+                array = [cast_func(x) for x in array] # TODO: this should be tested independently
+            else:
+                array = np.vectorize(cast_func)(array)
+        elif isinstance(dtype, UInt):
+            # Handle overflow
+            sb = 1 << self.dtype.bits
+            array = array % sb
+        elif isinstance(dtype, Fixed):
+            # Handle overflow
+            sb = 1 << self.dtype.bits
+            sb_limit = 1 << (self.dtype.bits - 1)
+            array = array * (2**dtype.fracs)
+            def cast_func(x):
+                # recursive
+                if isinstance(x, list):
+                    return [cast_func(y) for y in x]
+                x = math.trunc(x) % sb # rounds towards zero
+                # signed integer overflow function: wrap mode
+                return x if x < sb_limit else x - sb
+            if isinstance(array, list):
+                array = [cast_func(x) for x in array]
+            else:
+                array = np.vectorize(cast_func)(array)
+        elif isinstance(dtype, UFixed):
+            # Handle overflow
+            sb = 1 << self.dtype.bits
+            array = array * (2**dtype.fracs)
+            def cast_func(x):
+                # recursive
+                if isinstance(x, list):
+                    return [cast_func(y) for y in x]
+                x = math.trunc(x) % sb # rounds towards zero
+                return x
+            if isinstance(array, list):
+                array = [cast_func(x) for x in array]
+            else:
+                array = np.vectorize(cast_func)(array)
+        else:
+            raise DTypeError("Type error: unrecognized type: " + str(self.dtype))
+        return array
+    
+
+    def _struct_np_array_to_int(self):
+        pylist = self.np_array.tolist()
+        # each element is a tuple
+        def to_int(x):
+            if isinstance(x, list):
+                return [to_int(y) for y in x]
+            # concatenate the tuple
+            # each element is a byte
+            signed = isinstance(self.dtype, (Int, Fixed))
+            byte_str = b''
+            byte_str += x[0].to_bytes(1, byteorder='little', signed=signed)
+            for i in range(1, len(x)):
+                byte_str += x[i].to_bytes(1, byteorder='little', signed=False)
+            value = int.from_bytes(byte_str, byteorder='little', signed=signed)
+            # handle signed negative int: equivalent to sign extension
+            if signed and value >= (1 << (self.dtype.bits - 1)):
+                value -= (1 << self.dtype.bits)
+            return value
+        pylist = to_int(pylist)
+        if self.dtype.bits <= 64:
+            return np.array(pylist, dtype=np.int64)
+        else:
+            return pylist
+
     def __repr__(self) -> str:
         return self.asnumpy().__repr__()
diff --git a/heterocl/utils.py b/heterocl/utils.py
index be518479..90de9d34 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -273,7 +273,7 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
                 "formats": (["i1"] if signed else ["u1"])
                 + ["u1"] * (n_bytes - 1),
                 "offsets": list(range(n_bytes)),
-                "itemize": n_bytes, # should this be itemsize?
+                "itemsize": n_bytes, # should this be itemsize?
             }
         )
         # -> compose: 6*6*3*i8
@@ -284,6 +284,4 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         val = val.view(np.dtype(new_dtype))
         # -> reshape: 6*6*i24
         val = val.reshape(shape)
-        # Pass in the numpy array to get the MLIR attribute
-        # -> result: 6*6*i20
         return val
\ No newline at end of file

From 172772b84426c6c6c4f1b0cd36f3da044d08d809 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 11:25:23 -0400
Subject: [PATCH 06/15] [Array] Add sign extension

---
 heterocl/runtime.py | 16 +------------
 heterocl/tensor.py  | 55 +++++++++++++++++++++++++++------------------
 heterocl/utils.py   | 16 +++++++++----
 3 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index e4d7a0c6..c7901254 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -172,20 +172,6 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
         arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
     # Invoke device top-level function
     execution_engine.invoke(name, *arg_pointers)
-    # this part is still necessary
-    # comment out for now
-    # print(arg_pointers[0][0][0].aligned)
-    # print(f"is ctypes._Pointer: {isinstance(arg_pointers[0][0][0].aligned, ctypes._Pointer)}")
-    # print(arg_pointers[1][0][0].aligned)
-    # print(f"is ctypes._Pointer: {isinstance(arg_pointers[1][0][0].aligned, ctypes._Pointer)}")
     for i, arg_p in enumerate(arg_pointers):
-        np_arr = np.ctypeslib.as_array(
-            arg_p[0][0].aligned, shape=arg_p[0][0].shape)
-        strided_arr = np.lib.stride_tricks.as_strided(
-            np_arr,
-            np.ctypeslib.as_array(arg_p[0][0].shape),
-            np.ctypeslib.as_array(arg_p[0][0].strides) * np_arr.itemsize,
-        )
-        out_array = strided_arr
-        # out_array = rt.ranked_memref_to_numpy(arg_p[0]) # can confirm that it works with any bitwidth array
+        out_array = rt.ranked_memref_to_numpy(arg_p[0])
         np.copyto(argv[i].np_array, out_array) # target, source
\ No newline at end of file
diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index b326932b..56d01ca4 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -64,21 +64,22 @@ def asnumpy(self):
             if self.dtype.bits > 64:
                 DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
             return self._struct_np_array_to_int()
-
-        if isinstance(self.dtype, (Fixed, UFixed)):
-            if isinstance(self.dtype, Fixed):
-                res_array = self.np_array.astype(np.int64)
-            else:
-                res_array = self.np_array
-            res_array = res_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
-            return res_array
-        if isinstance(self.dtype, Int):
-            res_array = self.np_array.astype(np.int64)
-            return res_array
-        if isinstance(self.dtype, Float):
-            res_array = self.np_array.astype(float)
-            return res_array
-        return self.np_array
+        elif isinstance(self.dtype, UInt):
+            if self.dtype.bits > 64:
+                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+            return self._struct_np_array_to_int()
+        elif isinstance(self.dtype, Fixed):
+            if self.dtype.bits > 64:
+                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+            base_array = self._struct_np_array_to_int()
+            return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+        elif isinstance(self.dtype, UFixed):
+            if self.dtype.bits > 64:
+                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+            base_array = self._struct_np_array_to_int()
+            return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+        else:
+            raise DTypeError(f"Unsupported data type {self.dtype}")
 
     def unwrap(self):
         return self.np_array
@@ -100,12 +101,12 @@ def _handle_overflow(self, array, dtype):
         elif isinstance(dtype, Int):
             sb = 1 << self.dtype.bits
             sb_limit = 1 << (self.dtype.bits - 1)
-            array = array % sb # cap the value to the max value of the bitwidth
             def cast_func(x):
                 # recursive
                 if isinstance(x, list):
                     return [cast_func(y) for y in x]
                 # signed integer overflow function: wrap mode
+                x = x % sb # cap the value to the max value of the bitwidth
                 return x if x < sb_limit else x - sb
             if isinstance(array, list):
                 array = [cast_func(x) for x in array] # TODO: this should be tested independently
@@ -156,17 +157,27 @@ def _struct_np_array_to_int(self):
         def to_int(x):
             if isinstance(x, list):
                 return [to_int(y) for y in x]
+            signed = isinstance(self.dtype, (Int, Fixed))
+            # turn x from tuple to list
+            x = list(x)
+            # find MSB
+            byte_idx = (self.dtype.bits - 1) // 8
+            bit_idx = (self.dtype.bits - 1) % 8
+            msb = (x[byte_idx] & (1 << bit_idx)) > 0
+            # sign extension
+            if signed and msb:
+                x[byte_idx] |= ((0xff << bit_idx) & 0xff)
+                for i in range(byte_idx + 1, len(x)):
+                    x[i] = 0xff
             # concatenate the tuple
             # each element is a byte
-            signed = isinstance(self.dtype, (Int, Fixed))
             byte_str = b''
-            byte_str += x[0].to_bytes(1, byteorder='little', signed=signed)
-            for i in range(1, len(x)):
+            for i in range(len(x) - 1):
+                # little endian, first x-1 elements are unsigned bytes
                 byte_str += x[i].to_bytes(1, byteorder='little', signed=False)
+            # last element is signed
+            byte_str += x[-1].to_bytes(1, byteorder='little', signed=signed)
             value = int.from_bytes(byte_str, byteorder='little', signed=signed)
-            # handle signed negative int: equivalent to sign extension
-            if signed and value >= (1 << (self.dtype.bits - 1)):
-                value -= (1 << self.dtype.bits)
             return value
         pylist = to_int(pylist)
         if self.dtype.bits <= 64:
diff --git a/heterocl/utils.py b/heterocl/utils.py
index 90de9d34..8675d54d 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -234,6 +234,8 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         numpy array with the target bitwidth
     """
     shape = val.shape
+    sign_array = val > 0
+    avail_bytes = val.itemsize # number of bytes of each element
     # The following code has several steps to convert the numpy array to have
     # the correct data type in order to create an MLIR constant tensor.
     # Since MLIR-NumPy Python interface only supports byte-addressable data types,
@@ -266,18 +268,24 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         # 2. Compose the uint8 array into a structured array of target bitwidth
         # This is done by taking the first several bytes of the uint8 array
         # "u1" means one unsigned byte, and "i1" means one signed byte
+        # f0 is LSB, fn is MSB
         n_bytes = int(np.ceil(bitwidth / 8))
         new_dtype = np.dtype(
             {
                 "names": [f"f{i}" for i in range(n_bytes)],
-                "formats": (["i1"] if signed else ["u1"])
-                + ["u1"] * (n_bytes - 1),
+                "formats": ["u1"] * (n_bytes - 1) + (["i1"] if signed else ["u1"]),
                 "offsets": list(range(n_bytes)),
-                "itemsize": n_bytes, # should this be itemsize?
+                "itemsize": n_bytes,
             }
         )
+        # sometimes the available bytes are not enough to represent the target bitwidth
+        # so that we need to pad the array
+        _bytes = [val[f"f{i}"] for i in range(min(avail_bytes, n_bytes))]
+        if avail_bytes < n_bytes:
+            padding = np.where(sign_array, 0x00, 0xFF).astype(np.uint8)
+            _bytes += [padding] * (n_bytes - avail_bytes)
         # -> compose: 6*6*3*i8
-        val = np.stack([val[f"f{i}"] for i in range(n_bytes)], axis=-1)
+        val = np.stack(_bytes, axis=-1)
         # -> flatten: 108*i8
         val = val.flatten()
         # -> view: 36*i24

From 9cd8bf8e21442e113a9e7b6046828121c801f5cc Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 14:06:27 -0400
Subject: [PATCH 07/15] [Runtime] copying back results is not necessary

---
 heterocl/runtime.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index c7901254..8a240da7 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -172,6 +172,11 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
         arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
     # Invoke device top-level function
     execution_engine.invoke(name, *arg_pointers)
-    for i, arg_p in enumerate(arg_pointers):
-        out_array = rt.ranked_memref_to_numpy(arg_p[0])
-        np.copyto(argv[i].np_array, out_array) # target, source
\ No newline at end of file
+    # for i, arg_p in enumerate(arg_pointers):
+        # out_array = rt.ranked_memref_to_numpy(arg_p[0])
+        # if out_array element type has one byte, 
+        # ranked_memref_to_numpy will automatically unpack it
+        # if argv[i].np_array.dtype.itemsize == 1:
+        #     np.copyto(argv[i].np_array['f0'], out_array)
+        # else:
+        #     np.copyto(argv[i].np_array, out_array) # target, source
\ No newline at end of file

From abea8458839f36c14e963462986eec4521341a31 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 14:06:53 -0400
Subject: [PATCH 08/15] [Array] Exclude changes in fixed type in this PR, since
 it needs changes in IR first

---
 heterocl/tensor.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index 56d01ca4..e50fecb6 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -35,8 +35,8 @@ def __init__(self, array, dtype):
         # For int, uint, fixed, ufixed, self.np_array is a struct type numpy array
         # with each field being a byte.
         self.np_array = self._handle_overflow(array, dtype)
-        if not isinstance(dtype, Float):
-            signed = isinstance(dtype, Int) or isinstance(dtype, Fixed)
+        if isinstance(dtype, (Int, UInt)):
+            signed = isinstance(dtype, Int)
             # closest power of 2
             bitwidth = 1 << (self.dtype.bits - 1).bit_length()
             if bitwidth < 8: bitwidth = 8
@@ -68,16 +68,20 @@ def asnumpy(self):
             if self.dtype.bits > 64:
                 DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
             return self._struct_np_array_to_int()
+        #TODO(Niansong): fixed/ufixed does not go through struct_np_array_to_int for now
+        # because a change in IR is needed to support this, leaving it to another PR
         elif isinstance(self.dtype, Fixed):
             if self.dtype.bits > 64:
                 DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
-            base_array = self._struct_np_array_to_int()
-            return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+            # base_array = self._struct_np_array_to_int()
+            # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+            return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
         elif isinstance(self.dtype, UFixed):
             if self.dtype.bits > 64:
                 DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
-            base_array = self._struct_np_array_to_int()
-            return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+            # base_array = self._struct_np_array_to_int()
+            # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
+            return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
         else:
             raise DTypeError(f"Unsupported data type {self.dtype}")
 

From 340ccecb9c645f0e67ec7013b0b67e8626d6b59b Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 14:24:56 -0400
Subject: [PATCH 09/15] [Util] Remove signedness in struct numpy
 representation, to make sign extension easier

---
 heterocl/tensor.py | 5 +----
 heterocl/utils.py  | 3 ++-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index e50fecb6..9930fc19 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -176,11 +176,8 @@ def to_int(x):
             # concatenate the tuple
             # each element is a byte
             byte_str = b''
-            for i in range(len(x) - 1):
-                # little endian, first x-1 elements are unsigned bytes
+            for i in range(len(x)):
                 byte_str += x[i].to_bytes(1, byteorder='little', signed=False)
-            # last element is signed
-            byte_str += x[-1].to_bytes(1, byteorder='little', signed=signed)
             value = int.from_bytes(byte_str, byteorder='little', signed=signed)
             return value
         pylist = to_int(pylist)
diff --git a/heterocl/utils.py b/heterocl/utils.py
index 8675d54d..a6560054 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -273,7 +273,8 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         new_dtype = np.dtype(
             {
                 "names": [f"f{i}" for i in range(n_bytes)],
-                "formats": ["u1"] * (n_bytes - 1) + (["i1"] if signed else ["u1"]),
+                # "formats": ["u1"] * (n_bytes - 1) + (["i1"] if signed else ["u1"]),
+                "formats": ["u1"] * n_bytes,
                 "offsets": list(range(n_bytes)),
                 "itemsize": n_bytes,
             }

From 9a28e52ecbafce34c22b9e8a9867b97d04f848ca Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 14:31:46 -0400
Subject: [PATCH 10/15] [Array] Fix issue with fixed type overflow handling

---
 heterocl/tensor.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index 9930fc19..3ab1d3ce 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -124,6 +124,7 @@ def cast_func(x):
             # Handle overflow
             sb = 1 << self.dtype.bits
             sb_limit = 1 << (self.dtype.bits - 1)
+            array = array.astype(np.float64)
             array = array * (2**dtype.fracs)
             def cast_func(x):
                 # recursive
@@ -136,9 +137,11 @@ def cast_func(x):
                 array = [cast_func(x) for x in array]
             else:
                 array = np.vectorize(cast_func)(array)
+            array = array.astype(np.int64)
         elif isinstance(dtype, UFixed):
             # Handle overflow
             sb = 1 << self.dtype.bits
+            array = array.astype(np.float64)
             array = array * (2**dtype.fracs)
             def cast_func(x):
                 # recursive
@@ -150,6 +153,7 @@ def cast_func(x):
                 array = [cast_func(x) for x in array]
             else:
                 array = np.vectorize(cast_func)(array)
+            array = array.astype(np.int64)
         else:
             raise DTypeError("Type error: unrecognized type: " + str(self.dtype))
         return array

From e930edbde3a43ff12a3d6a4d87e15119c513fd43 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 15:11:25 -0400
Subject: [PATCH 11/15] Format with black

---
 heterocl/runtime.py | 34 +++++++++++++++++------------
 heterocl/tensor.py  | 53 ++++++++++++++++++++++++++++++---------------
 heterocl/utils.py   |  9 +++-----
 3 files changed, 58 insertions(+), 38 deletions(-)

diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index 8a240da7..341fb658 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -16,7 +16,12 @@
 # This is a Python bug, see:
 # https://stackoverflow.com/questions/4964101/pep-3118-warning-when-using-ctypes-array-as-numpy-array
 import warnings
-warnings.filterwarnings("ignore", category=RuntimeWarning, message="A builtin ctypes object gave a PEP3118 format string that does not match its itemsize*")
+
+warnings.filterwarnings(
+    "ignore",
+    category=RuntimeWarning,
+    message="A builtin ctypes object gave a PEP3118 format string that does not match its itemsize*",
+)
 
 
 def run_process(cmd, pattern=None):
@@ -145,15 +150,16 @@ def execute_llvm_backend_obsolete(execution_engine, name, return_num, *argv):
     # Copy output arrays back
     # might be unnecessary
     # for i, return_p in enumerate(return_pointers):
-        # out_array = rt.ranked_memref_to_numpy(return_p[0])
-        # np.copyto(argv[-(len(return_args) - i)].np_array, out_array) # problem here
+    # out_array = rt.ranked_memref_to_numpy(return_p[0])
+    # np.copyto(argv[-(len(return_args) - i)].np_array, out_array) # problem here
+
 
 def execute_llvm_backend(execution_engine, name, return_num, *argv):
     """
-    Execute LLVM backend. Assume all return args have been moved to 
-    input args. 
+    Execute LLVM backend. Assume all return args have been moved to
+    input args.
     ----------
-    execution_engine: mlir.ExecutionEngine 
+    execution_engine: mlir.ExecutionEngine
         JIT object, created in hcl.build
     name: str
         device top-level function name
@@ -163,7 +169,7 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
     # TODO: remove return_num
     if not isinstance(argv, list):
         argv = list(argv)
-    
+
     # Unwrap hcl Array to get numpy arrays
     argv_np = [arg.unwrap() for arg in argv]
     arg_pointers = []
@@ -173,10 +179,10 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
     # Invoke device top-level function
     execution_engine.invoke(name, *arg_pointers)
     # for i, arg_p in enumerate(arg_pointers):
-        # out_array = rt.ranked_memref_to_numpy(arg_p[0])
-        # if out_array element type has one byte, 
-        # ranked_memref_to_numpy will automatically unpack it
-        # if argv[i].np_array.dtype.itemsize == 1:
-        #     np.copyto(argv[i].np_array['f0'], out_array)
-        # else:
-        #     np.copyto(argv[i].np_array, out_array) # target, source
\ No newline at end of file
+    # out_array = rt.ranked_memref_to_numpy(arg_p[0])
+    # if out_array element type has one byte,
+    # ranked_memref_to_numpy will automatically unpack it
+    # if argv[i].np_array.dtype.itemsize == 1:
+    #     np.copyto(argv[i].np_array['f0'], out_array)
+    # else:
+    #     np.copyto(argv[i].np_array, out_array) # target, source
diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index 3ab1d3ce..00cbdfbd 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -39,7 +39,8 @@ def __init__(self, array, dtype):
             signed = isinstance(dtype, Int)
             # closest power of 2
             bitwidth = 1 << (self.dtype.bits - 1).bit_length()
-            if bitwidth < 8: bitwidth = 8
+            if bitwidth < 8:
+                bitwidth = 8
             # this is to be compliant with MLIR's anywidth type representation
             # e.g. i1-i8 -> int8
             #      i9-i16 -> int16
@@ -48,7 +49,7 @@ def __init__(self, array, dtype):
             #      i65-i128 -> int128
             #      i129-i256 -> int256
             self.np_array = make_anywidth_numpy_array(self.np_array, bitwidth, signed)
-    
+
     def asnumpy(self):
         """
         Convert HeteroCL array to numpy array / python list.
@@ -62,23 +63,31 @@ def asnumpy(self):
             return res_array
         elif isinstance(self.dtype, Int):
             if self.dtype.bits > 64:
-                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+                DTypeWarning(
+                    f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
+                )
             return self._struct_np_array_to_int()
         elif isinstance(self.dtype, UInt):
             if self.dtype.bits > 64:
-                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+                DTypeWarning(
+                    f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
+                )
             return self._struct_np_array_to_int()
-        #TODO(Niansong): fixed/ufixed does not go through struct_np_array_to_int for now
+        # TODO(Niansong): fixed/ufixed does not go through struct_np_array_to_int for now
         # because a change in IR is needed to support this, leaving it to another PR
         elif isinstance(self.dtype, Fixed):
             if self.dtype.bits > 64:
-                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+                DTypeWarning(
+                    f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
+                )
             # base_array = self._struct_np_array_to_int()
             # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
             return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
         elif isinstance(self.dtype, UFixed):
             if self.dtype.bits > 64:
-                DTypeWarning(f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list")
+                DTypeWarning(
+                    f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
+                )
             # base_array = self._struct_np_array_to_int()
             # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
             return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
@@ -88,7 +97,6 @@ def asnumpy(self):
     def unwrap(self):
         return self.np_array
 
-
     def _handle_overflow(self, array, dtype):
         """
         If the dtype is wider than 64 bits,
@@ -105,15 +113,19 @@ def _handle_overflow(self, array, dtype):
         elif isinstance(dtype, Int):
             sb = 1 << self.dtype.bits
             sb_limit = 1 << (self.dtype.bits - 1)
+
             def cast_func(x):
                 # recursive
                 if isinstance(x, list):
                     return [cast_func(y) for y in x]
                 # signed integer overflow function: wrap mode
-                x = x % sb # cap the value to the max value of the bitwidth
+                x = x % sb  # cap the value to the max value of the bitwidth
                 return x if x < sb_limit else x - sb
+
             if isinstance(array, list):
-                array = [cast_func(x) for x in array] # TODO: this should be tested independently
+                array = [
+                    cast_func(x) for x in array
+                ]  # TODO: this should be tested independently
             else:
                 array = np.vectorize(cast_func)(array)
         elif isinstance(dtype, UInt):
@@ -126,13 +138,15 @@ def cast_func(x):
             sb_limit = 1 << (self.dtype.bits - 1)
             array = array.astype(np.float64)
             array = array * (2**dtype.fracs)
+
             def cast_func(x):
                 # recursive
                 if isinstance(x, list):
                     return [cast_func(y) for y in x]
-                x = math.trunc(x) % sb # rounds towards zero
+                x = math.trunc(x) % sb  # rounds towards zero
                 # signed integer overflow function: wrap mode
                 return x if x < sb_limit else x - sb
+
             if isinstance(array, list):
                 array = [cast_func(x) for x in array]
             else:
@@ -143,12 +157,14 @@ def cast_func(x):
             sb = 1 << self.dtype.bits
             array = array.astype(np.float64)
             array = array * (2**dtype.fracs)
+
             def cast_func(x):
                 # recursive
                 if isinstance(x, list):
                     return [cast_func(y) for y in x]
-                x = math.trunc(x) % sb # rounds towards zero
+                x = math.trunc(x) % sb  # rounds towards zero
                 return x
+
             if isinstance(array, list):
                 array = [cast_func(x) for x in array]
             else:
@@ -157,10 +173,10 @@ def cast_func(x):
         else:
             raise DTypeError("Type error: unrecognized type: " + str(self.dtype))
         return array
-    
 
     def _struct_np_array_to_int(self):
         pylist = self.np_array.tolist()
+
         # each element is a tuple
         def to_int(x):
             if isinstance(x, list):
@@ -174,16 +190,17 @@ def to_int(x):
             msb = (x[byte_idx] & (1 << bit_idx)) > 0
             # sign extension
             if signed and msb:
-                x[byte_idx] |= ((0xff << bit_idx) & 0xff)
+                x[byte_idx] |= (0xFF << bit_idx) & 0xFF
                 for i in range(byte_idx + 1, len(x)):
-                    x[i] = 0xff
+                    x[i] = 0xFF
             # concatenate the tuple
             # each element is a byte
-            byte_str = b''
+            byte_str = b""
             for i in range(len(x)):
-                byte_str += x[i].to_bytes(1, byteorder='little', signed=False)
-            value = int.from_bytes(byte_str, byteorder='little', signed=signed)
+                byte_str += x[i].to_bytes(1, byteorder="little", signed=False)
+            value = int.from_bytes(byte_str, byteorder="little", signed=signed)
             return value
+
         pylist = to_int(pylist)
         if self.dtype.bits <= 64:
             return np.array(pylist, dtype=np.int64)
diff --git a/heterocl/utils.py b/heterocl/utils.py
index a6560054..db6387ed 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -235,7 +235,7 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
     """
     shape = val.shape
     sign_array = val > 0
-    avail_bytes = val.itemsize # number of bytes of each element
+    avail_bytes = val.itemsize  # number of bytes of each element
     # The following code has several steps to convert the numpy array to have
     # the correct data type in order to create an MLIR constant tensor.
     # Since MLIR-NumPy Python interface only supports byte-addressable data types,
@@ -258,10 +258,7 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         decomposed_np_dtype = np.dtype(
             (
                 val.dtype,
-                {
-                    f"f{i}": (np.uint8, i)
-                    for i in range(val.dtype.itemsize)
-                },
+                {f"f{i}": (np.uint8, i) for i in range(val.dtype.itemsize)},
             )
         )
         val = val.view(decomposed_np_dtype)
@@ -293,4 +290,4 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         val = val.view(np.dtype(new_dtype))
         # -> reshape: 6*6*i24
         val = val.reshape(shape)
-        return val
\ No newline at end of file
+        return val

From e10a28f301f34c0126dcc9d0c620771bfb945e9b Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 15:23:38 -0400
Subject: [PATCH 12/15] [Lint] Fix lint errors

---
 heterocl/ast/ir_builder.py |  6 +----
 heterocl/module.py         |  2 +-
 heterocl/runtime.py        | 47 ++------------------------------------
 heterocl/tensor.py         | 26 ++++++++++-----------
 heterocl/utils.py          |  5 ++--
 5 files changed, 19 insertions(+), 67 deletions(-)

diff --git a/heterocl/ast/ir_builder.py b/heterocl/ast/ir_builder.py
index ffa820e7..fae0d970 100644
--- a/heterocl/ast/ir_builder.py
+++ b/heterocl/ast/ir_builder.py
@@ -7,8 +7,6 @@
 
 # Import MLIR dialects
 # Naming rule: import dialect as dialect_d
-import numpy as np
-
 from hcl_mlir.dialects import (
     func as func_d,
     hcl as hcl_d,
@@ -1408,10 +1406,8 @@ def build_bit_reverse_op(self, op: ast.BitReverseOp, ip):
     def build_constant_tensor_op(self, op: ast.ConstantTensorOp, ip):
         loc = Location.file(op.loc.filename, op.loc.lineno, 0)
         dtype = hcl_dtype_to_mlir(op.dtype, signless=True)
-        shape = op.values.shape
         if isinstance(op.dtype, (htypes.Int, htypes.UInt)):
-            signed = isinstance(op.dtype, htypes.Int)
-            val = make_anywidth_numpy_array(op.values, op.dtype.bits, signed)
+            val = make_anywidth_numpy_array(op.values, op.dtype.bits)
             value_attr = DenseElementsAttr.get(val, shape=op.values.shape, type=dtype)
         else:
             val = op.values
diff --git a/heterocl/module.py b/heterocl/module.py
index 0dee554a..17bb2093 100644
--- a/heterocl/module.py
+++ b/heterocl/module.py
@@ -110,7 +110,7 @@ def __call__(self, *argv):
                                 argv[len(op.arguments) + i].np_array = np.pad(
                                     argv[len(op.arguments) + i].np_array, pad_shape
                                 )
-            execute_llvm_backend(self.src, self.name, self.return_num, *argv)
+            execute_llvm_backend(self.src, self.name, *argv)
             for res, shape in original_results:
                 slicing = []
                 for s in shape:
diff --git a/heterocl/runtime.py b/heterocl/runtime.py
index 341fb658..230f4afe 100644
--- a/heterocl/runtime.py
+++ b/heterocl/runtime.py
@@ -7,7 +7,7 @@
 import subprocess
 import ctypes
 import time
-import numpy as np
+import warnings
 
 from hcl_mlir import runtime as rt
 from .report import parse_xml
@@ -15,8 +15,6 @@
 # Filter out the warning from numpy when using ctypes array as numpy array.
 # This is a Python bug, see:
 # https://stackoverflow.com/questions/4964101/pep-3118-warning-when-using-ctypes-array-as-numpy-array
-import warnings
-
 warnings.filterwarnings(
     "ignore",
     category=RuntimeWarning,
@@ -122,39 +120,7 @@ def execute_fpga_backend(target, shell=True):
         raise RuntimeError("Not implemented")
 
 
-def execute_llvm_backend_obsolete(execution_engine, name, return_num, *argv):
-    """
-    - execution_engine: mlir.ExecutionEngine object, created in hcl.build
-    - name: str, device top-level function name
-    - return_num: int, the number of return values
-    - argv: list-like object, a list of input and output variables
-    """
-    if not isinstance(argv, list):
-        argv = list(argv)
-    # Unwrap hcl Array to get numpy arrays
-    argv_np = [arg.unwrap() for arg in argv]
-    # Extract output arrays
-    return_args = argv_np[-return_num:]
-    # Convert output variables from numpy arrays to memref pointers
-    return_pointers = []
-    for arg in return_args:
-        memref = rt.get_ranked_memref_descriptor(arg)
-        return_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
-    # Convert input variables from numpy arrays to memref pointers
-    arg_pointers = []
-    for arg in argv_np[0:-return_num]:
-        memref = rt.get_ranked_memref_descriptor(arg)
-        arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
-    # Invoke device top-level function
-    execution_engine.invoke(name, *return_pointers, *arg_pointers)
-    # Copy output arrays back
-    # might be unnecessary
-    # for i, return_p in enumerate(return_pointers):
-    # out_array = rt.ranked_memref_to_numpy(return_p[0])
-    # np.copyto(argv[-(len(return_args) - i)].np_array, out_array) # problem here
-
-
-def execute_llvm_backend(execution_engine, name, return_num, *argv):
+def execute_llvm_backend(execution_engine, name, *argv):
     """
     Execute LLVM backend. Assume all return args have been moved to
     input args.
@@ -166,7 +132,6 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
     argv: list-like object
         a list of input and output variables
     """
-    # TODO: remove return_num
     if not isinstance(argv, list):
         argv = list(argv)
 
@@ -178,11 +143,3 @@ def execute_llvm_backend(execution_engine, name, return_num, *argv):
         arg_pointers.append(ctypes.pointer(ctypes.pointer(memref)))
     # Invoke device top-level function
     execution_engine.invoke(name, *arg_pointers)
-    # for i, arg_p in enumerate(arg_pointers):
-    # out_array = rt.ranked_memref_to_numpy(arg_p[0])
-    # if out_array element type has one byte,
-    # ranked_memref_to_numpy will automatically unpack it
-    # if argv[i].np_array.dtype.itemsize == 1:
-    #     np.copyto(argv[i].np_array['f0'], out_array)
-    # else:
-    #     np.copyto(argv[i].np_array, out_array) # target, source
diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index 00cbdfbd..cb48b0ea 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -1,8 +1,8 @@
 # Copyright HeteroCL authors. All Rights Reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-import numpy as np
 import math
+import numpy as np
 from hcl_mlir.exceptions import DTypeError, APIError, DTypeWarning
 
 from .types import dtype_to_str, Int, UInt, Float, Fixed, UFixed
@@ -36,19 +36,17 @@ def __init__(self, array, dtype):
         # with each field being a byte.
         self.np_array = self._handle_overflow(array, dtype)
         if isinstance(dtype, (Int, UInt)):
-            signed = isinstance(dtype, Int)
             # closest power of 2
             bitwidth = 1 << (self.dtype.bits - 1).bit_length()
-            if bitwidth < 8:
-                bitwidth = 8
-            # this is to be compliant with MLIR's anywidth type representation
+            bitwidth = max(bitwidth, 8)
+            # this is to be compliant with MLIR's anywidth int type alignment
             # e.g. i1-i8 -> int8
             #      i9-i16 -> int16
             #      i17-i32 -> int32
             #      i33-i64 -> int64
             #      i65-i128 -> int128
             #      i129-i256 -> int256
-            self.np_array = make_anywidth_numpy_array(self.np_array, bitwidth, signed)
+            self.np_array = make_anywidth_numpy_array(self.np_array, bitwidth)
 
     def asnumpy(self):
         """
@@ -56,6 +54,7 @@ def asnumpy(self):
         If the bitwidth is wider than 64, the result will be a python list.
         Otherwise, return a numpy array.
         """
+        # pylint: disable=no-else-return
         if isinstance(self.dtype, Float):
             hcl_dtype_str = dtype_to_str(self.dtype)
             np_dtype = np.dtype(hcl_dtype_str)
@@ -65,13 +64,13 @@ def asnumpy(self):
             if self.dtype.bits > 64:
                 DTypeWarning(
                     f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
-                )
+                ).warn()
             return self._struct_np_array_to_int()
         elif isinstance(self.dtype, UInt):
             if self.dtype.bits > 64:
                 DTypeWarning(
                     f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
-                )
+                ).warn()
             return self._struct_np_array_to_int()
         # TODO(Niansong): fixed/ufixed does not go through struct_np_array_to_int for now
         # because a change in IR is needed to support this, leaving it to another PR
@@ -79,7 +78,7 @@ def asnumpy(self):
             if self.dtype.bits > 64:
                 DTypeWarning(
                     f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
-                )
+                ).warn()
             # base_array = self._struct_np_array_to_int()
             # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
             return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
@@ -87,7 +86,7 @@ def asnumpy(self):
             if self.dtype.bits > 64:
                 DTypeWarning(
                     f"The bitwidth of target type is wider than 64 ({self.dtype}), .asnumpy() returns a python list"
-                )
+                ).warn()
             # base_array = self._struct_np_array_to_int()
             # return base_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
             return self.np_array.astype(np.float64) / float(2 ** (self.dtype.fracs))
@@ -196,16 +195,15 @@ def to_int(x):
             # concatenate the tuple
             # each element is a byte
             byte_str = b""
-            for i in range(len(x)):
-                byte_str += x[i].to_bytes(1, byteorder="little", signed=False)
+            for byte in x:
+                byte_str += byte.to_bytes(1, byteorder="little", signed=False)
             value = int.from_bytes(byte_str, byteorder="little", signed=signed)
             return value
 
         pylist = to_int(pylist)
         if self.dtype.bits <= 64:
             return np.array(pylist, dtype=np.int64)
-        else:
-            return pylist
+        return pylist
 
     def __repr__(self) -> str:
         return self.asnumpy().__repr__()
diff --git a/heterocl/utils.py b/heterocl/utils.py
index db6387ed..b5806ff3 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -217,7 +217,7 @@ def get_max_value(dtype):
     raise DTypeError(f"Unrecognized data type: {dtype}")
 
 
-def make_anywidth_numpy_array(val, bitwidth, signed):
+def make_anywidth_numpy_array(val, bitwidth):
     """
     Converts a numpy array to any target bitwidth.
     ----------------
@@ -247,6 +247,8 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
     # numpy input: 6*6*i64
     # 1. Decompose the original i32 or i64 array into a structured array of uint8
     #  -> decompose: 6*6*8*i8
+    # pylint: disable=no-else-return
+    # Keep the explicit if/else: it separates the 1-bit case from the wider-bitwidth case
     if bitwidth == 1:
         return np.packbits(val, axis=None, bitorder="little")
     else:
@@ -270,7 +272,6 @@ def make_anywidth_numpy_array(val, bitwidth, signed):
         new_dtype = np.dtype(
             {
                 "names": [f"f{i}" for i in range(n_bytes)],
-                # "formats": ["u1"] * (n_bytes - 1) + (["i1"] if signed else ["u1"]),
                 "formats": ["u1"] * n_bytes,
                 "offsets": list(range(n_bytes)),
                 "itemsize": n_bytes,

From a873b892f25ef5bd5bb5fb0541dd6fa4e87855eb Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 15:48:01 -0400
Subject: [PATCH 13/15] [Lint] Upgrade local pylint, fix errors

---
 heterocl/tensor.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/heterocl/tensor.py b/heterocl/tensor.py
index cb48b0ea..3d8e6226 100644
--- a/heterocl/tensor.py
+++ b/heterocl/tensor.py
@@ -126,7 +126,7 @@ def cast_func(x):
                     cast_func(x) for x in array
                 ]  # TODO: this should be tested independently
             else:
-                array = np.vectorize(cast_func)(array)
+                array = np.vectorize(cast_func)(array).astype(np.int64)
         elif isinstance(dtype, UInt):
             # Handle overflow
             sb = 1 << self.dtype.bits
@@ -149,8 +149,7 @@ def cast_func(x):
             if isinstance(array, list):
                 array = [cast_func(x) for x in array]
             else:
-                array = np.vectorize(cast_func)(array)
-            array = array.astype(np.int64)
+                array = np.vectorize(cast_func)(array).astype(np.int64)
         elif isinstance(dtype, UFixed):
             # Handle overflow
             sb = 1 << self.dtype.bits
@@ -167,8 +166,7 @@ def cast_func(x):
             if isinstance(array, list):
                 array = [cast_func(x) for x in array]
             else:
-                array = np.vectorize(cast_func)(array)
-            array = array.astype(np.int64)
+                array = np.vectorize(cast_func)(array).astype(np.int64)
         else:
             raise DTypeError("Type error: unrecognized type: " + str(self.dtype))
         return array

From 506349aca8e69c1dfb3c85c0b2ed82552d42667d Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 16:25:27 -0400
Subject: [PATCH 14/15] [Test] Add test_irregular_bitwidth_input

---
 heterocl/utils.py   |  2 +-
 tests/test_dtype.py | 30 ++++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/heterocl/utils.py b/heterocl/utils.py
index b5806ff3..91418928 100644
--- a/heterocl/utils.py
+++ b/heterocl/utils.py
@@ -234,7 +234,7 @@ def make_anywidth_numpy_array(val, bitwidth):
         numpy array with the target bitwidth
     """
     shape = val.shape
-    sign_array = val > 0
+    sign_array = val >= 0
     avail_bytes = val.itemsize  # number of bytes of each element
     # The following code has several steps to convert the numpy array to have
     # the correct data type in order to create an MLIR constant tensor.
diff --git a/tests/test_dtype.py b/tests/test_dtype.py
index 7ff3a3f5..47439bc5 100644
--- a/tests/test_dtype.py
+++ b/tests/test_dtype.py
@@ -671,5 +671,35 @@ def cast(A):
                 assert False, "test failed, see failed test case above"
 
 
+def test_irregular_bitwidth_input():
+    def test_int(dtype):
+        hcl.init(dtype)
+        A = hcl.placeholder((10,), "A", dtype=dtype)
+        B = hcl.compute(A.shape, lambda *args: A[args] + 1, "B")
+        s = hcl.create_schedule([A, B])
+        f = hcl.build(s)
+        # A_np = np.random.randint(-10, 10, A.shape)
+        A_np = np.zeros(A.shape)
+        A_hcl = hcl.asarray(A_np, dtype=dtype)
+        B_hcl = hcl.asarray(np.zeros(A.shape), dtype=dtype)
+        f(A_hcl, B_hcl)
+        B_np = B_hcl.asnumpy()
+        if dtype.bits <= 64:
+            golden = hcl.asarray(A_np + 1, dtype=dtype).asnumpy()
+            assert np.allclose(golden, B_np)
+        else:
+            # B_np is a Python list here: asnumpy() returns a list when bits > 64
+            golden = [x + 1 for x in A_np.tolist()]
+            for res, g in zip(B_np, golden):
+                if res != g:
+                    print(f"res: {res}, hex: {hex(res)}\n")
+                    print(f"g: {g}, hex: {hex(g)}\n")
+                assert res == g
+
+    test_dtypes = [hcl.Int(2), hcl.Int(20), hcl.Int(63), hcl.Int(255), hcl.Int(512)]
+    for dtype in test_dtypes:
+        test_int(dtype)
+
+
 if __name__ == "__main__":
     pytest.main([__file__])

From acb4e2f58080fe9444987cdb25908f3446bb9506 Mon Sep 17 00:00:00 2001
From: Niansong Zhang <nz264@cornell.edu>
Date: Wed, 15 Mar 2023 16:41:02 -0400
Subject: [PATCH 15/15] [Test] Use random input

---
 tests/test_dtype.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_dtype.py b/tests/test_dtype.py
index 47439bc5..8100ac82 100644
--- a/tests/test_dtype.py
+++ b/tests/test_dtype.py
@@ -678,8 +678,8 @@ def test_int(dtype):
         B = hcl.compute(A.shape, lambda *args: A[args] + 1, "B")
         s = hcl.create_schedule([A, B])
         f = hcl.build(s)
-        # A_np = np.random.randint(-10, 10, A.shape)
-        A_np = np.zeros(A.shape)
+        A_np = np.random.randint(-10, 10, A.shape)
+        # A_np = np.zeros(A.shape)
         A_hcl = hcl.asarray(A_np, dtype=dtype)
         B_hcl = hcl.asarray(np.zeros(A.shape), dtype=dtype)
         f(A_hcl, B_hcl)