diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index 7842a53c0..8def3589f 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -12,7 +12,7 @@ bfloat16 = tf.bfloat16.as_numpy_dtype -from aie.dialects.scf import yield_, for_ as range_ +from aie.extras.dialects.ext.scf import _for as range_ from aie.dialects.aiex import npu_dma_memcpy_nd, npu_sync from aie.api.dataflow.inout.inout import MyInOutProgram @@ -94,77 +94,41 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, vectorized): num_data_tiles = (M // m) * (N // n) # input/output matrices - memref_A_ty = np.ndarray[dtype_in, [M * K]] - memref_B_ty = np.ndarray[dtype_in, [K * N]] - memref_C_ty = np.ndarray[dtype_out, [M * N]] + A_ty = np.ndarray[dtype_in, (M * K,)] + B_ty = np.ndarray[dtype_in, (K * N,)] + C_ty = np.ndarray[dtype_out, (M * N,)] # submatrices - memref_a_ty = np.ndarray[dtype_in, (m, k)] - memref_b_ty = np.ndarray[dtype_in, (k, n)] - memref_c_ty = np.ndarray[dtype_out, (m, n)] + a_ty = np.ndarray[dtype_in, (m, k)] + b_ty = np.ndarray[dtype_in, (k, n)] + c_ty = np.ndarray[dtype_out, (m, n)] # AIE Core Function declarations scalar_str = "" if vectorized else "scalar_" - zero = BinKernel( - f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [memref_c_ty] - ) + zero = BinKernel(f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [c_ty]) matmul = BinKernel( f"matmul_{scalar_str}{dtype_in_str}_{dtype_out_str}", f"mm_{m}x{k}x{n}.o", - [memref_a_ty, memref_b_ty, memref_c_ty], + [a_ty, b_ty, c_ty], ) - inA = MyObjectFifo(2, memref_a_ty) - memA = MyObjectFifo( - 2, - memref_a_ty, - dimensionsToStream=( - [ - (m // r, r * k), - (k // s, s), - (r, k), - (s, 1), - ] - if vectorized - else [] - ), - ) + inA = MyObjectFifo(2, a_ty) + memAToStream = [(m // r, r * k), (k // s, s), (r, k), (s, 1)] if vectorized else [] + memA = MyObjectFifo(2, a_ty, dimensionsToStream=memAToStream) inALink = MyObjectFifoLink([inA.second], [memA.first], coords=(0, 1)) # AnyMemtile # Input B - inB = MyObjectFifo(2, memref_b_ty) - memB = MyObjectFifo( - 2, - memref_b_ty, - dimensionsToStream=( - [ - (k // s, s * n), - (n // t, t), - (s, n), - (t, 1), - ] - if vectorized - else [] - ), - ) + inB = MyObjectFifo(2, b_ty) + memBToStream = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] if vectorized else [] + memB = MyObjectFifo(2, b_ty, dimensionsToStream=memBToStream) inBLink = MyObjectFifoLink([inB.second], [memB.first], coords=(0, 1)) # AnyMemtile # Output C - memC = MyObjectFifo(2, memref_c_ty) - outC = MyObjectFifo( - 2, - memref_c_ty, - dimensionsToStream=( - [ - (m // r, r * n), - (r, t), - (n // t, r * t), - (t, 1), - ] - if vectorized - else [] - ), + memC = MyObjectFifo(2, c_ty) + memCToStream = ( + [(m // r, r * n), (r, t), (n // t, r * t), (t, 1)] if vectorized else [] ) + outC = MyObjectFifo(2, c_ty, dimensionsToStream=memCToStream) outCLink = MyObjectFifoLink( [memC.second], [outC.first], coords=(0, 1) ) # AnyMemtile @@ -183,13 +147,8 @@ def core_fn(a, b, c, zero, matmul): matmul(elem_in_a, elem_in_b, elem_out) a.release(1) b.release(1) - if (K // k) > 1: - yield_([]) c.release(1) - if num_data_tiles > 1: - yield_([]) - yield_([]) def sequence_fn(A, B, C, inA, inB, outC): # only do 4 tile rows at a time before synchronizing, so we can reuse BDs @@ -242,7 +201,7 @@ def sequence_fn(A, B, C, inA, inB, outC): inout_program = MyInOutProgram( sequence_fn, - [memref_A_ty, memref_B_ty, memref_C_ty], + [A_ty, B_ty, C_ty], [inA.first, inB.first, outC.second], coords=(0, 0), # AnyShim ) diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py index 45922e0bf..1d910f390 100644 --- a/programming_examples/basic/matrix_scalar_add/aie2.py +++ b/programming_examples/basic/matrix_scalar_add/aie2.py @@ -6,21 +6,17 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates -from aie.dialects.aie import * -from aie.dialects.aiex import * -from aie.dialects.scf import * -from aie.dialects.scf import for_ as range_ -from aie.extras.dialects.ext import memref, arith -from aie.extras.context import mlir_mod_ctx +import numpy as np +import sys from aie.extras.dialects.ext.arith import constant from aie.extras.dialects.ext.func import func -from aie.extras.ast import canonicalize - -# from aie.extras.dialects.ext.scf import canonicalizer as scf_canonicalizer -# from aie.extras.dialects.ast.canonicalize import canonicalize - -import sys +from aie.extras.dialects.ext.scf import _for as range_ +from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram +from aie.api.dataflow.objectfifo import MyObjectFifo +from aie.api.phys.device import NPU1Col1, XCVC1902 +from aie.api.program import MyProgram +from aie.api.worker import MyWorker # Size of the entire image IMAGE_HEIGHT = 16 @@ -37,83 +33,67 @@ objfifo_capacity = 4 - -def my_matrix_add_one(): - - if len(sys.argv) != 3: - raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "npu": - dev = AIEDevice.npu1_1col - elif sys.argv[1] == "xcvc1902": - dev = AIEDevice.xcvc1902 - else: - raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - - @device(dev) - def device_body(): - memRef_ty = T.memref(TILE_SIZE, T.i32()) - - # Tile declarations - ShimTile = tile(int(sys.argv[2]), 0) - ComputeTile2 = tile(int(sys.argv[2]), 2) - - # AIE-array data movement with object fifos - # Input - of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty) - - # Output - of_out1 = object_fifo( - "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty - ) - - # @canonicalize(using=scf_canonicalizer) shoudl decorate this after func? - # we need emit = true because must be emited in outer loop (not deferred) to have access to symbol table - @func(emit=True) - def memfoo(elem_in: memRef_ty, elem_out: memRef_ty): - one = constant(1) - for i in range_(TILE_SIZE): - elem_out[i] = elem_in[i] + one - yield_([]) - - # Set up compute tile 2 - @core(ComputeTile2) - def core_body(): - # Effective while(1) - for _ in for_(sys.maxsize): - elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - memfoo(elem_in, elem_out) - of_in1.release(ObjectFifoPort.Consume, 1) - of_out1.release(ObjectFifoPort.Produce, 1) - yield_([]) - - # To/from AIE-array data movement - - tensor_ty = T.memref(TILE_SIZE, T.i32()) - - @runtime_sequence(tensor_ty, tensor_ty) - def sequence(inTensor, outTensor): - npu_dma_memcpy_nd( - metadata="out0", - bd_id=0, - mem=outTensor, - sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], - strides=[1, 1, IMAGE_WIDTH, 1], - ) - npu_dma_memcpy_nd( - metadata="in0", - bd_id=1, - mem=inTensor, - sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], - strides=[1, 1, IMAGE_WIDTH, 1], - ) - npu_sync(column=0, row=0, direction=0, channel=0) - - -with mlir_mod_ctx() as ctx: - my_matrix_add_one() - res = ctx.module.operation.verify() - if res == True: - print(ctx.module) - else: - print(res) +if len(sys.argv) != 3: + raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") +if sys.argv[1] == "npu": + dev = NPU1Col1() +elif sys.argv[1] == "xcvc1902": + dev = XCVC1902() +else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + +my_dtype = np.int32 +tile_ty = np.ndarray[my_dtype, (TILE_SIZE,)] + +# AIE-array data movement with object fifos +of_in = MyObjectFifo(objfifo_capacity, tile_ty) +of_out = MyObjectFifo(objfifo_capacity, tile_ty) + + +@func +def add_kernel(elem_in: tile_ty, elem_out: tile_ty): + for i in range_(TILE_SIZE): + elem_out[i] = elem_in[i] + constant(1) + + +def core_fn(of_in, of_out, add_kernel): + # Effective while(1) + for _ in range_(sys.maxsize): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + add_kernel(elem_in, elem_out) + of_in.release(1) + of_out.release(1) + + +# Set up compute tile 2 TODO: clean up placement +worker_program = MyWorker( + core_fn, + [of_in.second, of_out.first, add_kernel], + coords=(int(sys.argv[2]), 2), +) + +# To/from AIE-array data movement +inout_program = SimpleFifoInOutProgram( + of_in.first, + TILE_SIZE, + of_out.second, + TILE_SIZE, + in_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], + in_strides=[1, 1, IMAGE_WIDTH, 1], + out_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], + out_strides=[1, 1, IMAGE_WIDTH, 1], + dtype=my_dtype, + coords=(int(sys.argv[2]), 0), +) + +my_program = MyProgram( + dev, worker_programs=[worker_program], inout_program=inout_program +) +my_program.resolve_program() + +""" +TODOs: +* look into # @canonicalize(using=scf_canonicalizer) shoudl decorate this after func if we want control flow +* we need emit = true because must be emited in outer loop (not deferred) to have access to symbol table +""" diff --git a/programming_examples/basic/matrix_scalar_add/aie2_new.py b/programming_examples/basic/matrix_scalar_add/aie2_new.py deleted file mode 100644 index 6bfc0bc83..000000000 --- a/programming_examples/basic/matrix_scalar_add/aie2_new.py +++ /dev/null @@ -1,103 +0,0 @@ -# matrix_scalar_add/aie2.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates - -import numpy as np -import sys - -from aie.dialects.scf import for_ as range_ -from aie.dialects.scf import yield_ -from aie.dialects.aie import T -from aie.extras.dialects.ext import memref, arith - -from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram -from aie.api.dataflow.objectfifo import MyObjectFifo -from aie.api.kernels.bykernel import PyKernel -from aie.api.phys.device import NPU1Col1, XCVC1902 -from aie.api.program import MyProgram -from aie.api.worker import MyWorker - -# Size of the entire image -IMAGE_HEIGHT = 16 -IMAGE_WIDTH = 128 -IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT - -# Size of the tile we are processing -TILE_HEIGHT = 8 -TILE_WIDTH = 16 -TILE_SIZE = TILE_WIDTH * TILE_HEIGHT - -NUM_3D = IMAGE_WIDTH / TILE_WIDTH -NUM_4D = IMAGE_HEIGHT / TILE_HEIGHT - -objfifo_capacity = 4 - - -def my_matrix_add_one(): - if len(sys.argv) != 3: - raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "npu": - dev = NPU1Col1() - elif sys.argv[1] == "xcvc1902": - dev = XCVC1902() - else: - raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) - - my_dtype = np.int32 - tile_ty = np.ndarray[my_dtype, (TILE_SIZE,)] - - # AIE-array data movement with object fifos - of_in = MyObjectFifo(objfifo_capacity, tile_ty) - of_out = MyObjectFifo(objfifo_capacity, tile_ty) - - # TODO: make typed? should have type type_ty - def my_pykernel(elem_in, elem_out): - # No metaprogramming, interpretted literally - for i in range(TILE_SIZE): - elem_out[i] = elem_in[i] + 1 - - py_add_kernel = PyKernel(my_pykernel) - - # Set up compute tile 2 - def core_fn(of_in, of_out, add_kernel): - # Effective while(1) - for _ in range_(sys.maxsize): - elem_in = of_in.acquire(1) - elem_out = of_out.acquire(1) - add_kernel(elem_in, elem_out) - of_in.release(1) - of_out.release(1) - yield_([]) - - # TODO: clean up placement - worker_program = MyWorker( - core_fn, - [of_in.second, of_out.first, py_add_kernel], - coords=(int(sys.argv[2]), 2), - ) - - # To/from AIE-array data movement - inout_program = SimpleFifoInOutProgram( - of_in.first, - TILE_SIZE, - of_out.second, - TILE_SIZE, - in_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], - in_strides=[1, 1, IMAGE_WIDTH, 1], - out_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], - out_strides=[1, 1, IMAGE_WIDTH, 1], - dtype=my_dtype, - coords=(int(sys.argv[2]), 0), - ) - - my_program = MyProgram( - dev, worker_programs=[worker_program], inout_program=inout_program - ) - my_program.resolve_program() - - -my_matrix_add_one() diff --git a/programming_examples/basic/matrix_scalar_add/aie2_old.py b/programming_examples/basic/matrix_scalar_add/aie2_old.py new file mode 100644 index 000000000..45922e0bf --- /dev/null +++ b/programming_examples/basic/matrix_scalar_add/aie2_old.py @@ -0,0 +1,119 @@ +# matrix_scalar_add/aie2.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.dialects.scf import for_ as range_ +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +from aie.extras.dialects.ext.arith import constant +from aie.extras.dialects.ext.func import func +from aie.extras.ast import canonicalize + +# from aie.extras.dialects.ext.scf import canonicalizer as scf_canonicalizer +# from aie.extras.dialects.ast.canonicalize import canonicalize + +import sys + +# Size of the entire image +IMAGE_HEIGHT = 16 +IMAGE_WIDTH = 128 +IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT + +# Size of the tile we are processing +TILE_HEIGHT = 8 +TILE_WIDTH = 16 +TILE_SIZE = TILE_WIDTH * TILE_HEIGHT + +NUM_3D = IMAGE_WIDTH / TILE_WIDTH +NUM_4D = IMAGE_HEIGHT / TILE_HEIGHT + +objfifo_capacity = 4 + + +def my_matrix_add_one(): + + if len(sys.argv) != 3: + raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") + if sys.argv[1] == "npu": + dev = AIEDevice.npu1_1col + elif sys.argv[1] == "xcvc1902": + dev = AIEDevice.xcvc1902 + else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) + + @device(dev) + def device_body(): + memRef_ty = T.memref(TILE_SIZE, T.i32()) + + # Tile declarations + ShimTile = tile(int(sys.argv[2]), 0) + ComputeTile2 = tile(int(sys.argv[2]), 2) + + # AIE-array data movement with object fifos + # Input + of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty) + + # Output + of_out1 = object_fifo( + "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty + ) + + # @canonicalize(using=scf_canonicalizer) shoudl decorate this after func? + # we need emit = true because must be emited in outer loop (not deferred) to have access to symbol table + @func(emit=True) + def memfoo(elem_in: memRef_ty, elem_out: memRef_ty): + one = constant(1) + for i in range_(TILE_SIZE): + elem_out[i] = elem_in[i] + one + yield_([]) + + # Set up compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in for_(sys.maxsize): + elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) + elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) + memfoo(elem_in, elem_out) + of_in1.release(ObjectFifoPort.Consume, 1) + of_out1.release(ObjectFifoPort.Produce, 1) + yield_([]) + + # To/from AIE-array data movement + + tensor_ty = T.memref(TILE_SIZE, T.i32()) + + @runtime_sequence(tensor_ty, tensor_ty) + def sequence(inTensor, outTensor): + npu_dma_memcpy_nd( + metadata="out0", + bd_id=0, + mem=outTensor, + sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], + strides=[1, 1, IMAGE_WIDTH, 1], + ) + npu_dma_memcpy_nd( + metadata="in0", + bd_id=1, + mem=inTensor, + sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], + strides=[1, 1, IMAGE_WIDTH, 1], + ) + npu_sync(column=0, row=0, direction=0, channel=0) + + +with mlir_mod_ctx() as ctx: + my_matrix_add_one() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index 057e22a74..ee91adf6d 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -8,8 +8,8 @@ import sys import numpy as np -from aie.dialects.scf import for_ as range_ -from aie.dialects.scf import yield_ +# TODO: move maybe to aie.api.controlflow +from aie.extras.dialects.ext.scf import _for as range_ from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram from aie.api.dataflow.objectfifo import MyObjectFifo @@ -48,7 +48,6 @@ def core_fn(of_in, of_out, passThroughLine): passThroughLine(elemIn, elemOut, line_size) of_in.release(1) of_out.release(1) - yield_([]) # TODO: clean up placement diff --git a/python/api/program.py b/python/api/program.py index 975b9c04b..e4846bee7 100644 --- a/python/api/program.py +++ b/python/api/program.py @@ -6,7 +6,9 @@ """ from ..extras.context import mlir_mod_ctx +from ..extras.dialects.ext.func import FuncBase from ..dialects.aie import device + from .worker import MyWorker from .phys.device import MyDevice from .dataflow.inout.inout import InOutProgram @@ -44,7 +46,10 @@ def device_body(): # generate fifos (and external functions) for w in self.__worker_programs: for arg in w.fn_args: - arg.resolve() + if isinstance(arg, FuncBase): + arg.emit() + else: + arg.resolve() self._print_verify(ctx) for f in self.__inout_program.get_fifos(): f.resolve() diff --git a/python/extras/dialects/ext/func.py b/python/extras/dialects/ext/func.py index 0432de4bd..6229f575c 100644 --- a/python/extras/dialects/ext/func.py +++ b/python/extras/dialects/ext/func.py @@ -1,6 +1,9 @@ import sys +import numpy as np from functools import update_wrapper +from typing import get_origin +from ....api.tensor import MyTensorType from ...meta import op_region_builder from ...util import get_user_code_loc, make_maybe_no_args_decorator, get_arg_types from ....dialects._ods_common import get_op_result_or_op_results @@ -109,9 +112,11 @@ def prep_func_types(sig, return_types): for p in sig.parameters.values() if not p.annotation is inspect.Signature.empty ] + # convert ndarray types to memref types assert all( - isinstance(r, (str, Type)) or isalambda(r) for r in input_types - ), f"all input types must be mlir types {input_types=}" + isinstance(r, (str, Type)) or isalambda(r) or get_origin(r) == np.ndarray + for r in input_types + ), f"all input types must be mlir types or ndarrays (tensors) {input_types=}" user_loc = get_user_code_loc() # If ir.Context is none (like for deferred func emit) if user_loc is None: @@ -197,6 +202,8 @@ def emit(self, *call_args, decl=False, force=False) -> FuncOp: input_types[i] = Type(eval(v, self.body_builder.__globals__)) elif isalambda(v): input_types[i] = v() + elif get_origin(v) == np.ndarray: + input_types[i] = MyTensorType.get_memref_type(v) else: input_types = get_arg_types(call_args) diff --git a/python/extras/dialects/ext/scf.py b/python/extras/dialects/ext/scf.py new file mode 100644 index 000000000..ca466625f --- /dev/null +++ b/python/extras/dialects/ext/scf.py @@ -0,0 +1,50 @@ +from typing import Optional, Sequence + +from ....ir import InsertionPoint, Value +from ....dialects.linalg.opdsl.lang.emitter import _is_index_type +from ....dialects.scf import ForOp, yield_ + +from .arith import constant, index_cast + + +def _for( + start, + stop=None, + step=None, + iter_args: Optional[Sequence[Value]] = None, + insert_yield: bool = True, + *, + loc=None, + ip=None, +): + if step is None: + step = 1 + if stop is None: + stop = start + start = 0 + params = [start, stop, step] + for i, p in enumerate(params): + if isinstance(p, int): + p = constant(p, index=True) + if not _is_index_type(p.type): + p = index_cast(p) + params[i] = p + + start, stop, step = params + + for_op = ForOp(start, stop, step, iter_args, loc=loc, ip=ip) + iv = for_op.induction_variable + iter_args = tuple(for_op.inner_iter_args) + with InsertionPoint(for_op.body): + if len(iter_args) > 1: + yield iv, iter_args, for_op.results + # print("HI") + elif len(iter_args) == 1: + yield iv, iter_args[0], for_op.results[0] + # print("HI") + else: + # print("HELLO") + yield iv + # print("HI1") + if insert_yield: + yield_([])