Skip to content

Commit

Permalink
Remove yield terminator for range_ and function call support for nd.a…
Browse files Browse the repository at this point in the history
…rray types
  • Loading branch information
hunhoffe committed Sep 20, 2024
1 parent 7366e04 commit ae7b161
Show file tree
Hide file tree
Showing 8 changed files with 278 additions and 262 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

bfloat16 = tf.bfloat16.as_numpy_dtype

from aie.dialects.scf import yield_, for_ as range_
from aie.extras.dialects.ext.scf import _for as range_
from aie.dialects.aiex import npu_dma_memcpy_nd, npu_sync

from aie.api.dataflow.inout.inout import MyInOutProgram
Expand Down Expand Up @@ -94,77 +94,41 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, vectorized):
num_data_tiles = (M // m) * (N // n)

# input/output matrices
memref_A_ty = np.ndarray[dtype_in, [M * K]]
memref_B_ty = np.ndarray[dtype_in, [K * N]]
memref_C_ty = np.ndarray[dtype_out, [M * N]]
A_ty = np.ndarray[dtype_in, (M * K,)]
B_ty = np.ndarray[dtype_in, (K * N,)]
C_ty = np.ndarray[dtype_out, (M * N,)]

# submatrices
memref_a_ty = np.ndarray[dtype_in, (m, k)]
memref_b_ty = np.ndarray[dtype_in, (k, n)]
memref_c_ty = np.ndarray[dtype_out, (m, n)]
a_ty = np.ndarray[dtype_in, (m, k)]
b_ty = np.ndarray[dtype_in, (k, n)]
c_ty = np.ndarray[dtype_out, (m, n)]

# AIE Core Function declarations
scalar_str = "" if vectorized else "scalar_"
zero = BinKernel(
f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [memref_c_ty]
)
zero = BinKernel(f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [c_ty])
matmul = BinKernel(
f"matmul_{scalar_str}{dtype_in_str}_{dtype_out_str}",
f"mm_{m}x{k}x{n}.o",
[memref_a_ty, memref_b_ty, memref_c_ty],
[a_ty, b_ty, c_ty],
)

inA = MyObjectFifo(2, memref_a_ty)
memA = MyObjectFifo(
2,
memref_a_ty,
dimensionsToStream=(
[
(m // r, r * k),
(k // s, s),
(r, k),
(s, 1),
]
if vectorized
else []
),
)
inA = MyObjectFifo(2, a_ty)
memAToStream = [(m // r, r * k), (k // s, s), (r, k), (s, 1)] if vectorized else []
memA = MyObjectFifo(2, a_ty, dimensionsToStream=memAToStream)
inALink = MyObjectFifoLink([inA.second], [memA.first], coords=(0, 1)) # AnyMemtile

# Input B
inB = MyObjectFifo(2, memref_b_ty)
memB = MyObjectFifo(
2,
memref_b_ty,
dimensionsToStream=(
[
(k // s, s * n),
(n // t, t),
(s, n),
(t, 1),
]
if vectorized
else []
),
)
inB = MyObjectFifo(2, b_ty)
memBToStream = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] if vectorized else []
memB = MyObjectFifo(2, b_ty, dimensionsToStream=memBToStream)
inBLink = MyObjectFifoLink([inB.second], [memB.first], coords=(0, 1)) # AnyMemtile

# Output C
memC = MyObjectFifo(2, memref_c_ty)
outC = MyObjectFifo(
2,
memref_c_ty,
dimensionsToStream=(
[
(m // r, r * n),
(r, t),
(n // t, r * t),
(t, 1),
]
if vectorized
else []
),
memC = MyObjectFifo(2, c_ty)
memCToStream = (
[(m // r, r * n), (r, t), (n // t, r * t), (t, 1)] if vectorized else []
)
outC = MyObjectFifo(2, c_ty, dimensionsToStream=memCToStream)
outCLink = MyObjectFifoLink(
[memC.second], [outC.first], coords=(0, 1)
) # AnyMemtile
Expand All @@ -183,13 +147,8 @@ def core_fn(a, b, c, zero, matmul):
matmul(elem_in_a, elem_in_b, elem_out)
a.release(1)
b.release(1)
if (K // k) > 1:
yield_([])

c.release(1)
if num_data_tiles > 1:
yield_([])
yield_([])

def sequence_fn(A, B, C, inA, inB, outC):
# only do 4 tile rows at a time before synchronizing, so we can reuse BDs
Expand Down Expand Up @@ -242,7 +201,7 @@ def sequence_fn(A, B, C, inA, inB, outC):

inout_program = MyInOutProgram(
sequence_fn,
[memref_A_ty, memref_B_ty, memref_C_ty],
[A_ty, B_ty, C_ty],
[inA.first, inB.first, outC.second],
coords=(0, 0), # AnyShim
)
Expand Down
164 changes: 72 additions & 92 deletions programming_examples/basic/matrix_scalar_add/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,17 @@
#
# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.dialects.scf import for_ as range_
from aie.extras.dialects.ext import memref, arith
from aie.extras.context import mlir_mod_ctx
import numpy as np
import sys

from aie.extras.dialects.ext.arith import constant
from aie.extras.dialects.ext.func import func
from aie.extras.ast import canonicalize

# from aie.extras.dialects.ext.scf import canonicalizer as scf_canonicalizer
# from aie.extras.dialects.ast.canonicalize import canonicalize

import sys
from aie.extras.dialects.ext.scf import _for as range_
from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram
from aie.api.dataflow.objectfifo import MyObjectFifo
from aie.api.phys.device import NPU1Col1, XCVC1902
from aie.api.program import MyProgram
from aie.api.worker import MyWorker

# Size of the entire image
IMAGE_HEIGHT = 16
Expand All @@ -37,83 +33,67 @@

objfifo_capacity = 4


def my_matrix_add_one():
    """Build the "add one to every element of a tile" design for one device.

    The device name is read from ``sys.argv[1]`` and the column index from
    ``sys.argv[2]``. Inside the device body this declares two tiles, one
    input and one output object fifo, a ``memfoo`` function that writes
    ``elem_in[i] + 1`` into ``elem_out[i]``, a core that applies ``memfoo``
    to one fifo element per iteration, and a runtime sequence that issues the
    two ``npu_dma_memcpy_nd`` transfers followed by ``npu_sync``.

    Raises:
        ValueError: If the command line does not carry exactly two arguments,
            or if the device name is neither ``"npu"`` nor ``"xcvc1902"``.
    """

    # Validate the command line before touching any device construct.
    if len(sys.argv) != 3:
        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
    if sys.argv[1] == "npu":
        dev = AIEDevice.npu1_1col
    elif sys.argv[1] == "xcvc1902":
        dev = AIEDevice.xcvc1902
    else:
        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

    @device(dev)
    def device_body():
        # Element type shared by both object fifos and by memfoo's arguments.
        memRef_ty = T.memref(TILE_SIZE, T.i32())

        # Tile declarations
        # Both tiles use the column given on the command line; they differ
        # only in the second coordinate (0 vs. 2).
        ShimTile = tile(int(sys.argv[2]), 0)
        ComputeTile2 = tile(int(sys.argv[2]), 2)

        # AIE-array data movement with object fifos
        # Input
        of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty)

        # Output
        of_out1 = object_fifo(
            "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
        )

        # TODO: should @canonicalize(using=scf_canonicalizer) decorate this after @func?
        # We need emit=True because the function must be emitted in the outer
        # loop (not deferred) to have access to the symbol table.
        @func(emit=True)
        def memfoo(elem_in: memRef_ty, elem_out: memRef_ty):
            # The constant is created once, before the loop, and reused by
            # every iteration.
            one = constant(1)
            for i in range_(TILE_SIZE):
                elem_out[i] = elem_in[i] + one
                # Explicit terminator for the loop body.
                yield_([])

        # Set up compute tile 2
        @core(ComputeTile2)
        def core_body():
            # Effective while(1)
            for _ in for_(sys.maxsize):
                # Take one element from each fifo, process it, then hand both
                # back in the same order they were acquired.
                elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
                elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
                memfoo(elem_in, elem_out)
                of_in1.release(ObjectFifoPort.Consume, 1)
                of_out1.release(ObjectFifoPort.Produce, 1)
                yield_([])

        # To/from AIE-array data movement

        tensor_ty = T.memref(TILE_SIZE, T.i32())

        @runtime_sequence(tensor_ty, tensor_ty)
        def sequence(inTensor, outTensor):
            # The output transfer ("out0") is issued before the input
            # transfer ("in0"); both use the same sizes and strides.
            npu_dma_memcpy_nd(
                metadata="out0",
                bd_id=0,
                mem=outTensor,
                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                strides=[1, 1, IMAGE_WIDTH, 1],
            )
            npu_dma_memcpy_nd(
                metadata="in0",
                bd_id=1,
                mem=inTensor,
                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                strides=[1, 1, IMAGE_WIDTH, 1],
            )
            # NOTE(review): presumably waits for the output transfer to
            # complete -- confirm against the npu_sync documentation.
            npu_sync(column=0, row=0, direction=0, channel=0)


# Build the design inside an MLIR module context, verify the resulting module,
# and print the module only when verification returns True; any other result
# is printed instead.
with mlir_mod_ctx() as ctx:
    my_matrix_add_one()
    res = ctx.module.operation.verify()
    if res == True:
        print(ctx.module)
    else:
        print(res)
# Command line: <device name> <column>. Anything else is rejected up front.
if len(sys.argv) != 3:
    raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
# Map the device name onto the matching device object.
if sys.argv[1] == "npu":
    dev = NPU1Col1()
elif sys.argv[1] == "xcvc1902":
    dev = XCVC1902()
else:
    raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))

# Element type and the one-dimensional tile type used by both fifos and by
# the kernel signature.
my_dtype = np.int32
tile_ty = np.ndarray[my_dtype, (TILE_SIZE,)]

# AIE-array data movement with object fifos
# One fifo for input and one for output, both with the same capacity and
# element type.
of_in = MyObjectFifo(objfifo_capacity, tile_ty)
of_out = MyObjectFifo(objfifo_capacity, tile_ty)


@func
def add_kernel(elem_in: tile_ty, elem_out: tile_ty):
    """Write ``elem_in[idx] + 1`` into ``elem_out[idx]`` for every index.

    Args:
        elem_in: Tile that is read.
        elem_out: Tile that receives the incremented values.
    """
    for idx in range_(TILE_SIZE):
        # Read first, then build the constant, then add -- the same
        # left-to-right order as a single combined expression.
        incremented = elem_in[idx] + constant(1)
        elem_out[idx] = incremented


def core_fn(of_in, of_out, add_kernel):
    """Apply ``add_kernel`` to one fifo element per loop iteration.

    Args:
        of_in: Fifo endpoint the input element is acquired from.
        of_out: Fifo endpoint the output element is acquired from.
        add_kernel: Callable invoked as ``add_kernel(input, output)``.
    """
    # Effective while(1)
    for _ in range_(sys.maxsize):
        # Acquire input before output, and release in that same order.
        src_tile = of_in.acquire(1)
        dst_tile = of_out.acquire(1)
        add_kernel(src_tile, dst_tile)
        of_in.release(1)
        of_out.release(1)


# Set up compute tile 2 TODO: clean up placement
# The worker runs core_fn with the consumer side of the input fifo, the
# producer side of the output fifo and the kernel; it is placed at the
# command-line column with second coordinate 2.
worker_program = MyWorker(
    core_fn,
    [of_in.second, of_out.first, add_kernel],
    coords=(int(sys.argv[2]), 2),
)

# To/from AIE-array data movement
# Input and output use identical sizes and strides; this program is placed at
# the command-line column with second coordinate 0.
# NOTE(review): presumably that coordinate is the shim tile -- confirm.
inout_program = SimpleFifoInOutProgram(
    of_in.first,
    TILE_SIZE,
    of_out.second,
    TILE_SIZE,
    in_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
    in_strides=[1, 1, IMAGE_WIDTH, 1],
    out_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
    out_strides=[1, 1, IMAGE_WIDTH, 1],
    dtype=my_dtype,
    coords=(int(sys.argv[2]), 0),
)

# Combine the device, the single worker and the in/out program, then resolve.
my_program = MyProgram(
    dev, worker_programs=[worker_program], inout_program=inout_program
)
my_program.resolve_program()

"""
TODOs:
* look into whether @canonicalize(using=scf_canonicalizer) should decorate the kernel after @func if we want control flow
* we need emit=True because the function must be emitted in the outer loop (not deferred) to have access to the symbol table
"""
Loading

0 comments on commit ae7b161

Please sign in to comment.