diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index 7842a53c0..8def3589f 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -12,7 +12,7 @@
 
 bfloat16 = tf.bfloat16.as_numpy_dtype
 
-from aie.dialects.scf import yield_, for_ as range_
+from aie.extras.dialects.ext.scf import _for as range_
 from aie.dialects.aiex import npu_dma_memcpy_nd, npu_sync
 
 from aie.api.dataflow.inout.inout import MyInOutProgram
@@ -94,77 +94,41 @@ def my_matmul(M, K, N, m, k, n, dtype_in_str, dtype_out_str, vectorized):
     num_data_tiles = (M // m) * (N // n)
 
     # input/output matrices
-    memref_A_ty = np.ndarray[dtype_in, [M * K]]
-    memref_B_ty = np.ndarray[dtype_in, [K * N]]
-    memref_C_ty = np.ndarray[dtype_out, [M * N]]
+    A_ty = np.ndarray[dtype_in, (M * K,)]
+    B_ty = np.ndarray[dtype_in, (K * N,)]
+    C_ty = np.ndarray[dtype_out, (M * N,)]
 
     # submatrices
-    memref_a_ty = np.ndarray[dtype_in, (m, k)]
-    memref_b_ty = np.ndarray[dtype_in, (k, n)]
-    memref_c_ty = np.ndarray[dtype_out, (m, n)]
+    a_ty = np.ndarray[dtype_in, (m, k)]
+    b_ty = np.ndarray[dtype_in, (k, n)]
+    c_ty = np.ndarray[dtype_out, (m, n)]
 
     # AIE Core Function declarations
     scalar_str = "" if vectorized else "scalar_"
-    zero = BinKernel(
-        f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [memref_c_ty]
-    )
+    zero = BinKernel(f"zero_{scalar_str}{dtype_out_str}", f"mm_{m}x{k}x{n}.o", [c_ty])
     matmul = BinKernel(
         f"matmul_{scalar_str}{dtype_in_str}_{dtype_out_str}",
         f"mm_{m}x{k}x{n}.o",
-        [memref_a_ty, memref_b_ty, memref_c_ty],
+        [a_ty, b_ty, c_ty],
     )
 
-    inA = MyObjectFifo(2, memref_a_ty)
-    memA = MyObjectFifo(
-        2,
-        memref_a_ty,
-        dimensionsToStream=(
-            [
-                (m // r, r * k),
-                (k // s, s),
-                (r, k),
-                (s, 1),
-            ]
-            if vectorized
-            else []
-        ),
-    )
+    inA = MyObjectFifo(2, a_ty)
+    memAToStream = [(m // r, r * k), (k // s, s), (r, k), (s, 1)] if vectorized else []
+    memA = MyObjectFifo(2, a_ty, dimensionsToStream=memAToStream)
     inALink = MyObjectFifoLink([inA.second], [memA.first], coords=(0, 1))  # AnyMemtile
 
     # Input B
-    inB = MyObjectFifo(2, memref_b_ty)
-    memB = MyObjectFifo(
-        2,
-        memref_b_ty,
-        dimensionsToStream=(
-            [
-                (k // s, s * n),
-                (n // t, t),
-                (s, n),
-                (t, 1),
-            ]
-            if vectorized
-            else []
-        ),
-    )
+    inB = MyObjectFifo(2, b_ty)
+    memBToStream = [(k // s, s * n), (n // t, t), (s, n), (t, 1)] if vectorized else []
+    memB = MyObjectFifo(2, b_ty, dimensionsToStream=memBToStream)
     inBLink = MyObjectFifoLink([inB.second], [memB.first], coords=(0, 1))  # AnyMemtile
 
     # Output C
-    memC = MyObjectFifo(2, memref_c_ty)
-    outC = MyObjectFifo(
-        2,
-        memref_c_ty,
-        dimensionsToStream=(
-            [
-                (m // r, r * n),
-                (r, t),
-                (n // t, r * t),
-                (t, 1),
-            ]
-            if vectorized
-            else []
-        ),
+    memC = MyObjectFifo(2, c_ty)
+    memCToStream = (
+        [(m // r, r * n), (r, t), (n // t, r * t), (t, 1)] if vectorized else []
     )
+    outC = MyObjectFifo(2, c_ty, dimensionsToStream=memCToStream)
     outCLink = MyObjectFifoLink(
         [memC.second], [outC.first], coords=(0, 1)
     )  # AnyMemtile
@@ -183,13 +147,8 @@ def core_fn(a, b, c, zero, matmul):
                     matmul(elem_in_a, elem_in_b, elem_out)
                     a.release(1)
                     b.release(1)
-                    if (K // k) > 1:
-                        yield_([])
 
                 c.release(1)
-                if num_data_tiles > 1:
-                    yield_([])
-            yield_([])
 
     def sequence_fn(A, B, C, inA, inB, outC):
         # only do 4 tile rows at a time before synchronizing, so we can reuse BDs
@@ -242,7 +201,7 @@ def sequence_fn(A, B, C, inA, inB, outC):
 
     inout_program = MyInOutProgram(
         sequence_fn,
-        [memref_A_ty, memref_B_ty, memref_C_ty],
+        [A_ty, B_ty, C_ty],
         [inA.first, inB.first, outC.second],
         coords=(0, 0),  # AnyShim
     )
diff --git a/programming_examples/basic/matrix_scalar_add/aie2.py b/programming_examples/basic/matrix_scalar_add/aie2.py
index 45922e0bf..1d910f390 100644
--- a/programming_examples/basic/matrix_scalar_add/aie2.py
+++ b/programming_examples/basic/matrix_scalar_add/aie2.py
@@ -6,21 +6,17 @@
 #
 # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
 
-from aie.dialects.aie import *
-from aie.dialects.aiex import *
-from aie.dialects.scf import *
-from aie.dialects.scf import for_ as range_
-from aie.extras.dialects.ext import memref, arith
-from aie.extras.context import mlir_mod_ctx
+import numpy as np
+import sys
 
 from aie.extras.dialects.ext.arith import constant
 from aie.extras.dialects.ext.func import func
-from aie.extras.ast import canonicalize
-
-# from aie.extras.dialects.ext.scf import canonicalizer as scf_canonicalizer
-# from aie.extras.dialects.ast.canonicalize import canonicalize
-
-import sys
+from aie.extras.dialects.ext.scf import _for as range_
+from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram
+from aie.api.dataflow.objectfifo import MyObjectFifo
+from aie.api.phys.device import NPU1Col1, XCVC1902
+from aie.api.program import MyProgram
+from aie.api.worker import MyWorker
 
 # Size of the entire image
 IMAGE_HEIGHT = 16
@@ -37,83 +33,67 @@
 
 objfifo_capacity = 4
 
-
-def my_matrix_add_one():
-
-    if len(sys.argv) != 3:
-        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
-    if sys.argv[1] == "npu":
-        dev = AIEDevice.npu1_1col
-    elif sys.argv[1] == "xcvc1902":
-        dev = AIEDevice.xcvc1902
-    else:
-        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
-    @device(dev)
-    def device_body():
-        memRef_ty = T.memref(TILE_SIZE, T.i32())
-
-        # Tile declarations
-        ShimTile = tile(int(sys.argv[2]), 0)
-        ComputeTile2 = tile(int(sys.argv[2]), 2)
-
-        # AIE-array data movement with object fifos
-        # Input
-        of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty)
-
-        # Output
-        of_out1 = object_fifo(
-            "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
-        )
-
-        # @canonicalize(using=scf_canonicalizer) shoudl decorate this after func?
-        # we need emit = true because must be emited in outer loop (not deferred) to have access to symbol table
-        @func(emit=True)
-        def memfoo(elem_in: memRef_ty, elem_out: memRef_ty):
-            one = constant(1)
-            for i in range_(TILE_SIZE):
-                elem_out[i] = elem_in[i] + one
-                yield_([])
-
-        # Set up compute tile 2
-        @core(ComputeTile2)
-        def core_body():
-            # Effective while(1)
-            for _ in for_(sys.maxsize):
-                elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
-                elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
-                memfoo(elem_in, elem_out)
-                of_in1.release(ObjectFifoPort.Consume, 1)
-                of_out1.release(ObjectFifoPort.Produce, 1)
-                yield_([])
-
-        # To/from AIE-array data movement
-
-        tensor_ty = T.memref(TILE_SIZE, T.i32())
-
-        @runtime_sequence(tensor_ty, tensor_ty)
-        def sequence(inTensor, outTensor):
-            npu_dma_memcpy_nd(
-                metadata="out0",
-                bd_id=0,
-                mem=outTensor,
-                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH, 1],
-            )
-            npu_dma_memcpy_nd(
-                metadata="in0",
-                bd_id=1,
-                mem=inTensor,
-                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-                strides=[1, 1, IMAGE_WIDTH, 1],
-            )
-            npu_sync(column=0, row=0, direction=0, channel=0)
-
-
-with mlir_mod_ctx() as ctx:
-    my_matrix_add_one()
-    res = ctx.module.operation.verify()
-    if res == True:
-        print(ctx.module)
-    else:
-        print(res)
+if len(sys.argv) != 3:
+    raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+if sys.argv[1] == "npu":
+    dev = NPU1Col1()
+elif sys.argv[1] == "xcvc1902":
+    dev = XCVC1902()
+else:
+    raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+my_dtype = np.int32
+tile_ty = np.ndarray[my_dtype, (TILE_SIZE,)]
+
+# AIE-array data movement with object fifos
+of_in = MyObjectFifo(objfifo_capacity, tile_ty)
+of_out = MyObjectFifo(objfifo_capacity, tile_ty)
+
+
+@func
+def add_kernel(elem_in: tile_ty, elem_out: tile_ty):
+    for i in range_(TILE_SIZE):
+        elem_out[i] = elem_in[i] + constant(1)
+
+
+def core_fn(of_in, of_out, add_kernel):
+    # Effective while(1)
+    for _ in range_(sys.maxsize):
+        elem_in = of_in.acquire(1)
+        elem_out = of_out.acquire(1)
+        add_kernel(elem_in, elem_out)
+        of_in.release(1)
+        of_out.release(1)
+
+
+# Set up compute tile 2 TODO: clean up placement
+worker_program = MyWorker(
+    core_fn,
+    [of_in.second, of_out.first, add_kernel],
+    coords=(int(sys.argv[2]), 2),
+)
+
+# To/from AIE-array data movement
+inout_program = SimpleFifoInOutProgram(
+    of_in.first,
+    TILE_SIZE,
+    of_out.second,
+    TILE_SIZE,
+    in_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+    in_strides=[1, 1, IMAGE_WIDTH, 1],
+    out_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+    out_strides=[1, 1, IMAGE_WIDTH, 1],
+    dtype=my_dtype,
+    coords=(int(sys.argv[2]), 0),
+)
+
+my_program = MyProgram(
+    dev, worker_programs=[worker_program], inout_program=inout_program
+)
+my_program.resolve_program()
+
+"""
+TODOs:
+* look into whether @canonicalize(using=scf_canonicalizer) should decorate this after func if we want control flow
+* we need emit=True because the function must be emitted in the outer loop (not deferred) to have access to the symbol table
+"""
diff --git a/programming_examples/basic/matrix_scalar_add/aie2_new.py b/programming_examples/basic/matrix_scalar_add/aie2_new.py
deleted file mode 100644
index 6bfc0bc83..000000000
--- a/programming_examples/basic/matrix_scalar_add/aie2_new.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# matrix_scalar_add/aie2.py -*- Python -*-
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
-
-import numpy as np
-import sys
-
-from aie.dialects.scf import for_ as range_
-from aie.dialects.scf import yield_
-from aie.dialects.aie import T
-from aie.extras.dialects.ext import memref, arith
-
-from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram
-from aie.api.dataflow.objectfifo import MyObjectFifo
-from aie.api.kernels.bykernel import PyKernel
-from aie.api.phys.device import NPU1Col1, XCVC1902
-from aie.api.program import MyProgram
-from aie.api.worker import MyWorker
-
-# Size of the entire image
-IMAGE_HEIGHT = 16
-IMAGE_WIDTH = 128
-IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT
-
-# Size of the tile we are processing
-TILE_HEIGHT = 8
-TILE_WIDTH = 16
-TILE_SIZE = TILE_WIDTH * TILE_HEIGHT
-
-NUM_3D = IMAGE_WIDTH / TILE_WIDTH
-NUM_4D = IMAGE_HEIGHT / TILE_HEIGHT
-
-objfifo_capacity = 4
-
-
-def my_matrix_add_one():
-    if len(sys.argv) != 3:
-        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
-    if sys.argv[1] == "npu":
-        dev = NPU1Col1()
-    elif sys.argv[1] == "xcvc1902":
-        dev = XCVC1902()
-    else:
-        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
-
-    my_dtype = np.int32
-    tile_ty = np.ndarray[my_dtype, (TILE_SIZE,)]
-
-    # AIE-array data movement with object fifos
-    of_in = MyObjectFifo(objfifo_capacity, tile_ty)
-    of_out = MyObjectFifo(objfifo_capacity, tile_ty)
-
-    # TODO: make typed? should have type type_ty
-    def my_pykernel(elem_in, elem_out):
-        # No metaprogramming, interpretted literally
-        for i in range(TILE_SIZE):
-            elem_out[i] = elem_in[i] + 1
-
-    py_add_kernel = PyKernel(my_pykernel)
-
-    # Set up compute tile 2
-    def core_fn(of_in, of_out, add_kernel):
-        # Effective while(1)
-        for _ in range_(sys.maxsize):
-            elem_in = of_in.acquire(1)
-            elem_out = of_out.acquire(1)
-            add_kernel(elem_in, elem_out)
-            of_in.release(1)
-            of_out.release(1)
-            yield_([])
-
-    # TODO: clean up placement
-    worker_program = MyWorker(
-        core_fn,
-        [of_in.second, of_out.first, py_add_kernel],
-        coords=(int(sys.argv[2]), 2),
-    )
-
-    # To/from AIE-array data movement
-    inout_program = SimpleFifoInOutProgram(
-        of_in.first,
-        TILE_SIZE,
-        of_out.second,
-        TILE_SIZE,
-        in_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-        in_strides=[1, 1, IMAGE_WIDTH, 1],
-        out_sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
-        out_strides=[1, 1, IMAGE_WIDTH, 1],
-        dtype=my_dtype,
-        coords=(int(sys.argv[2]), 0),
-    )
-
-    my_program = MyProgram(
-        dev, worker_programs=[worker_program], inout_program=inout_program
-    )
-    my_program.resolve_program()
-
-
-my_matrix_add_one()
diff --git a/programming_examples/basic/matrix_scalar_add/aie2_old.py b/programming_examples/basic/matrix_scalar_add/aie2_old.py
new file mode 100644
index 000000000..45922e0bf
--- /dev/null
+++ b/programming_examples/basic/matrix_scalar_add/aie2_old.py
@@ -0,0 +1,119 @@
+# matrix_scalar_add/aie2.py -*- Python -*-
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.dialects.scf import for_ as range_
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+from aie.extras.dialects.ext.arith import constant
+from aie.extras.dialects.ext.func import func
+from aie.extras.ast import canonicalize
+
+# from aie.extras.dialects.ext.scf import canonicalizer as scf_canonicalizer
+# from aie.extras.dialects.ast.canonicalize import canonicalize
+
+import sys
+
+# Size of the entire image
+IMAGE_HEIGHT = 16
+IMAGE_WIDTH = 128
+IMAGE_SIZE = IMAGE_WIDTH * IMAGE_HEIGHT
+
+# Size of the tile we are processing
+TILE_HEIGHT = 8
+TILE_WIDTH = 16
+TILE_SIZE = TILE_WIDTH * TILE_HEIGHT
+
+NUM_3D = IMAGE_WIDTH / TILE_WIDTH
+NUM_4D = IMAGE_HEIGHT / TILE_HEIGHT
+
+objfifo_capacity = 4
+
+
+def my_matrix_add_one():
+
+    if len(sys.argv) != 3:
+        raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu1_1col
+    elif sys.argv[1] == "xcvc1902":
+        dev = AIEDevice.xcvc1902
+    else:
+        raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1]))
+
+    @device(dev)
+    def device_body():
+        memRef_ty = T.memref(TILE_SIZE, T.i32())
+
+        # Tile declarations
+        ShimTile = tile(int(sys.argv[2]), 0)
+        ComputeTile2 = tile(int(sys.argv[2]), 2)
+
+        # AIE-array data movement with object fifos
+        # Input
+        of_in1 = object_fifo("in0", ShimTile, ComputeTile2, objfifo_capacity, memRef_ty)
+
+        # Output
+        of_out1 = object_fifo(
+            "out0", ComputeTile2, ShimTile, objfifo_capacity, memRef_ty
+        )
+
+        # @canonicalize(using=scf_canonicalizer) should decorate this after func?
+        # we need emit=True because the function must be emitted in the outer loop (not deferred) to have access to the symbol table
+        @func(emit=True)
+        def memfoo(elem_in: memRef_ty, elem_out: memRef_ty):
+            one = constant(1)
+            for i in range_(TILE_SIZE):
+                elem_out[i] = elem_in[i] + one
+                yield_([])
+
+        # Set up compute tile 2
+        @core(ComputeTile2)
+        def core_body():
+            # Effective while(1)
+            for _ in for_(sys.maxsize):
+                elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+                elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+                memfoo(elem_in, elem_out)
+                of_in1.release(ObjectFifoPort.Consume, 1)
+                of_out1.release(ObjectFifoPort.Produce, 1)
+                yield_([])
+
+        # To/from AIE-array data movement
+
+        tensor_ty = T.memref(TILE_SIZE, T.i32())
+
+        @runtime_sequence(tensor_ty, tensor_ty)
+        def sequence(inTensor, outTensor):
+            npu_dma_memcpy_nd(
+                metadata="out0",
+                bd_id=0,
+                mem=outTensor,
+                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH, 1],
+            )
+            npu_dma_memcpy_nd(
+                metadata="in0",
+                bd_id=1,
+                mem=inTensor,
+                sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
+                strides=[1, 1, IMAGE_WIDTH, 1],
+            )
+            npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+with mlir_mod_ctx() as ctx:
+    my_matrix_add_one()
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)
+    else:
+        print(res)
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 057e22a74..ee91adf6d 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -8,8 +8,8 @@
 import sys
 import numpy as np
 
-from aie.dialects.scf import for_ as range_
-from aie.dialects.scf import yield_
+# TODO: maybe move this to aie.api.controlflow
+from aie.extras.dialects.ext.scf import _for as range_
 
 from aie.api.dataflow.inout.simplefifoinout import SimpleFifoInOutProgram
 from aie.api.dataflow.objectfifo import MyObjectFifo
@@ -48,7 +48,6 @@ def core_fn(of_in, of_out, passThroughLine):
         passThroughLine(elemIn, elemOut, line_size)
         of_in.release(1)
         of_out.release(1)
-        yield_([])
 
 
 # TODO: clean up placement
diff --git a/python/api/program.py b/python/api/program.py
index 975b9c04b..e4846bee7 100644
--- a/python/api/program.py
+++ b/python/api/program.py
@@ -6,7 +6,9 @@
 """
 
 from ..extras.context import mlir_mod_ctx
+from ..extras.dialects.ext.func import FuncBase
 from ..dialects.aie import device
+
 from .worker import MyWorker
 from .phys.device import MyDevice
 from .dataflow.inout.inout import InOutProgram
@@ -44,7 +46,10 @@ def device_body():
                 # generate fifos (and external functions)
                 for w in self.__worker_programs:
                     for arg in w.fn_args:
-                        arg.resolve()
+                        if isinstance(arg, FuncBase):
+                            arg.emit()
+                        else:
+                            arg.resolve()
                         self._print_verify(ctx)
                 for f in self.__inout_program.get_fifos():
                     f.resolve()
diff --git a/python/extras/dialects/ext/func.py b/python/extras/dialects/ext/func.py
index 0432de4bd..6229f575c 100644
--- a/python/extras/dialects/ext/func.py
+++ b/python/extras/dialects/ext/func.py
@@ -1,6 +1,9 @@
 import sys
+import numpy as np
 from functools import update_wrapper
+from typing import get_origin
 
+from ....api.tensor import MyTensorType
 from ...meta import op_region_builder
 from ...util import get_user_code_loc, make_maybe_no_args_decorator, get_arg_types
 from ....dialects._ods_common import get_op_result_or_op_results
@@ -109,9 +112,11 @@ def prep_func_types(sig, return_types):
         for p in sig.parameters.values()
         if not p.annotation is inspect.Signature.empty
     ]
+    # ndarray (tensor) annotations are accepted here; emit() converts them to memref types
     assert all(
-        isinstance(r, (str, Type)) or isalambda(r) for r in input_types
-    ), f"all input types must be mlir types {input_types=}"
+        isinstance(r, (str, Type)) or isalambda(r) or get_origin(r) == np.ndarray
+        for r in input_types
+    ), f"all input types must be mlir types or ndarrays (tensors) {input_types=}"
     user_loc = get_user_code_loc()
     # If ir.Context is none (like for deferred func emit)
     if user_loc is None:
@@ -197,6 +202,8 @@ def emit(self, *call_args, decl=False, force=False) -> FuncOp:
                         input_types[i] = Type(eval(v, self.body_builder.__globals__))
                     elif isalambda(v):
                         input_types[i] = v()
+                    elif get_origin(v) == np.ndarray:
+                        input_types[i] = MyTensorType.get_memref_type(v)
             else:
                 input_types = get_arg_types(call_args)
 
diff --git a/python/extras/dialects/ext/scf.py b/python/extras/dialects/ext/scf.py
new file mode 100644
index 000000000..ca466625f
--- /dev/null
+++ b/python/extras/dialects/ext/scf.py
@@ -0,0 +1,72 @@
+from typing import Optional, Sequence
+
+from ....ir import InsertionPoint, Value
+from ....dialects.linalg.opdsl.lang.emitter import _is_index_type
+from ....dialects.scf import ForOp, yield_
+
+from .arith import constant, index_cast
+
+
+def _for(
+    start,
+    stop=None,
+    step=None,
+    iter_args: Optional[Sequence[Value]] = None,
+    insert_yield: bool = True,
+    *,
+    loc=None,
+    ip=None,
+):
+    """Emit an ``scf.for`` op and drive its body from a Python ``for`` loop.
+
+    This is a generator that yields exactly once, with the insertion point
+    set to the loop body, so ``for i in _for(n): ...`` emits the statements
+    of the Python loop body into the ``scf.for`` region.
+
+    Args:
+        start: Lower bound, or the upper bound when ``stop`` is omitted (the
+            lower bound is then 0, mirroring ``range``).
+        stop: Upper bound.
+        step: Step; defaults to 1.
+        iter_args: Optional loop-carried values passed to ``ForOp``.
+        insert_yield: When true, terminate the body with an empty ``yield_``
+            once the Python loop body has run. Ignored when ``iter_args`` is
+            non-empty, since the caller must then yield the carried values.
+        loc: Location forwarded to ``ForOp``.
+        ip: Insertion point forwarded to ``ForOp``.
+
+    Yields:
+        The induction variable alone when there are no ``iter_args``;
+        ``(iv, iter_arg, result)`` for a single iter arg; otherwise
+        ``(iv, iter_args, results)``.
+    """
+    if step is None:
+        step = 1
+    if stop is None:
+        stop = start
+        start = 0
+    params = [start, stop, step]
+    for i, p in enumerate(params):
+        # Python ints become index constants; non-index values are index-cast.
+        if isinstance(p, int):
+            p = constant(p, index=True)
+        if not _is_index_type(p.type):
+            p = index_cast(p)
+        params[i] = p
+
+    start, stop, step = params
+
+    for_op = ForOp(start, stop, step, iter_args, loc=loc, ip=ip)
+    iv = for_op.induction_variable
+    iter_args = tuple(for_op.inner_iter_args)
+    with InsertionPoint(for_op.body):
+        if len(iter_args) > 1:
+            yield iv, iter_args, for_op.results
+        elif len(iter_args) == 1:
+            yield iv, iter_args[0], for_op.results[0]
+        else:
+            yield iv
+        # Resumed after the Python loop body has emitted its ops. An empty
+        # yield is only a valid terminator when nothing is loop-carried.
+        if insert_yield and not iter_args:
+            yield_([])