Draft: Exo GPU code for discussion (spork) #736

Open · wants to merge 21 commits into base: main

Changes from 1 commit
Stub SporkEnv
akeley98 committed Oct 31, 2024

commit 58bb326c49779422e95b07f6ab5f4cd84df139e8
47 changes: 38 additions & 9 deletions src/exo/backend/LoopIR_compiler.py
@@ -658,7 +658,10 @@ def is_leaf_proc(stmts):
 
     def add_line(self, line):
         if line:
-            self._lines.append(self._tab + line)
+            if self.spork:
+                self.spork.kernel_lines.append(self._tab + line)
+            else:
+                self._lines.append(self._tab + line)
 
     def comp_stmts(self, stmts):
         for b in stmts:
@@ -833,6 +836,9 @@ def get_actor_kind(self):
         return self.spork.get_actor_kind() if self.spork else actor_kinds.cpu
 
     def comp_s(self, s):
+        if self.spork:
+            self.spork.on_comp_s(s)
+
         if isinstance(s, LoopIR.Pass):
             self.add_line("; // NO-OP")
         elif isinstance(s, LoopIR.SyncStmt):
@@ -890,7 +896,19 @@ def comp_s(self, s):
             name = self.new_varname(s.name, typ=s.rhs.type, mem=mem)
             self.add_line(f"struct {win_struct} {name} = {rhs};")
         elif isinstance(s, LoopIR.If):
-            cond = self.comp_e(s.cond)
+            is_lane_specialization = (
+                isinstance(s.cond, LoopIR.Const)
+                and s.cond.type == T.lane_specialization
+            )
+            if is_lane_specialization:
+                if not self.spork:
+                    raise ValueError(
+                        f"{s.srcinfo}: lane specialization outside GPU scope"
+                    )
+                cond = self.spork.push_lane_specialization(s.cond.val)
+            else:
+                cond = self.comp_e(s.cond)
+
             self.add_line(f"if ({cond}) {{")
             self.push()
             self.comp_stmts(s.body)
@@ -902,6 +920,9 @@ def comp_s(self, s):
             self.pop()
             self.add_line("}")
 
+            if is_lane_specialization:
+                self.spork.pop_lane_specialization()
+
         elif isinstance(s, LoopIR.For):
             lo = self.comp_e(s.lo)
             hi = self.comp_e(s.hi)
@@ -916,6 +937,7 @@ def comp_s(self, s):
             loop_mode = s.loop_mode
             old_actor_kind = self.get_actor_kind()
             new_actor_kind = loop_mode.new_actor_kind(old_actor_kind)
+            emit_loop = True
             starting_cuda_kernel = (
                 not self.spork and new_actor_kind is not actor_kinds.cpu
             )
@@ -924,25 +946,33 @@
                 warnings.warn(
                     f"{s.srcinfo}: cannot nest loop with actor kind {new_actor_kind} in {old_actor_kind} scope"
                 )
+            if starting_cuda_kernel:
+                self.spork = SporkEnv(
+                    f"{self.proc.name}EXOcu_{s.srcinfo.lineno:04d}{itr}"
+                )
+            if self.spork:
+                emit_loop = self.spork.push_for(new_actor_kind, loop_mode)
 
             if isinstance(loop_mode, Par):
+                assert emit_loop
                 self.add_line(f"#pragma omp parallel for")
 
-            if loop_mode.cuda_nesting is None:
+            if emit_loop:
                 self.add_line(
                     f"for (int_fast32_t {itr} = {lo}; {itr} < {hi}; {itr}++) {{"
                 )
                 self.push(only="tab")
-            else:
-                self.add_line(f"// TODO parallel-for {type(loop_mode)}")
-                warnings.warn("TODO implement CUDA codegen")
 
             self.comp_stmts(s.body)
 
+            if self.spork:
+                self.spork.pop_for()
+                if starting_cuda_kernel:
+                    # TODO compile kernel_lines into separate device function
+                    self._lines.extend(self.spork.kernel_lines)
+                    self.spork = None
 
-            if loop_mode.cuda_nesting is None:
+            if emit_loop:
                 self.pop()
                 self.add_line("}")
 
@@ -1057,8 +1087,7 @@ def comp_e(self, e, prec=0):
         elif e.type == T.f32:
             return f"{float(e.val)}f"
         elif e.type == T.lane_specialization:
-            warnings.warn("TODO lane specialization")
-            return "true /* TODO LaneSpecialization */"
+            assert False, "should be handled when compiling LoopIR.If"
         else:
             return f"(({e.type.ctype()}) {str(e.val)})"

6 changes: 3 additions & 3 deletions src/exo/spork/actor_kinds.py
@@ -65,12 +65,12 @@ def __str__(self):
 
 """cp.async.bulk instructions with cluster/block shared memory as destination"""
 tma_to_shared_async = ActorKind(
-    "tma_to_shared_async", ActorKindCategory.ASYNC, set(), cuda_generic
+    "tma_to_shared_async", ActorKindCategory.ASYNC, set(), cuda_sync
 )
 
 """cp{.reduce}.bulk.async instructions with global memory as destination"""
 tma_to_global_async = ActorKind(
-    "tma_to_global_async", ActorKindCategory.ASYNC, set(), cuda_generic
+    "tma_to_global_async", ActorKindCategory.ASYNC, set(), cuda_sync
 )
 
 """wgmma instructions' actions on registers"""
@@ -84,7 +84,7 @@ def __str__(self):
     "wgmma_async",
     ActorKindCategory.ASYNC,
     {wgmma_async_reg, wgmma_async_mem},
-    cuda_generic,
+    cuda_sync,
 )
 
 """actions on wgmma matrix tile registers, either by wgmma.async
24 changes: 17 additions & 7 deletions src/exo/spork/lane_units.py
@@ -9,22 +9,32 @@ class LaneUnit(object):
 
     """
 
-    def __init__(self, name):
+    def __init__(self, name, parent):
         self.name = name
+        self.parent = parent
 
     def __repr__(self):
         return f"<exo.spork.lane_unit.LaneUnit {self.name}>"
 
     def __str__(self):
         return self.name
 
+    def contains(self, child):
+        """Evaluate nesting relationship
+
+        e.g. cuda_block.contains(cuda_thread) -> True"""
+        if self == child.parent:
+            return True
+        else:
+            return child.parent is not None and self.contains(child.parent)
+
 
-cpu_thread = LaneUnit("cpu_thread")
-cuda_cluster = LaneUnit("cuda_cluster")
-cuda_block = LaneUnit("cuda_block")
-cuda_warpgroup = LaneUnit("cuda_warpgroup")
-cuda_warp = LaneUnit("cuda_warp")
-cuda_thread = LaneUnit("cuda_thread")
+cpu_thread = LaneUnit("cpu_thread", None)
+cuda_cluster = LaneUnit("cuda_cluster", None)
+cuda_block = LaneUnit("cuda_block", cuda_cluster)
+cuda_warpgroup = LaneUnit("cuda_warpgroup", cuda_block)
+cuda_warp = LaneUnit("cuda_warp", cuda_warpgroup)
+cuda_thread = LaneUnit("cuda_thread", cuda_warp)
 
 lane_unit_dict = {
     unit.name: unit
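With the new parent links, contains() is a recursive walk up the child's parent chain. A quick illustration (not part of the diff), using the units defined above:

```python
from exo.spork.lane_units import cuda_cluster, cuda_block, cuda_thread, cpu_thread

assert cuda_block.contains(cuda_thread)      # block > warpgroup > warp > thread
assert cuda_cluster.contains(cuda_block)     # direct parent link
assert not cuda_thread.contains(cuda_block)  # containment is one-directional
assert not cpu_thread.contains(cuda_thread)  # separate hierarchy roots
```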
52 changes: 31 additions & 21 deletions src/exo/spork/loop_modes.py
@@ -6,11 +6,8 @@
 
 
 class LoopMode(object):
-    # None if the loop mode doesn't correspond to cuda.
-    # Otherwise, this is the "index" of how nested this loop type
-    # should be in the cuda programming model.
-    # (e.g. thread's cuda_nesting > block's cuda_nesting)
-    cuda_nesting: Optional[int]
+    is_par = False
+    is_async = False
 
     def loop_mode_name(self):
         raise NotImplementedError()
@@ -21,10 +18,6 @@ def new_actor_kind(self, old_actor_kind: ActorKind):
     def lane_unit(self):
         raise NotImplementedError()
 
-    def cuda_can_nest_in(self, other):
-        assert self.cuda_nesting is not None
-        return other.cuda_nesting is not None and self.cuda_nesting > other.cuda_nesting
-
     def _unpack_positive_int(self, value, name):
         if hasattr(value, "val"):
             value = value.val
@@ -35,10 +28,11 @@ def _unpack_positive_int(self, value, name):
 
 
 class Seq(LoopMode):
-    cuda_nesting = None
+    is_par = False
+    is_async = False
 
     def __init__(self):
-        self.cuda_nesting = None
+        pass
 
     def loop_mode_name(self):
         return "seq"
@@ -52,10 +46,11 @@ def new_actor_kind(self, old_actor_kind: ActorKind):
 
 
 class Par(LoopMode):
-    cuda_nesting = None
+    is_par = True
+    is_async = False
 
     def __init__(self):
-        self.cuda_nesting = None
+        pass
 
     def loop_mode_name(self):
         return "par"
@@ -71,7 +66,9 @@ def lane_unit(self):
 
 
 class CudaClusters(LoopMode):
-    cuda_nesting = 2
+    is_par = True
+    is_async = False
+
     blocks: int
 
     def __init__(self, blocks):
@@ -88,7 +85,9 @@ def lane_unit(self):
 
 
 class CudaBlocks(LoopMode):
-    cuda_nesting = 3
+    is_par = True
+    is_async = False
+
     warps: int
 
     def __init__(self, warps=1):
@@ -108,7 +107,8 @@ def lane_unit(self):
 
 
 class CudaWarpgroups(LoopMode):
-    cuda_nesting = 4
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -127,7 +127,8 @@ def lane_unit(self):
 
 
 class CudaWarps(LoopMode):
-    cuda_nesting = 5
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -146,7 +147,8 @@ def lane_unit(self):
 
 
 class CudaThreads(LoopMode):
-    cuda_nesting = 6
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -164,17 +166,25 @@ def lane_unit(self):
 cuda_threads = CudaThreads()
 
 
-def loop_mode_for_actor_kind(actor_kind):
+def loop_mode_for_async_actor_kind(actor_kind):
+    assert actor_kind.is_async()
+
     class _AsyncLoopMode(LoopMode):
+        is_par = False
+        is_async = True
+
         def __init__(self):
-            self.cuda_nesting = None
+            pass
 
         def loop_mode_name(self):
             return actor_kind.name
 
         def new_actor_kind(self, old_actor_kind: ActorKind):
             return actor_kind
 
+        def __repr__(self):
+            return f"loop_mode_for_async_actor_kind({actor_kind})()"
+
     return _AsyncLoopMode
 
 
@@ -193,7 +203,7 @@ def make_loop_mode_dict():
     for name, actor_kind in actor_kind_dict.items():
         assert name == actor_kind.name
         if not actor_kind.is_synthetic() and actor_kind.is_async():
-            loop_mode_dict[name] = loop_mode_for_actor_kind(actor_kind)
+            loop_mode_dict[name] = loop_mode_for_async_actor_kind(actor_kind)
     return loop_mode_dict
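Note that loop_mode_for_async_actor_kind returns the _AsyncLoopMode class itself, not an instance; make_loop_mode_dict registers one such class per non-synthetic async actor kind, and callers instantiate it on demand. A sketch of that usage (not in the diff; it assumes tma_to_shared_async is non-synthetic and that ActorKind.__str__ returns the name):

```python
from exo.spork.loop_modes import make_loop_mode_dict

loop_mode_dict = make_loop_mode_dict()
ModeCls = loop_mode_dict["tma_to_shared_async"]  # a class, not an instance
mode = ModeCls()

assert mode.is_async and not mode.is_par
assert mode.loop_mode_name() == "tma_to_shared_async"
repr(mode)  # "loop_mode_for_async_actor_kind(tma_to_shared_async)()"
```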


60 changes: 60 additions & 0 deletions src/exo/spork/spork_env.py
@@ -1,3 +1,63 @@
+from typing import List, Optional, Tuple
+
+from ..core.prelude import Sym, SrcInfo, extclass
+
+from .lane_units import LaneUnit, LaneSpecialization
+from .loop_modes import LoopMode
+from .actor_kinds import ActorKind, cpu
+
+from ..core.LoopIR import LoopIR
+
+
+class ParIndexRecord(object):
+    __slots__ = ["exo_iter", "c_iter", "c_range", "static_range"]
+
+    exo_iter: str  # Iteration variable name in Exo code
+    c_iter: str  # Iteration variable name in C code
+    # [c[0], c[1]) -- iteration bounds as compiled C code fragments
+    c_range: Tuple[str, str]
+    # [static_range[0], static_range[1]) -- statically analyzed maximal bounds
+    static_range: Tuple[int, int]
+
+
+# This is where most of the backend logic for CUDA should go.
+class SporkEnv(object):
+    kernel_name: str
+    kernel_lines: List[str]
+    _actor_kind_stack: List[ActorKind]
+
+    def __init__(self, kernel_name: str):
+        self.kernel_name = kernel_name
+        self.kernel_lines = []
+        self._actor_kind_stack = []
+
+    def __bool__(self):
+        return True
+
+    def get_actor_kind(self):
+        assert self._actor_kind_stack
+        return self._actor_kind_stack[-1]
+
+    def push_for(self, actor_kind: ActorKind, loop_mode: LoopMode) -> bool:
+        assert isinstance(actor_kind, ActorKind)
+        assert isinstance(loop_mode, LoopMode)
+
+        if self._actor_kind_stack:
+            assert actor_kind.allows_parent(self._actor_kind_stack[-1])
+        self._actor_kind_stack.append(actor_kind)
+        self.kernel_lines.append(f"// TODO parallel-for {loop_mode}")
+        return True
+
+    def pop_for(self):
+        assert self._actor_kind_stack
+        self._actor_kind_stack.pop()
+
+    def push_lane_specialization(self, lane_specialization: LaneSpecialization) -> str:
+        assert isinstance(lane_specialization, LaneSpecialization)
+        return "true /* TODO compile LaneSpecialization */"
+
+    def pop_lane_specialization(self):
+        pass
+
+    def on_comp_s(self, s):
+        pass
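The compiler drives the SporkEnv stub with strict bracketing: push_for when entering each parallel loop, pop_for when leaving it, and a flush of kernel_lines once the kernel-starting loop closes. A sketch of that contract (not in the diff; the actor kind and loop mode here are illustrative, and it assumes cuda_sync is an ActorKind instance exported by actor_kinds):

```python
from exo.spork.spork_env import SporkEnv
from exo.spork.loop_modes import CudaThreads
from exo.spork import actor_kinds

spork = SporkEnv("mykernelEXOcu_0042i")  # name pattern from LoopIR_compiler above

# Corresponds to entering a parallel loop; in the compiler the actor kind
# comes from loop_mode.new_actor_kind(old_actor_kind).
emit_loop = spork.push_for(actor_kinds.cuda_sync, CudaThreads())
assert emit_loop  # the stub always asks the caller to emit the C for-loop

# ... compile the loop body; add_line() routes into spork.kernel_lines ...

spork.pop_for()
# On leaving the kernel-starting loop, the compiler currently inlines the
# buffered lines into the host code (see the TODO in LoopIR_compiler.py).
host_lines = list(spork.kernel_lines)
```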