Draft: Exo GPU code for discussion (spork) #736

Open · wants to merge 21 commits into base: main

Changes from 1 commit
Stub SporkEnv
akeley98 committed Oct 31, 2024

commit 58bb326c49779422e95b07f6ab5f4cd84df139e8
47 changes: 38 additions & 9 deletions src/exo/backend/LoopIR_compiler.py
@@ -658,7 +658,10 @@ def is_leaf_proc(stmts):
 
     def add_line(self, line):
         if line:
-            self._lines.append(self._tab + line)
+            if self.spork:
+                self.spork.kernel_lines.append(self._tab + line)
+            else:
+                self._lines.append(self._tab + line)
 
     def comp_stmts(self, stmts):
         for b in stmts:
@@ -833,6 +836,9 @@ def get_actor_kind(self):
         return self.spork.get_actor_kind() if self.spork else actor_kinds.cpu
 
     def comp_s(self, s):
+        if self.spork:
+            self.spork.on_comp_s(s)
+
         if isinstance(s, LoopIR.Pass):
             self.add_line("; // NO-OP")
         elif isinstance(s, LoopIR.SyncStmt):
@@ -890,7 +896,19 @@ def comp_s(self, s):
             name = self.new_varname(s.name, typ=s.rhs.type, mem=mem)
             self.add_line(f"struct {win_struct} {name} = {rhs};")
         elif isinstance(s, LoopIR.If):
-            cond = self.comp_e(s.cond)
+            is_lane_specialization = (
+                isinstance(s.cond, LoopIR.Const)
+                and s.cond.type == T.lane_specialization
+            )
+            if is_lane_specialization:
+                if not self.spork:
+                    raise ValueError(
+                        f"{s.srcinfo}: lane specialization outside GPU scope"
+                    )
+                cond = self.spork.push_lane_specialization(s.cond.val)
+            else:
+                cond = self.comp_e(s.cond)
+
             self.add_line(f"if ({cond}) {{")
             self.push()
             self.comp_stmts(s.body)
@@ -902,6 +920,9 @@ def comp_s(self, s):
             self.pop()
             self.add_line("}")
 
+            if is_lane_specialization:
+                self.spork.pop_lane_specialization()
+
         elif isinstance(s, LoopIR.For):
             lo = self.comp_e(s.lo)
             hi = self.comp_e(s.hi)
@@ -916,6 +937,7 @@ def comp_s(self, s):
             loop_mode = s.loop_mode
             old_actor_kind = self.get_actor_kind()
             new_actor_kind = loop_mode.new_actor_kind(old_actor_kind)
+            emit_loop = True
             starting_cuda_kernel = (
                 not self.spork and new_actor_kind is not actor_kinds.cpu
             )
@@ -924,25 +946,33 @@
                 warnings.warn(
                     f"{s.srcinfo}: cannot nest loop with actor kind {new_actor_kind} in {old_actor_kind} scope"
                 )
+            if starting_cuda_kernel:
+                self.spork = SporkEnv(
+                    f"{self.proc.name}EXOcu_{s.srcinfo.lineno:04d}{itr}"
+                )
+            if self.spork:
+                emit_loop = self.spork.push_for(new_actor_kind, loop_mode)
 
             if isinstance(loop_mode, Par):
+                assert emit_loop
                 self.add_line(f"#pragma omp parallel for")
 
-            if loop_mode.cuda_nesting is None:
+            if emit_loop:
                 self.add_line(
                     f"for (int_fast32_t {itr} = {lo}; {itr} < {hi}; {itr}++) {{"
                 )
                 self.push(only="tab")
-            else:
-                self.add_line(f"// TODO parallel-for {type(loop_mode)}")
-                warnings.warn("TODO implement CUDA codegen")
 
             self.comp_stmts(s.body)
 
+            if self.spork:
+                self.spork.pop_for()
+                if starting_cuda_kernel:
+                    # TODO compile kernel_lines into separate device function
+                    self._lines.extend(self.spork.kernel_lines)
+                    self.spork = None
 
-            if loop_mode.cuda_nesting is None:
+            if emit_loop:
                 self.pop()
                 self.add_line("}")
 
@@ -1057,8 +1087,7 @@ def comp_e(self, e, prec=0):
         elif e.type == T.f32:
             return f"{float(e.val)}f"
         elif e.type == T.lane_specialization:
-            warnings.warn("TODO lane specialization")
-            return "true /* TODO LaneSpecialization */"
+            assert False, "should be handled when compiling LoopIR.If"
         else:
             return f"(({e.type.ctype()}) {str(e.val)})"

6 changes: 3 additions & 3 deletions src/exo/spork/actor_kinds.py
@@ -65,12 +65,12 @@ def __str__(self):
 
 """cp.async.bulk instructions with cluster/block shared memory as destination"""
 tma_to_shared_async = ActorKind(
-    "tma_to_shared_async", ActorKindCategory.ASYNC, set(), cuda_generic
+    "tma_to_shared_async", ActorKindCategory.ASYNC, set(), cuda_sync
 )
 
 """cp{.reduce}.bulk.async instructions with global memory as destination"""
 tma_to_global_async = ActorKind(
-    "tma_to_global_async", ActorKindCategory.ASYNC, set(), cuda_generic
+    "tma_to_global_async", ActorKindCategory.ASYNC, set(), cuda_sync
 )
 
 """wgmma instructions' actions on registers"""
@@ -84,7 +84,7 @@ def __str__(self):
     "wgmma_async",
     ActorKindCategory.ASYNC,
     {wgmma_async_reg, wgmma_async_mem},
-    cuda_generic,
+    cuda_sync,
 )
 
 """actions on wgmma matrix tile registers, either by wgmma.async
24 changes: 17 additions & 7 deletions src/exo/spork/lane_units.py
@@ -9,22 +9,32 @@ class LaneUnit(object):
 
     """
 
-    def __init__(self, name):
+    def __init__(self, name, parent):
         self.name = name
+        self.parent = parent
 
     def __repr__(self):
         return f"<exo.spork.lane_unit.LaneUnit {self.name}>"
 
     def __str__(self):
         return self.name
 
+    def contains(self, child):
+        """Evaluate nesting relationship
+
+        e.g. cuda_block.contains(cuda_thread) -> True"""
+        if self == child.parent:
+            return True
+        else:
+            return child.parent is not None and self.contains(child.parent)
+
 
-cpu_thread = LaneUnit("cpu_thread")
-cuda_cluster = LaneUnit("cuda_cluster")
-cuda_block = LaneUnit("cuda_block")
-cuda_warpgroup = LaneUnit("cuda_warpgroup")
-cuda_warp = LaneUnit("cuda_warp")
-cuda_thread = LaneUnit("cuda_thread")
+cpu_thread = LaneUnit("cpu_thread", None)
+cuda_cluster = LaneUnit("cuda_cluster", None)
+cuda_block = LaneUnit("cuda_block", cuda_cluster)
+cuda_warpgroup = LaneUnit("cuda_warpgroup", cuda_block)
+cuda_warp = LaneUnit("cuda_warp", cuda_warpgroup)
+cuda_thread = LaneUnit("cuda_thread", cuda_warp)
 
 lane_unit_dict = {
     unit.name: unit
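With the new parent links, contains() is a recursive walk up the child's parent chain. A quick illustration (not part of the diff), using the units defined above:

```python
from exo.spork.lane_units import cuda_cluster, cuda_block, cuda_thread, cpu_thread

assert cuda_block.contains(cuda_thread)      # block > warpgroup > warp > thread
assert cuda_cluster.contains(cuda_block)     # direct parent link
assert not cuda_thread.contains(cuda_block)  # containment is one-directional
assert not cpu_thread.contains(cuda_thread)  # separate hierarchy roots
```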
52 changes: 31 additions & 21 deletions src/exo/spork/loop_modes.py
@@ -6,11 +6,8 @@
 
 
 class LoopMode(object):
-    # None if the loop mode doesn't correspond to cuda.
-    # Otherwise, this is the "index" of how nested this loop type
-    # should be in the cuda programming model.
-    # (e.g. thread's cuda_nesting > block's cuda_nesting)
-    cuda_nesting: Optional[int]
+    is_par = False
+    is_async = False
 
     def loop_mode_name(self):
         raise NotImplementedError()
@@ -21,10 +18,6 @@ def new_actor_kind(self, old_actor_kind: ActorKind):
     def lane_unit(self):
         raise NotImplementedError()
 
-    def cuda_can_nest_in(self, other):
-        assert self.cuda_nesting is not None
-        return other.cuda_nesting is not None and self.cuda_nesting > other.cuda_nesting
-
     def _unpack_positive_int(self, value, name):
         if hasattr(value, "val"):
             value = value.val
@@ -35,10 +28,11 @@ def _unpack_positive_int(self, value, name):
 
 
 class Seq(LoopMode):
-    cuda_nesting = None
+    is_par = False
+    is_async = False
 
     def __init__(self):
-        self.cuda_nesting = None
+        pass
 
     def loop_mode_name(self):
         return "seq"
@@ -52,10 +46,11 @@ def new_actor_kind(self, old_actor_kind: ActorKind):
 
 
 class Par(LoopMode):
-    cuda_nesting = None
+    is_par = True
+    is_async = False
 
     def __init__(self):
-        self.cuda_nesting = None
+        pass
 
     def loop_mode_name(self):
         return "par"
@@ -71,7 +66,9 @@ def lane_unit(self):
 
 
 class CudaClusters(LoopMode):
-    cuda_nesting = 2
+    is_par = True
+    is_async = False
+
     blocks: int
 
     def __init__(self, blocks):
@@ -88,7 +85,9 @@ def lane_unit(self):
 
 
 class CudaBlocks(LoopMode):
-    cuda_nesting = 3
+    is_par = True
+    is_async = False
+
     warps: int
 
     def __init__(self, warps=1):
@@ -108,7 +107,8 @@ def lane_unit(self):
 
 
 class CudaWarpgroups(LoopMode):
-    cuda_nesting = 4
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -127,7 +127,8 @@ def lane_unit(self):
 
 
 class CudaWarps(LoopMode):
-    cuda_nesting = 5
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -146,7 +147,8 @@ def lane_unit(self):
 
 
 class CudaThreads(LoopMode):
-    cuda_nesting = 6
+    is_par = True
+    is_async = False
 
     def __init__(self):
         pass
@@ -164,17 +166,25 @@ def lane_unit(self):
 cuda_threads = CudaThreads()
 
 
-def loop_mode_for_actor_kind(actor_kind):
+def loop_mode_for_async_actor_kind(actor_kind):
+    assert actor_kind.is_async()
+
     class _AsyncLoopMode(LoopMode):
+        is_par = False
+        is_async = True
+
         def __init__(self):
-            self.cuda_nesting = None
+            pass
 
         def loop_mode_name(self):
             return actor_kind.name
 
         def new_actor_kind(self, old_actor_kind: ActorKind):
             return actor_kind
 
+        def __repr__(self):
+            return f"loop_mode_for_async_actor_kind({actor_kind})()"
+
     return _AsyncLoopMode
 
 
@@ -193,7 +203,7 @@ def make_loop_mode_dict():
     for name, actor_kind in actor_kind_dict.items():
         assert name == actor_kind.name
         if not actor_kind.is_synthetic() and actor_kind.is_async():
-            loop_mode_dict[name] = loop_mode_for_actor_kind(actor_kind)
+            loop_mode_dict[name] = loop_mode_for_async_actor_kind(actor_kind)
     return loop_mode_dict
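Note that loop_mode_for_async_actor_kind returns the _AsyncLoopMode class itself, not an instance; make_loop_mode_dict registers one such class per non-synthetic async actor kind, and callers instantiate it on demand. A sketch of that usage (not in the diff; it assumes tma_to_shared_async is non-synthetic and that ActorKind.__str__ returns the name):

```python
from exo.spork.loop_modes import make_loop_mode_dict

loop_mode_dict = make_loop_mode_dict()
ModeCls = loop_mode_dict["tma_to_shared_async"]  # a class, not an instance
mode = ModeCls()

assert mode.is_async and not mode.is_par
assert mode.loop_mode_name() == "tma_to_shared_async"
repr(mode)  # "loop_mode_for_async_actor_kind(tma_to_shared_async)()"
```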


60 changes: 60 additions & 0 deletions src/exo/spork/spork_env.py
@@ -1,3 +1,63 @@
+from typing import List, Optional, Tuple
+
+from ..core.prelude import Sym, SrcInfo, extclass
+
+from .lane_units import LaneUnit, LaneSpecialization
+from .loop_modes import LoopMode
+from .actor_kinds import ActorKind, cpu
+
+from ..core.LoopIR import LoopIR
+
+
+class ParIndexRecord(object):
+    __slots__ = ["exo_iter", "c_iter", "c_range", "static_range"]
+
+    exo_iter: str  # Iteration variable name in Exo code
+    c_iter: str  # Iteration variable name in C code
+    # [c[0], c[1]) -- iteration bounds as compiled C code fragments
+    c_range: Tuple[str, str]
+    # [static_range[0], static_range[1]) -- statically analyzed maximal bounds
+    static_range: Tuple[int, int]
+
+
+# This is where most of the backend logic for CUDA should go.
+class SporkEnv(object):
+    kernel_name: str
+    kernel_lines: List[str]
+    _actor_kind_stack: List[ActorKind]
+
+    def __init__(self, kernel_name: str):
+        self.kernel_name = kernel_name
+        self.kernel_lines = []
+        self._actor_kind_stack = []
+
+    def __bool__(self):
+        return True
+
+    def get_actor_kind(self):
+        assert self._actor_kind_stack
+        return self._actor_kind_stack[-1]
+
+    def push_for(self, actor_kind: ActorKind, loop_mode: LoopMode) -> bool:
+        assert isinstance(actor_kind, ActorKind)
+        assert isinstance(loop_mode, LoopMode)
+
+        if self._actor_kind_stack:
+            assert actor_kind.allows_parent(self._actor_kind_stack[-1])
+        self._actor_kind_stack.append(actor_kind)
+        self.kernel_lines.append(f"// TODO parallel-for {loop_mode}")
+        return True
+
+    def pop_for(self):
+        assert self._actor_kind_stack
+        self._actor_kind_stack.pop()
+
+    def push_lane_specialization(self, lane_specialization: LaneSpecialization) -> str:
+        assert isinstance(lane_specialization, LaneSpecialization)
+        return "true /* TODO compile LaneSpecialization */"
+
+    def pop_lane_specialization(self):
+        pass
+
+    def on_comp_s(self, s):
+        pass
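The compiler drives the SporkEnv stub with strict bracketing: push_for when entering each parallel loop, pop_for when leaving it, and a flush of kernel_lines once the kernel-starting loop closes. A sketch of that contract (not in the diff; the actor kind and loop mode here are illustrative, and it assumes cuda_sync is an ActorKind instance exported by actor_kinds):

```python
from exo.spork.spork_env import SporkEnv
from exo.spork.loop_modes import CudaThreads
from exo.spork import actor_kinds

spork = SporkEnv("mykernelEXOcu_0042i")  # name pattern from LoopIR_compiler above

# Corresponds to entering a parallel loop; in the compiler the actor kind
# comes from loop_mode.new_actor_kind(old_actor_kind).
emit_loop = spork.push_for(actor_kinds.cuda_sync, CudaThreads())
assert emit_loop  # the stub always asks the caller to emit the C for-loop

# ... compile the loop body; add_line() routes into spork.kernel_lines ...

spork.pop_for()
# On leaving the kernel-starting loop, the compiler currently inlines the
# buffered lines into the host code (see the TODO in LoopIR_compiler.py).
host_lines = list(spork.kernel_lines)
```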