support async collective op execution
eedalong committed Mar 14, 2024
1 parent f160eb2 commit f869ef5
Showing 5 changed files with 177 additions and 10 deletions.
29 changes: 28 additions & 1 deletion tao_compiler/mlir/disc/transforms/disc_lower_to_library_call.cc
@@ -97,6 +97,18 @@ Value GetDefaultStreamHandle(Operation* op, PatternRewriter& rewriter) {
return stream_idx;
}


Value GetAsyncCollectiveOpStreamHandle(Operation* op,
                                       PatternRewriter& rewriter) {
  Location loc = op->getLoc();
  MLIRContext* ctx = rewriter.getContext();
  Type llvm_int32_type = IntegerType::get(ctx, 32);
  // Stream index 1 is reserved for async collective ops; index 0 is the
  // default compute stream (see GetDefaultStreamHandle above).
  Value one = rewriter.create<LLVM::ConstantOp>(loc, llvm_int32_type,
                                                rewriter.getI32IntegerAttr(1));
  Type pointer_type = LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8));
  Value stream_idx = rewriter.create<LLVM::IntToPtrOp>(loc, pointer_type, one);
  return stream_idx;
}
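
As with GetDefaultStreamHandle, the returned value is not a real device pointer: a small integer stream index is smuggled through a pointer-typed SSA value. A minimal sketch of the assumed decode on the runtime side (DecodeStreamIndex is an illustrative name, not part of this commit):

#include <cstdint>

// Sketch (assumption): the RAL runtime recovers the stream index from the
// pointer-typed handle produced by the lowering above.
int64_t DecodeStreamIndex(void* stream_handle) {
  return reinterpret_cast<int64_t>(stream_handle);  // 0 = default, 1 = collectives
}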

// Insert a sync on stream call.
void InsertSyncOnStream(Operation* op, Value ctx, Value stream_handle,
PatternRewriter& rewriter) {
@@ -847,6 +859,15 @@ LogicalResult emitAttr(Attribute attr, StrT& out) {
return failure();
}

bool IsAsyncCollectiveOp(CustomCallV2Op op) {
  auto target = op->getAttrOfType<StringAttr>("call_target_name");
  return target && (target.getValue() == "ral_all_reduce" ||
                    target.getValue() == "ral_all_gather");
}

struct CustomCallV2OpConvertor : public OpRewritePattern<CustomCallV2Op> {
CustomCallV2OpConvertor(MLIRContext* context, bool gpuEnabled)
: OpRewritePattern<CustomCallV2Op>::OpRewritePattern(context) {
@@ -867,7 +888,13 @@ struct CustomCallV2OpConvertor : public OpRewritePattern<CustomCallV2Op> {
<< "fail to lower the custom_attrs of the custom call op.\n";
}

    Value streamHandle;
    if (IsAsyncCollectiveOp(op)) {
      streamHandle = GetAsyncCollectiveOpStreamHandle(op, rewriter);
    } else {
      streamHandle = GetDefaultStreamHandle(op, rewriter);
    }

SmallVector<Value> newOperands{streamHandle};
for (Value operand : op->getOperands()) newOperands.push_back(operand);

61 changes: 52 additions & 9 deletions tao_compiler/mlir/disc/transforms/mhlo_decomp_rewriters.cc
100644 → 100755
@@ -134,6 +134,22 @@ LogicalResult SliceOpConvert::matchAndRewrite(mhlo::SliceOp op,
}
} // namespace
namespace {


// Async execution is opt-in per collective kind via environment variables.
// Requires <cstdlib> (std::getenv) and <cstring> (std::strcmp).
bool IsAsyncCollective(Operation* op) {
  if (llvm::isa<mhlo::AllReduceOp>(op)) {
    if (const char* env_p = std::getenv("ENABLE_ASYNC_ALL_REDUCE")) {
      return std::strcmp(env_p, "true") == 0 || std::strcmp(env_p, "True") == 0;
    }
  } else if (llvm::isa<mhlo::AllGatherOp>(op)) {
    if (const char* env_p = std::getenv("ENABLE_ASYNC_ALL_GATHER")) {
      return std::strcmp(env_p, "true") == 0 || std::strcmp(env_p, "True") == 0;
    }
  }
  return false;
}
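
The two branches repeat the same truthiness test; a shared helper would keep them in sync. A sketch assuming nothing beyond the standard library (EnvFlagIsTrue is a hypothetical name, not part of this commit):

#include <cstdlib>  // std::getenv
#include <cstring>  // std::strcmp

// Hypothetical helper: true iff the variable is set to "true" or "True".
static bool EnvFlagIsTrue(const char* name) {
  const char* v = std::getenv(name);
  return v && (std::strcmp(v, "true") == 0 || std::strcmp(v, "True") == 0);
}

With it, IsAsyncCollective reduces to two EnvFlagIsTrue calls keyed on the op kind.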

enum ReductionKind {
ALL_REDUCE_SUM,
ALL_REDUCE_PRODUCT,
@@ -192,6 +208,9 @@ struct CollectiveOpConverter : public OpRewritePattern<mhlo::AllReduceOp> {
if (!reductionKind) {
return failure();
}

bool is_async = IsAsyncCollective(op.getOperation());

for (int i = 0; i < op->getOperands().size(); ++i) {
// no need call all_reduce op if no consumer
if (op->getResult(i).getUsers().empty()) {
@@ -206,19 +225,43 @@ struct CollectiveOpConverter : public OpRewritePattern<mhlo::AllReduceOp> {
op->setAttr("output_layouts", rewriter.getStringAttr("*"));
op->setAttr("expected_input_layouts", rewriter.getStringAttr("*"));
op->setAttr("expected_output_layouts", rewriter.getStringAttr("*"));
      SmallVector<NamedAttribute> attrs;
      attrs.push_back(
          NamedAttribute(rewriter.getStringAttr("reduction_kind"),
                         rewriter.getStringAttr(reductionKind.value())));
      attrs.push_back(NamedAttribute(rewriter.getStringAttr("is_async"),
                                     rewriter.getBoolAttr(is_async)));
      auto customAttrs = DictionaryAttr::get(op->getContext(), attrs);
      op->setAttr("custom_attrs", customAttrs);

      auto reduce_op = rewriter.create<mhlo_disc::CustomCallV2Op>(
          op->getLoc(), op->getResults()[i].getType(), op->getOperands()[i],
          op->getAttrs());

      if (is_async) {
        // Use the producing op's address as a process-unique token that
        // pairs this launch with its completion op.
        int64_t async_pair_token =
            reinterpret_cast<int64_t>(reduce_op.getOperation());
        attrs.push_back(NamedAttribute(
            rewriter.getStringAttr("async_token_key"),
            rewriter.getIntegerAttr(rewriter.getIntegerType(64),
                                    async_pair_token)));
        auto newCustomAttrs =
            DictionaryAttr::get(reduce_op->getContext(), attrs);
        reduce_op->setAttr("custom_attrs", newCustomAttrs);

        // Insert the paired CollectiveDoneOp; consumers of the result now
        // depend on the wait instead of the launch.
        auto collective_done_op = rewriter.create<mhlo_disc::CustomCallV2Op>(
            reduce_op->getLoc(), reduce_op->getResults()[0].getType(),
            reduce_op->getResults()[0], reduce_op->getAttrs());
        collective_done_op->setAttr(
            "call_target_name",
            rewriter.getStringAttr("ral_async_collective_done"));
        newOutputs.push_back(collective_done_op.getResult(0));
      } else {
        newOutputs.push_back(reduce_op.getResult(0));
      }
}
rewriter.replaceOp(op, newOutputs);
return success();
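
Sketched as abbreviated pseudo-IR (attribute values elided; not verbatim compiler output), the async path splits one collective into a launch/wait pair linked by async_token_key:

// Before:
//   %r = "mhlo.all_reduce"(%x) {...}
// After (async path):
//   %t = "mhlo_disc.custom_call_v2"(%x)
//            {call_target_name = "ral_all_reduce",
//             custom_attrs = {reduction_kind = ..., is_async = true,
//                             async_token_key = ...}}
//   %r = "mhlo_disc.custom_call_v2"(%t)
//            {call_target_name = "ral_async_collective_done", ...}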
69 changes: 69 additions & 0 deletions tao_compiler/mlir/ral/collective.cu.cc
100644 → 100755
@@ -63,6 +63,9 @@ MemRefType<T, N> ral_all_reduce(ExecutionContext* ctx, void* stream_handle,
auto& dictAttr = attr->as<DictPDLAttr>();
std::string reductionKind =
dictAttr.get("reduction_kind").template as<StrPDLAttr>().getValue();

bool isAsync = dictAttr.get("is_async").template as<BoolPDLAttr>().getValue();

ncclDataType_t ncclDtype = ncclDataTypeMapper<T>::value;
auto ncclReductionType = getNcclReductionType(reductionKind);

@@ -87,9 +90,54 @@
if (ncclResult != ncclSuccess) {
ctx->signalError(Context::FAILURE, "fail to call ncclAllReduce\n");
}

  if (isAsync && gpu_stream) {
    int64_t token_key =
        dictAttr.get("async_token_key").template as<IntPDLAttr>().getValue();
    cudaEvent_t event;

    auto event_status = cudaEventCreate(&event);
    if (event_status != cudaSuccess) {
      ctx->signalError(Context::FAILURE,
                       std::string("cudaEventCreate failed: ") +
                           cudaGetErrorString(event_status) + "\n");
    }

    // Record the event on the collective stream so the paired done op can
    // wait for this all-reduce to finish.
    auto record_status = cudaEventRecord(event, gpu_stream);
    if (record_status != cudaSuccess) {
      cudaEventDestroy(event);
      ctx->signalError(Context::FAILURE,
                       std::string("cudaEventRecord failed: ") +
                           cudaGetErrorString(record_status) + "\n");
    }

    static_cast<gpu::BaseCudaExecutionContext*>(ctx)->addAsyncPairToken(
        token_key, event);
  }

return output;
}

template <typename T, int N>
MemRefType<T, N> ral_async_collective_done(ExecutionContext* ctx,
                                           void* stream_handle,
                                           MemRefType<T, N> input,
                                           void* customAttrs) {
  auto attr = getOrParsePDLAttr(ctx, customAttrs, "ral_async_collective_done");
  if (!attr) {
    ctx->signalError(Context::FAILURE,
                     "fail to parse custom_attrs in ral_async_collective_done\n");
  }
  auto& dictAttr = attr->as<DictPDLAttr>();
  int64_t token_key =
      dictAttr.get("async_token_key").template as<IntPDLAttr>().getValue();
  auto event = static_cast<gpu::BaseCudaExecutionContext*>(ctx)
                   ->getAsyncPairToken(token_key);
  if (event) {
    // Block until the paired collective has recorded the event, then retire
    // the token and release the event.
    auto sync_status = cudaEventSynchronize(event);
    if (sync_status != cudaSuccess) {
      ctx->signalError(Context::FAILURE,
                       std::string("cudaEventSynchronize failed: ") +
                           cudaGetErrorString(sync_status) + "\n");
    }
    static_cast<gpu::BaseCudaExecutionContext*>(ctx)->removeAsyncPairToken(
        token_key);
    cudaEventDestroy(event);
  }

  return input;
}
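
cudaEventSynchronize blocks the host thread until the collective completes. A non-blocking variant, sketched under the assumption that the consumer's compute stream were plumbed through to this point, would enqueue the dependency on-device instead:

#include <cuda_runtime.h>

// Sketch (assumption): all work submitted to compute_stream after this call
// waits for `event` on the device, without stalling the host thread.
void WaitOnStreamInsteadOfHost(cudaStream_t compute_stream, cudaEvent_t event) {
  cudaStreamWaitEvent(compute_stream, event, /*flags=*/0);
}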

TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float, 1>);
TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float, 2>);
TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float, 3>);
@@ -98,5 +146,26 @@ TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float16, 1>);
TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float16, 2>);
TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float16, 3>);
TAO_RAL_API("ral_all_reduce", "gpu", ral_all_reduce<float16, 4>);

TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float, 1>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float, 2>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float, 3>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float, 4>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float16, 1>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float16, 2>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float16, 3>);
TAO_RAL_API("ral_async_collective_done", "gpu", ral_async_collective_done<float16, 4>);

} // namespace ral
} // namespace tao
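
For an end-to-end run of the async path, the environment flags read in mhlo_decomp_rewriters.cc must be set before DISC compilation is triggered; for example, from the host process (POSIX setenv, illustrative):

#include <cstdlib>

// Sketch: enable async collectives before DISC compilation runs.
void EnableAsyncCollectives() {
  setenv("ENABLE_ASYNC_ALL_REDUCE", "true", /*overwrite=*/1);
  setenv("ENABLE_ASYNC_ALL_GATHER", "true", /*overwrite=*/1);
}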
24 changes: 24 additions & 0 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.cc
100644 → 100755
@@ -123,6 +123,8 @@ struct BaseCudaContextState : public tao::ral::Context::Resource {
std::map<void*, GpuModuleHandle> blobs;
// map <blob ptr, kernel name> -> callable kernel
std::map<std::pair<void*, std::string>, GpuFunctionHandle> kernels;
// map int64 -> cudaEvent_t
std::map<int64_t, cudaEvent_t> async_pair_tokens;

std::shared_ptr<Allocator> gpu_allocator;
bool cache_workspace_mem_across_execution;
@@ -206,6 +208,28 @@ ncclComm_t BaseCudaExecutionContext::getNcclComm() {
return state->nccl_comm;
}

cudaEvent_t BaseCudaExecutionContext::getAsyncPairToken(int64_t key) {
  auto* state = getResource<BaseCudaContextState>(kRalBaseCudaContextState);
  auto it = state->async_pair_tokens.find(key);
  return it != state->async_pair_tokens.end() ? it->second : nullptr;
}

void BaseCudaExecutionContext::addAsyncPairToken(int64_t key,
                                                 cudaEvent_t token) {
  auto* state = getResource<BaseCudaContextState>(kRalBaseCudaContextState);
  state->async_pair_tokens[key] = token;
}

void BaseCudaExecutionContext::removeAsyncPairToken(int64_t key) {
  auto* state = getResource<BaseCudaContextState>(kRalBaseCudaContextState);
  state->async_pair_tokens.erase(key);
}

void BaseCudaExecutionContext::setOutputDeleter(OutputBufferWrapper& output) {
{
if (synced) {
4 changes: 4 additions & 0 deletions tao_compiler/mlir/ral/context/base/cuda/cuda_context_impl.h
100644 → 100755
@@ -64,6 +64,10 @@ struct BaseCudaExecutionContext
~BaseCudaExecutionContext();

ncclComm_t getNcclComm();

cudaEvent_t getAsyncPairToken(int64_t key);
void addAsyncPairToken(int64_t key, cudaEvent_t token);
void removeAsyncPairToken(int64_t key);
// We need to sync on the gpu stream before we fetch the first output.
bool synced = false;
// all buffer allocated by the gpu_allocator
