From b59dfb3ea1f5aac5170796ab77c605fff5b8f5f8 Mon Sep 17 00:00:00 2001 From: Eduardo Souza Date: Thu, 29 Aug 2024 05:37:53 +0000 Subject: [PATCH] Adding fastpath allocation --- src/llvm-gc-interface-passes.h | 5 ++ src/llvm-late-gc-lowering.cpp | 139 +++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/src/llvm-gc-interface-passes.h b/src/llvm-gc-interface-passes.h index cb485751d407b6..09ee585bde45ee 100644 --- a/src/llvm-gc-interface-passes.h +++ b/src/llvm-gc-interface-passes.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -329,6 +330,7 @@ struct LateLowerGCFrame: private JuliaPassContext { private: CallInst *pgcstack; + Function *smallAllocFunc; void MaybeNoteDef(State &S, BBState &BBS, Value *Def, const ArrayRef &SafepointsSoFar, SmallVector &&RefinedPtr = SmallVector()); @@ -366,6 +368,9 @@ struct LateLowerGCFrame: private JuliaPassContext { void RefineLiveSet(LargeSparseBitVector &LS, State &S, ArrayRef CalleeRoots); Value *EmitTagPtr(IRBuilder<> &builder, Type *T, Type *T_size, Value *V); Value *EmitLoadTag(IRBuilder<> &builder, Type *T_size, Value *V); +#ifdef MMTK_GC + Value* lowerGCAllocBytesLate(CallInst *target, Function &F); +#endif }; // The final GC lowering pass. 
This pass lowers platform-agnostic GC
diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp
index 8d1d5ff73b2613..6dfacd9c27b181 100644
--- a/src/llvm-late-gc-lowering.cpp
+++ b/src/llvm-late-gc-lowering.cpp
@@ -2414,8 +2414,122 @@ void LateLowerGCFrame::PlaceRootsAndUpdateCalls(SmallVectorImpl<int> &Colors, St
     }
 }
 
+#ifdef MMTK_GC
+// Lower a julia.gc_alloc_bytes call into an inline bump-pointer fastpath plus a
+// jl_gc_small_alloc slowpath. Only calls whose size operand is a constant that
+// jl_gc_classify_pools accepts are rewritten, yielding the PHI that merges both
+// paths; otherwise `target` is returned and the call is left untouched.
+Value* LateLowerGCFrame::lowerGCAllocBytesLate(CallInst *target, Function &F)
+{
+    assert(target->arg_size() == 3);
+
+    // Should we generate fastpath allocation sequence here? We should always generate fastpath here for MMTk.
+    // Setting this to false will increase allocation overhead a lot, and should only be used for debugging.
+    constexpr bool INLINE_FASTPATH_ALLOCATION = true;
+
+    auto CI = dyn_cast<ConstantInt>(target->getArgOperand(1));
+    if (!INLINE_FASTPATH_ALLOCATION || !CI)
+        return target;
+
+    // This is strongly architecture and OS dependent
+    int osize;
+    int offset = jl_gc_classify_pools((size_t)CI->getZExtValue(), &osize);
+    if (offset < 0)
+        return target;
+
+    // In this case instead of lowering julia.gc_alloc_bytes to jl_gc_small_alloc
+    // We do a slowpath/fastpath check and lower it only on the slowpath, returning
+    // the cursor and updating it in the fastpath.
+    LLVMContext &ctx = F.getContext();
+    auto T_int8 = Type::getInt8Ty(ctx);
+    auto T_int32 = Type::getInt32Ty(ctx);
+    auto T_int64 = Type::getInt64Ty(ctx);
+    auto T_pint64 = PointerType::get(T_int64, 0);
+    auto i64 = [&](uint64_t v) { return ConstantInt::get(T_int64, v); };
+
+    IRBuilder<> builder(target);
+    auto ptls = target->getArgOperand(0);
+    auto type = target->getArgOperand(2);
+    auto pool_osize = i64(osize);
+
+    // Assuming we use the first immix allocator.
+    // FIXME: We should get the allocator index and type from MMTk.
+    auto allocator_offset = offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, mmtk_mutator) + offsetof(MMTkMutatorContext, allocators) + offsetof(Allocators, immix);
+
+    auto cursor_tls_i8 = builder.CreateGEP(T_int8, ptls, i64(allocator_offset + offsetof(ImmixAllocator, cursor)));
+    auto cursor_ptr = builder.CreateBitCast(cursor_tls_i8, T_pint64, "cursor_ptr");
+    auto cursor = builder.CreateLoad(T_int64, cursor_ptr, "cursor");
+
+    // Pad so that result + 8 is 16-byte aligned: delta = (-8 - cursor) & 15.
+    // Addresses are unsigned, hence wrapping arithmetic and an unsigned limit check.
+    auto delta_op = builder.CreateSub(ConstantInt::getSigned(T_int64, -8), cursor);
+    auto delta = builder.CreateAnd(delta_op, i64(15), "delta");
+    auto result = builder.CreateAdd(cursor, delta, "result");
+    auto new_cursor = builder.CreateAdd(result, pool_osize);
+
+    auto limit_tls_i8 = builder.CreateGEP(T_int8, ptls, i64(allocator_offset + offsetof(ImmixAllocator, limit)));
+    auto limit_ptr = builder.CreateBitCast(limit_tls_i8, T_pint64, "limit_ptr");
+    auto limit = builder.CreateLoad(T_int64, limit_ptr, "limit");
+    auto gt_limit = builder.CreateICmpUGT(new_cursor, limit);
+
+    auto slowpath = BasicBlock::Create(ctx, "slowpath", target->getFunction());
+    auto fastpath = BasicBlock::Create(ctx, "fastpath", target->getFunction());
+    auto next_instr = target->getNextNode();
+    auto weights = MDBuilder(ctx).createBranchWeights(1, 9); // slowpath is the unlikely edge
+    SplitBlockAndInsertIfThenElse(gt_limit, next_instr, &slowpath, &fastpath, false, false, weights);
+
+    builder.SetInsertPoint(next_instr);
+    auto phiNode = builder.CreatePHI(target->getType(), 2, "phi_fast_slow");
+
+    // slowpath
+    builder.SetInsertPoint(slowpath);
+    // NOTE(review): the pool offset operand is hard-coded to 1 rather than `offset`;
+    // presumably MMTk's jl_gc_small_alloc ignores it -- confirm.
+    auto pool_offs = ConstantInt::get(T_int32, 1);
+    auto new_call = builder.CreateCall(smallAllocFunc, { ptls, pool_offs, ConstantInt::get(T_int32, osize), type });
+    new_call->setAttributes(new_call->getCalledFunction()->getAttributes());
+    builder.CreateBr(next_instr->getParent());
+
+    // fastpath
+    builder.SetInsertPoint(fastpath);
+    builder.CreateStore(new_cursor, cursor_ptr);
+
+    // ptls->gc_tls.gc_num.allocd += osize;
+    // NOTE(review): this addresses gc_num itself, i.e. assumes allocd is its first field -- confirm.
+    auto pool_alloc_i8 = builder.CreateGEP(T_int8, ptls, i64(offsetof(jl_tls_states_t, gc_tls) + offsetof(jl_gc_tls_states_t, gc_num)));
+    auto pool_alloc_tls = builder.CreateBitCast(pool_alloc_i8, T_pint64, "pool_alloc");
+    auto pool_allocd = builder.CreateLoad(T_int64, pool_alloc_tls);
+    auto pool_allocd_total = builder.CreateAdd(pool_allocd, pool_osize);
+    builder.CreateStore(pool_allocd_total, pool_alloc_tls);
+
+    auto v_raw = builder.CreateAdd(result, i64(sizeof(jl_taggedvalue_t)));
+    auto v_as_ptr = builder.CreateIntToPtr(v_raw, target->getType());
+    builder.CreateBr(next_instr->getParent());
+
+    phiNode->addIncoming(new_call, slowpath);
+    phiNode->addIncoming(v_as_ptr, fastpath);
+    phiNode->takeName(target);
+    return phiNode;
+}
+
+// Replace all uses of `oldInstruction` with `newInstruction` and erase it, leaving `it`
+// at the following instruction; when both are the same value, just advance `it`.
+template<typename TIterator>
+static void replaceInstruction(Instruction *oldInstruction, Value *newInstruction, TIterator &it)
+{
+    if (newInstruction != oldInstruction) {
+        oldInstruction->replaceAllUsesWith(newInstruction);
+        it = oldInstruction->eraseFromParent();
+    }
+    else {
+        ++it;
+    }
+}
+#endif
+
 bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     initAll(*F.getParent());
+    smallAllocFunc = getOrDeclare(jl_well_known::GCSmallAlloc);
     LLVM_DEBUG(dbgs() << "GC ROOT PLACEMENT: Processing function " << F.getName() << "\n");
     if (!pgcstack_getter && !adoptthread_func)
         return CleanupIR(F, nullptr, CFGModified);
@@ -2430,6 +2544,31 @@ bool LateLowerGCFrame::runOnFunction(Function &F, bool *CFGModified) {
     std::map<Value *, std::pair<int, int>> CallFrames; // = OptimizeCallFrames(S, Ordering);
     PlaceRootsAndUpdateCalls(Colors, S, CallFrames);
     CleanupIR(F, &S, CFGModified);
+
+#ifdef MMTK_GC
+    // We lower the julia.gc_alloc_bytes intrinsic in this pass to insert slowpath/fastpath blocks for MMTk
+    // The intrinsic lookup is loop-invariant; if it is not declared there is nothing to lower.
+    if (auto GCAllocBytes = getOrNull(jl_intrinsics::GCAllocBytes)) {
+        for (BasicBlock &BB : F) {
+            for (auto it = BB.begin(); it != BB.end();) {
+                auto *CI = dyn_cast<CallInst>(&*it);
+                if (!CI || CI->getCalledOperand() != GCAllocBytes) {
+                    ++it;
+                    continue;
+                }
+                // Lowering splits BB right after the call: the head (call + new branch)
+                // stays in BB and the split-off tail is visited as a later block, so any
+                // remaining calls are still processed.
+                Value *lowered = lowerGCAllocBytesLate(CI, F);
+                // Guard the caller-supplied pointer, and only flag calls that were rewritten.
+                if (lowered != CI && CFGModified)
+                    *CFGModified = true;
+                replaceInstruction(CI, lowered, it);
+            }
+        }
+    }
+#endif
+
     return true;
 }
 