diff --git a/csrc/runtime/compiled_kernel.h b/csrc/runtime/compiled_kernel.h index a021c5463ef..43d7d1f9db6 100644 --- a/csrc/runtime/compiled_kernel.h +++ b/csrc/runtime/compiled_kernel.h @@ -20,7 +20,6 @@ #include #include #include -// #include #include #include @@ -54,6 +53,9 @@ class RtcKernel : public NonCopyable { int64_t device_index_; }; +//! Class for compilation logic through nvRTC. It shouldn't hold any logic +//! associated with how to run a kernel, but how to compile it. It should also +//! contain any information about the kernel itself. class CompiledKernel : public NonCopyable { public: // NVF_API was added for nvfuser_extension. See examples/sinh_extension. @@ -119,11 +121,6 @@ class CompiledKernel : public NonCopyable { return lowered_->kernel()->as(); } - //! get register spills (load + store) of the compiled kernel - int getKernelRegisterSpills() const { - return compiled_kernel_->register_spills; - } - //! Returns the string of the compiled kernel NVF_API std::string kernelString() const { NVF_ERROR(!kernel_code_.empty(), "Kernel code not generated"); @@ -155,9 +152,6 @@ class CompiledKernel : public NonCopyable { const int64_t& groupId() const { return group_id_; } - // void setGroupId(int64_t gid) { - // group_id_ = gid; - // } bool validKernelId() const { return !kernel_id_.empty(); diff --git a/csrc/runtime/executor.cpp b/csrc/runtime/executor.cpp index 1296aa47870..c4c4ff12eb5 100644 --- a/csrc/runtime/executor.cpp +++ b/csrc/runtime/executor.cpp @@ -160,26 +160,25 @@ bool hasCpuScalarOutputs(Fusion* _fusion) { } } // namespace -bool KernelExecutor::supported(Fusion* _fusion) { +bool KernelExecutor::supported(Fusion* fusion) { FUSER_PERF_SCOPE("KernelExecutor::supported"); - return !hasCpuScalarOutputs(_fusion); + return !hasCpuScalarOutputs(fusion); } void KernelExecutor::compile( - Fusion* _fusion, + Fusion* fusion, const KernelArgumentHolder& args, const LaunchParams& launch_constraints, CompileParams compile_params, SchedulerType scheduler_type) { FUSER_PERF_SCOPE("KernelExecutor::compile"); - fusion_ = std::make_unique(*_fusion); + NVF_ERROR( - supported(fusion_.get()), + supported(fusion), "KernelExecutor does not support the Fusion provided."); NVF_ERROR( - !fusion_->outputs().empty(), - "No output found for this kernel, aborting."); + !fusion->outputs().empty(), "No output found for this kernel, aborting."); auto device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex()); @@ -194,7 +193,7 @@ void KernelExecutor::compile( //! Force index_type to int and disable magic zero if we detect that the //! kernel contains any TMA memory operations. - std::vector exprs = fusion_->exprs(); + std::vector exprs = fusion->exprs(); bool has_cp_async_bulk = std::any_of(exprs.begin(), exprs.end(), [](Expr* e) { return ir_utils::isCpAsyncBulk(e); }); @@ -251,7 +250,7 @@ void KernelExecutor::compile( // Lowered is needed to compute launch parameters as it uses the CA map. We // could modify that, but simply generating that part first. compiled_kernel_ = std::make_unique( - fusion_.get(), + fusion, compile_params, device, scheduler_type, @@ -920,7 +919,8 @@ std::vector KernelExecutor::run( NVF_ERROR(isCompiled()); NVF_ERROR( - outputs.empty() || (outputs.size() == fusion()->outputs().size()), + outputs.empty() || + (outputs.size() == compiledKernel()->fusion()->outputs().size()), __func__, " provided number of outputs does not match fusion output"); diff --git a/csrc/runtime/executor.h b/csrc/runtime/executor.h index 8fb8592ab2a..56a3f3e929c 100644 --- a/csrc/runtime/executor.h +++ b/csrc/runtime/executor.h @@ -145,7 +145,7 @@ class KernelExecutor : public ExecutorAbstract { if (compiledKernel()) { return true; } - return fusion_ != nullptr; + return false; }; void evictCache(size_t cache_id) { @@ -180,9 +180,6 @@ class KernelExecutor : public ExecutorAbstract { using ExecutorCompileTimeInfoCache = executor_utils::caching::ExecutorCompileTimeInfoCache; - const std::unique_ptr& fusion() const { - return fusion_; - } //! Internal knob used for debugging/profiling only void setExecuteKernelFlag(bool execute_kernel) { execute_kernel_ = execute_kernel; @@ -339,9 +336,6 @@ class KernelExecutor : public ExecutorAbstract { int64_t warp_size_ = 0; - // Initialized for non-compiled fusions - std::unique_ptr fusion_; - // lookup table to take short cut to retrieve recorded information in order to // launch kernels without re-inference parameters. std::unordered_map executor_entry_lookup_; diff --git a/csrc/runtime/executor_params.h b/csrc/runtime/executor_params.h index 4ca462b0afd..da8c998ec8b 100644 --- a/csrc/runtime/executor_params.h +++ b/csrc/runtime/executor_params.h @@ -22,6 +22,9 @@ struct CompileParams { bool enable_magic_zero = true; // if true, save ptxas info to compile log and check for register spilling bool enable_ptxas_verbose = false; + // Wrapping device in an optional allows us to initialize a value for the + // struct without having to select a specific device. Otherwise the default + // constructor will be deleted for the struct. std::optional device = std::nullopt; bool operator==(const CompileParams& other) const { diff --git a/tests/cpp/test_gpu3.cpp b/tests/cpp/test_gpu3.cpp index 3c91da40736..c4b1761d3b8 100644 --- a/tests/cpp/test_gpu3.cpp +++ b/tests/cpp/test_gpu3.cpp @@ -8103,7 +8103,7 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) { continue; } const auto* ke = exec->as(); - for (auto expr : ke->fusion()->exprs()) { + for (auto expr : ke->compiledKernel()->fusion()->exprs()) { if (expr->isA()) { auto slice = expr->as(); EXPECT_EQ(slice->in()->getMemoryType(), MemoryType::Global); diff --git a/tests/cpp/test_resize.cpp b/tests/cpp/test_resize.cpp index 55131346585..bdfa67cff99 100644 --- a/tests/cpp/test_resize.cpp +++ b/tests/cpp/test_resize.cpp @@ -4154,10 +4154,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputs) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4245,10 +4245,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape1) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4332,10 +4332,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4443,10 +4443,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4653,8 +4653,8 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) { EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); Fusion* scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4793,10 +4793,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -4984,10 +4984,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5123,10 +5123,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5312,10 +5312,10 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } @@ -5415,10 +5415,10 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) { const auto& heuristic_param = runtime->schedulerHeuristics()->heuristicsList().front(); EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize); - Fusion* scheduled_fusion = + auto scheduled_fusion = dynamic_cast(runtime->executors().at(0).get()) - ->fusion() - .get(); + ->compiledKernel() + ->fusion(); checkLoopDomainEquivalence( scheduled_fusion->outputs().at(0)->as()); } diff --git a/tests/cpp/utils.h b/tests/cpp/utils.h index 21c4d7a1aa5..10acc8f9d4e 100644 --- a/tests/cpp/utils.h +++ b/tests/cpp/utils.h @@ -826,7 +826,7 @@ bool isSchedulerInUse( const SchedulerType& scheduler_type); // Disable magic zero -const CompileParams matmul_cparams{DataType::Int32, 255, false}; +constexpr CompileParams matmul_cparams{DataType::Int32, 255, false}; // Utility to generate tensor with bias applied on the input tensor TensorView* biasEpilogue(TensorView* tensor, TensorView* bias);