PR review and minor cleanup.
csarofeen committed Jan 11, 2025
1 parent c199020 commit e008bc7
Showing 7 changed files with 48 additions and 57 deletions.
12 changes: 3 additions & 9 deletions csrc/runtime/compiled_kernel.h
@@ -20,7 +20,6 @@
#include <runtime/executor_params.h>
#include <runtime/executor_utils.h>
#include <scheduler/scheduler_types.h>
// #include <serde/fusion_cache_generated.h>
#include <utils.h>
#include <atomic>

@@ -54,6 +53,9 @@ class RtcKernel : public NonCopyable {
int64_t device_index_;
};

//! Class for compilation logic through nvRTC. It shouldn't hold any logic
//! associated with how to run a kernel, only how to compile it. It should also
//! contain any information about the kernel itself.
class CompiledKernel : public NonCopyable {
public:
// NVF_API was added for nvfuser_extension. See examples/sinh_extension.
@@ -119,11 +121,6 @@ class CompiledKernel : public NonCopyable {
return lowered_->kernel()->as<Fusion>();
}

//! get register spills (load + store) of the compiled kernel
int getKernelRegisterSpills() const {
return compiled_kernel_->register_spills;
}

//! Returns the string of the compiled kernel
NVF_API std::string kernelString() const {
NVF_ERROR(!kernel_code_.empty(), "Kernel code not generated");
@@ -155,9 +152,6 @@ class CompiledKernel : public NonCopyable {
const int64_t& groupId() const {
return group_id_;
}
// void setGroupId(int64_t gid) {
// group_id_ = gid;
// }

bool validKernelId() const {
return !kernel_id_.empty();
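The class comment added above describes the intended split of responsibilities: CompiledKernel owns compilation through nvRTC and whatever is known about the generated kernel, while launching and running stay in KernelExecutor. A minimal, hedged sketch of the compile-side surface, using only the accessors visible in this diff (the constructor arguments and the compile entry point are elided here and left unspecified):

    // Sketch only: inspect a CompiledKernel without running anything.
    void inspectCompiledKernel(CompiledKernel& ck) {
      // Generated CUDA source; kernelString() errors if codegen has not run yet.
      std::string cuda_src = ck.kernelString();
      // The lowered kernel viewed as a Fusion (lowered_->kernel()->as<Fusion>()).
      Fusion* kernel_ir = ck.fusion();
      // Scheduling metadata such as groupId() and validKernelId() also lives here;
      // none of these calls launch or run the kernel.
      (void)cuda_src;
      (void)kernel_ir;
    }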
20 changes: 10 additions & 10 deletions csrc/runtime/executor.cpp
@@ -160,26 +160,25 @@ bool hasCpuScalarOutputs(Fusion* _fusion) {
}
} // namespace

bool KernelExecutor::supported(Fusion* _fusion) {
bool KernelExecutor::supported(Fusion* fusion) {
FUSER_PERF_SCOPE("KernelExecutor::supported");
return !hasCpuScalarOutputs(_fusion);
return !hasCpuScalarOutputs(fusion);
}

void KernelExecutor::compile(
Fusion* _fusion,
Fusion* fusion,
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints,
CompileParams compile_params,
SchedulerType scheduler_type) {
FUSER_PERF_SCOPE("KernelExecutor::compile");
fusion_ = std::make_unique<Fusion>(*_fusion);

NVF_ERROR(
supported(fusion_.get()),
supported(fusion),
"KernelExecutor does not support the Fusion provided.");

NVF_ERROR(
!fusion_->outputs().empty(),
"No output found for this kernel, aborting.");
!fusion->outputs().empty(), "No output found for this kernel, aborting.");

auto device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());

@@ -194,7 +193,7 @@ void KernelExecutor::compile(

//! Force index_type to int and disable magic zero if we detect that the
//! kernel contains any TMA memory operations.
std::vector<Expr*> exprs = fusion_->exprs();
std::vector<Expr*> exprs = fusion->exprs();
bool has_cp_async_bulk = std::any_of(exprs.begin(), exprs.end(), [](Expr* e) {
return ir_utils::isCpAsyncBulk(e);
});
@@ -251,7 +250,7 @@
// Lowered is needed to compute launch parameters as it uses the CA map. We
// could modify that, but for now we simply generate that part first.
compiled_kernel_ = std::make_unique<CompiledKernel>(
fusion_.get(),
fusion,
compile_params,
device,
scheduler_type,
@@ -920,7 +919,8 @@ std::vector<at::Tensor> KernelExecutor::run(

NVF_ERROR(isCompiled());
NVF_ERROR(
outputs.empty() || (outputs.size() == fusion()->outputs().size()),
outputs.empty() ||
(outputs.size() == compiledKernel()->fusion()->outputs().size()),
__func__,
" provided number of outputs does not match fusion output");

8 changes: 1 addition & 7 deletions csrc/runtime/executor.h
@@ -145,7 +145,7 @@ class KernelExecutor : public ExecutorAbstract {
if (compiledKernel()) {
return true;
}
return fusion_ != nullptr;
return false;
};

void evictCache(size_t cache_id) {
@@ -180,9 +180,6 @@
using ExecutorCompileTimeInfoCache =
executor_utils::caching::ExecutorCompileTimeInfoCache;

const std::unique_ptr<Fusion>& fusion() const {
return fusion_;
}
//! Internal knob used for debugging/profiling only
void setExecuteKernelFlag(bool execute_kernel) {
execute_kernel_ = execute_kernel;
@@ -339,9 +336,6 @@

int64_t warp_size_ = 0;

// Initialized for non-compiled fusions
std::unique_ptr<Fusion> fusion_;

// Lookup table used as a shortcut to retrieve recorded information in order to
// launch kernels without re-inferring parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
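With the fusion_ member and its fusion() accessor removed, a KernelExecutor no longer keeps its own Fusion copy; the scheduled Fusion is owned by the CompiledKernel. A hedged sketch of the caller-side migration, mirroring the test updates later in this commit:

    // Before: Fusion* f = ke->fusion().get();
    // After:  go through the compiled kernel instead.
    const auto* ke = exec->as<KernelExecutor>();
    Fusion* scheduled_fusion = ke->compiledKernel()->fusion();
    for (Expr* expr : scheduled_fusion->exprs()) {
      // inspect scheduled expressions, e.g. check SliceOp memory types
    }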
3 changes: 3 additions & 0 deletions csrc/runtime/executor_params.h
@@ -22,6 +22,9 @@ struct CompileParams {
bool enable_magic_zero = true;
// if true, save ptxas info to compile log and check for register spilling
bool enable_ptxas_verbose = false;
// Wrapping device in an optional allows us to initialize a value for the
// struct without having to select a specific device. Otherwise the default
// constructor will be deleted for the struct.
std::optional<c10::Device> device = std::nullopt;

bool operator==(const CompileParams& other) const {
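The comment on the new device field states the constraint this works around: c10::Device has no default constructor, so a plain member with no initializer would delete CompileParams' implicitly declared default constructor. A small, hypothetical illustration of that rule (Probe and ProbeOpt are stand-ins, not types in this codebase):

    struct Probe {
      c10::Device device;  // this member has no default constructor
    };
    // Probe p{};  // ill-formed: Probe's default constructor is implicitly deleted

    struct ProbeOpt {
      std::optional<c10::Device> device = std::nullopt;  // "no device selected yet"
    };
    ProbeOpt p;  // OK; CompileParams stays default-constructible the same way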
2 changes: 1 addition & 1 deletion tests/cpp/test_gpu3.cpp
@@ -8103,7 +8103,7 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
continue;
}
const auto* ke = exec->as<KernelExecutor>();
for (auto expr : ke->fusion()->exprs()) {
for (auto expr : ke->compiledKernel()->fusion()->exprs()) {
if (expr->isA<SliceOp>()) {
auto slice = expr->as<SliceOp>();
EXPECT_EQ(slice->in()->getMemoryType(), MemoryType::Global);
58 changes: 29 additions & 29 deletions tests/cpp/test_resize.cpp
@@ -4154,10 +4154,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4245,10 +4245,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape1) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4332,10 +4332,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4443,10 +4443,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4653,8 +4653,8 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4793,10 +4793,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4984,10 +4984,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5123,10 +5123,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5312,10 +5312,10 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5415,10 +5415,10 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
2 changes: 1 addition & 1 deletion tests/cpp/utils.h
@@ -826,7 +826,7 @@ bool isSchedulerInUse(
const SchedulerType& scheduler_type);

// Disable magic zero
const CompileParams matmul_cparams{DataType::Int32, 255, false};
constexpr CompileParams matmul_cparams{DataType::Int32, 255, false};

// Utility to generate tensor with bias applied on the input tensor
TensorView* biasEpilogue(TensorView* tensor, TensorView* bias);
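matmul_cparams changes from const to constexpr. As a hedged note on the difference: both forms give this namespace-scope constant internal linkage in each translation unit that includes the header, but constexpr additionally requires the initializer to be a constant expression, so a default that cannot be evaluated at compile time would fail to compile rather than surface at static initialization. A minimal illustration with a stand-in aggregate (not the real CompileParams):

    struct ParamsSketch {
      int maxrregcount = 255;
      bool enable_magic_zero = true;
    };
    constexpr ParamsSketch kSketch{255, false};
    static_assert(kSketch.maxrregcount == 255);  // verified at compile time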
