PR review and minor cleanup.
csarofeen committed Jan 11, 2025
1 parent c199020 commit e008bc7
Showing 7 changed files with 48 additions and 57 deletions.
12 changes: 3 additions & 9 deletions csrc/runtime/compiled_kernel.h
@@ -20,7 +20,6 @@
#include <runtime/executor_params.h>
#include <runtime/executor_utils.h>
#include <scheduler/scheduler_types.h>
// #include <serde/fusion_cache_generated.h>
#include <utils.h>
#include <atomic>

@@ -54,6 +53,9 @@ class RtcKernel : public NonCopyable {
int64_t device_index_;
};

//! Class for compilation logic through nvRTC. It shouldn't hold any logic
//! associated with how to run a kernel, only how to compile it. It should also
//! contain any information about the kernel itself.
class CompiledKernel : public NonCopyable {
public:
// NVF_API was added for nvfuser_extension. See examples/sinh_extension.
@@ -119,11 +121,6 @@ class CompiledKernel : public NonCopyable {
return lowered_->kernel()->as<Fusion>();
}

//! get register spills (load + store) of the compiled kernel
int getKernelRegisterSpills() const {
return compiled_kernel_->register_spills;
}

//! Returns the string of the compiled kernel
NVF_API std::string kernelString() const {
NVF_ERROR(!kernel_code_.empty(), "Kernel code not generated");
@@ -155,9 +152,6 @@ class CompiledKernel : public NonCopyable {
const int64_t& groupId() const {
return group_id_;
}
// void setGroupId(int64_t gid) {
// group_id_ = gid;
// }

bool validKernelId() const {
return !kernel_id_.empty();
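The class comment added above describes the intended split of responsibilities: CompiledKernel owns compilation through nvRTC and whatever is known about the generated kernel, while launching and running stay in KernelExecutor. A minimal, hedged sketch of the compile-side surface, using only the accessors visible in this diff (the constructor arguments and the compile entry point are elided here and left unspecified):

    // Sketch only: inspect a CompiledKernel without running anything.
    void inspectCompiledKernel(CompiledKernel& ck) {
      // Generated CUDA source; kernelString() errors if codegen has not run yet.
      std::string cuda_src = ck.kernelString();
      // The lowered kernel viewed as a Fusion (lowered_->kernel()->as<Fusion>()).
      Fusion* kernel_ir = ck.fusion();
      // Scheduling metadata such as groupId() and validKernelId() also lives here;
      // none of these calls launch or run the kernel.
      (void)cuda_src;
      (void)kernel_ir;
    }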
20 changes: 10 additions & 10 deletions csrc/runtime/executor.cpp
@@ -160,26 +160,25 @@ bool hasCpuScalarOutputs(Fusion* _fusion) {
}
} // namespace

bool KernelExecutor::supported(Fusion* _fusion) {
bool KernelExecutor::supported(Fusion* fusion) {
FUSER_PERF_SCOPE("KernelExecutor::supported");
return !hasCpuScalarOutputs(_fusion);
return !hasCpuScalarOutputs(fusion);
}

void KernelExecutor::compile(
Fusion* _fusion,
Fusion* fusion,
const KernelArgumentHolder& args,
const LaunchParams& launch_constraints,
CompileParams compile_params,
SchedulerType scheduler_type) {
FUSER_PERF_SCOPE("KernelExecutor::compile");
fusion_ = std::make_unique<Fusion>(*_fusion);

NVF_ERROR(
supported(fusion_.get()),
supported(fusion),
"KernelExecutor does not support the Fusion provided.");

NVF_ERROR(
!fusion_->outputs().empty(),
"No output found for this kernel, aborting.");
!fusion->outputs().empty(), "No output found for this kernel, aborting.");

auto device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());

@@ -194,7 +193,7 @@ void KernelExecutor::compile(

//! Force index_type to int and disable magic zero if we detect that the
//! kernel contains any TMA memory operations.
std::vector<Expr*> exprs = fusion_->exprs();
std::vector<Expr*> exprs = fusion->exprs();
bool has_cp_async_bulk = std::any_of(exprs.begin(), exprs.end(), [](Expr* e) {
return ir_utils::isCpAsyncBulk(e);
});
@@ -251,7 +250,7 @@
// Lowered is needed to compute launch parameters as it uses the CA map. We
// could modify that, but for now we simply generate that part first.
compiled_kernel_ = std::make_unique<CompiledKernel>(
fusion_.get(),
fusion,
compile_params,
device,
scheduler_type,
@@ -920,7 +919,8 @@ std::vector<at::Tensor> KernelExecutor::run(

NVF_ERROR(isCompiled());
NVF_ERROR(
outputs.empty() || (outputs.size() == fusion()->outputs().size()),
outputs.empty() ||
(outputs.size() == compiledKernel()->fusion()->outputs().size()),
__func__,
" provided number of outputs does not match fusion output");

8 changes: 1 addition & 7 deletions csrc/runtime/executor.h
@@ -145,7 +145,7 @@ class KernelExecutor : public ExecutorAbstract {
if (compiledKernel()) {
return true;
}
return fusion_ != nullptr;
return false;
};

void evictCache(size_t cache_id) {
@@ -180,9 +180,6 @@
using ExecutorCompileTimeInfoCache =
executor_utils::caching::ExecutorCompileTimeInfoCache;

const std::unique_ptr<Fusion>& fusion() const {
return fusion_;
}
//! Internal knob used for debugging/profiling only
void setExecuteKernelFlag(bool execute_kernel) {
execute_kernel_ = execute_kernel;
@@ -339,9 +336,6 @@

int64_t warp_size_ = 0;

// Initialized for non-compiled fusions
std::unique_ptr<Fusion> fusion_;

// Lookup table used as a shortcut to retrieve recorded information in order to
// launch kernels without re-inferring parameters.
std::unordered_map<size_t, ExecutorEntry> executor_entry_lookup_;
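With the fusion_ member and its fusion() accessor removed, a KernelExecutor no longer keeps its own Fusion copy; the scheduled Fusion is owned by the CompiledKernel. A hedged sketch of the caller-side migration, mirroring the test updates later in this commit:

    // Before: Fusion* f = ke->fusion().get();
    // After:  go through the compiled kernel instead.
    const auto* ke = exec->as<KernelExecutor>();
    Fusion* scheduled_fusion = ke->compiledKernel()->fusion();
    for (Expr* expr : scheduled_fusion->exprs()) {
      // inspect scheduled expressions, e.g. check SliceOp memory types
    }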
3 changes: 3 additions & 0 deletions csrc/runtime/executor_params.h
@@ -22,6 +22,9 @@ struct CompileParams {
bool enable_magic_zero = true;
// if true, save ptxas info to compile log and check for register spilling
bool enable_ptxas_verbose = false;
// Wrapping device in an optional allows us to initialize a value for the
// struct without having to select a specific device. Otherwise the default
// constructor will be deleted for the struct.
std::optional<c10::Device> device = std::nullopt;

bool operator==(const CompileParams& other) const {
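The comment on the new device field states the constraint this works around: c10::Device has no default constructor, so a plain member with no initializer would delete CompileParams' implicitly declared default constructor. A small, hypothetical illustration of that rule (Probe and ProbeOpt are stand-ins, not types in this codebase):

    struct Probe {
      c10::Device device;  // this member has no default constructor
    };
    // Probe p{};  // ill-formed: Probe's default constructor is implicitly deleted

    struct ProbeOpt {
      std::optional<c10::Device> device = std::nullopt;  // "no device selected yet"
    };
    ProbeOpt p;  // OK; CompileParams stays default-constructible the same way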
2 changes: 1 addition & 1 deletion tests/cpp/test_gpu3.cpp
@@ -8103,7 +8103,7 @@ TEST_F(NVFuserTest, AvoidCachingSliceInput) {
continue;
}
const auto* ke = exec->as<KernelExecutor>();
for (auto expr : ke->fusion()->exprs()) {
for (auto expr : ke->compiledKernel()->fusion()->exprs()) {
if (expr->isA<SliceOp>()) {
auto slice = expr->as<SliceOp>();
EXPECT_EQ(slice->in()->getMemoryType(), MemoryType::Global);
58 changes: 29 additions & 29 deletions tests/cpp/test_resize.cpp
@@ -4154,10 +4154,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4245,10 +4245,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape1) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4332,10 +4332,10 @@ TEST_P(ResizeSchedulerTest, PropagateSliceToInputsWithReshape2) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4443,10 +4443,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs1) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4653,8 +4653,8 @@ TEST_F(ResizeSchedulerTest, PropagateMultipleSlicesToInputs3) {
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4793,10 +4793,10 @@ TEST_P(ResizeSchedulerTest, PropagateMultipleSlicesToInputs5) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -4984,10 +4984,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCat) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5123,10 +5123,10 @@ TEST_P(ResizeSchedulerTest, SliceRotateCatResidual) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5312,10 +5312,10 @@ TEST_P(ResizeSchedulerTest, PropagatePadToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
@@ -5415,10 +5415,10 @@ TEST_P(ResizeSchedulerTest, PropagateCatToInputs) {
const auto& heuristic_param =
runtime->schedulerHeuristics()->heuristicsList().front();
EXPECT_EQ(heuristic_param->scheduler_type, SchedulerType::Resize);
Fusion* scheduled_fusion =
auto scheduled_fusion =
dynamic_cast<KernelExecutor*>(runtime->executors().at(0).get())
->fusion()
.get();
->compiledKernel()
->fusion();
checkLoopDomainEquivalence(
scheduled_fusion->outputs().at(0)->as<TensorView>());
}
2 changes: 1 addition & 1 deletion tests/cpp/utils.h
@@ -826,7 +826,7 @@ bool isSchedulerInUse(
const SchedulerType& scheduler_type);

// Disable magic zero
const CompileParams matmul_cparams{DataType::Int32, 255, false};
constexpr CompileParams matmul_cparams{DataType::Int32, 255, false};

// Utility to generate tensor with bias applied on the input tensor
TensorView* biasEpilogue(TensorView* tensor, TensorView* bias);
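matmul_cparams changes from const to constexpr. As a hedged note on the difference: both forms give this namespace-scope constant internal linkage in each translation unit that includes the header, but constexpr additionally requires the initializer to be a constant expression, so a default that cannot be evaluated at compile time would fail to compile rather than surface at static initialization. A minimal illustration with a stand-in aggregate (not the real CompileParams):

    struct ParamsSketch {
      int maxrregcount = 255;
      bool enable_magic_zero = true;
    };
    constexpr ParamsSketch kSketch{255, false};
    static_assert(kSketch.maxrregcount == 255);  // verified at compile time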
