From 4409faad5862c74f3a96a16b9f92ced5fa1a31c5 Mon Sep 17 00:00:00 2001
From: Nicholas Sarkauskas <nsarkauskas@PDX01-M03-I40-DGX-A100-DL-1.nvidia.com>
Date: Tue, 14 Jan 2025 13:12:44 -0800
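Subject: [PATCH] Address review feedback on hir::LaunchKernel

- Widen LaunchKernel's executor index from int to int64_t and document
  what the index refers to.
- Remove LaunchKernel::sameAs and the hostOpToPost accessor.
- Tidy the spacing in LaunchKernel::toString output.
- Initialize `outputs` directly from the kernel run in
  HostIrEvaluator::handle(LaunchKernel*).
- Rename test_multidevice_host_ir_integration.cpp to
  test_host_ir_integration.cpp, base the test on NVFuserTest instead of
  MultiDeviceTest, and build it as part of test_host_ir.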

---
 CMakeLists.txt                                |  8 +++--
 csrc/host_ir/executor.cpp                     |  6 ++--
 csrc/host_ir/host_ir.cpp                      | 14 ++++-----
 csrc/host_ir/host_ir.h                        | 14 ++++-----
 ...ation.cpp => test_host_ir_integration.cpp} | 29 +++++++++----------
 5 files changed, 32 insertions(+), 39 deletions(-)
 rename tests/cpp/{test_multidevice_host_ir_integration.cpp => test_host_ir_integration.cpp} (63%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6c70757128d..9f182892ff1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -661,7 +661,6 @@ if(BUILD_TEST)
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communications.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_communicator.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir.cpp
-    ${NVFUSER_ROOT}/tests/cpp/test_multidevice_host_ir_integration.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_lower_communication.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_matmul.cpp
     ${NVFUSER_ROOT}/tests/cpp/test_multidevice_pipeline.cpp
@@ -700,7 +699,12 @@ if(BUILD_TEST)
   add_test(tutorial "${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp" "")
   list(APPEND TEST_BINARIES tutorial)
 
-  add_test(test_host_ir "${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp" "")
+  set(HOSTIR_TEST_SRCS)
+  list(APPEND HOSTIR_TEST_SRCS
+    ${NVFUSER_ROOT}/tests/cpp/test_host_irs.cpp
+    ${NVFUSER_ROOT}/tests/cpp/test_host_ir_integration.cpp
+  )
+  add_test(test_host_ir "${HOSTIR_TEST_SRCS}" "")
   list(APPEND TEST_BINARIES test_host_ir)
 
   if(BUILD_PYTHON)
diff --git a/csrc/host_ir/executor.cpp b/csrc/host_ir/executor.cpp
index baffa5cfbdf..0f9f3da6921 100644
--- a/csrc/host_ir/executor.cpp
+++ b/csrc/host_ir/executor.cpp
@@ -313,11 +313,9 @@ void HostIrEvaluator::handle(LaunchKernel* launch_kernel) {
     args.push(input_evaluation);
   }
 
-  // placeholder for storing the outputs
-  std::vector<at::Tensor> outputs;
-
   // run the compiled kernel
-  outputs = container_->getKernelExecutor(launch_kernel->getIndex())->run(args);
+  std::vector<at::Tensor> outputs =
+      container_->getKernelExecutor(launch_kernel->getIndex())->run(args);
 
   // Store the outputs in the context
   for (auto output_idx : c10::irange(outputs.size())) {
diff --git a/csrc/host_ir/host_ir.cpp b/csrc/host_ir/host_ir.cpp
index afe3433cc4d..780da3ca190 100644
--- a/csrc/host_ir/host_ir.cpp
+++ b/csrc/host_ir/host_ir.cpp
@@ -121,7 +121,7 @@ bool PostOnStream::sameAs(const Statement* other) const {
 
 LaunchKernel::LaunchKernel(
     IrBuilderPasskey passkey,
-    int hic_executor_index,
+    int64_t hic_executor_index,
     std::vector<Val*> inputs,
     std::vector<Val*> outputs)
     : Expr(passkey, std::move(inputs), std::move(outputs), {}),
@@ -131,12 +131,12 @@ NVFUSER_DEFINE_CLONE_AND_CREATE(LaunchKernel)
 
 std::string LaunchKernel::toString(int indent_size) const {
   std::stringstream ss;
-  indent(ss, indent_size) << "LaunchKernel ("
-                          << "Inputs:{";
+  indent(ss, indent_size) << "LaunchKernel("
+                          << "Inputs: {";
   std::for_each(inputs().begin(), inputs().end(), [&ss](auto input) {
     ss << input->toString(0) << ", ";
   });
-  ss << "}, Outputs:{";
+  ss << "}, Outputs: {";
   std::for_each(outputs().begin(), outputs().end(), [&ss](auto output) {
     ss << output->toString(0) << ", ";
   });
@@ -144,7 +144,7 @@ std::string LaunchKernel::toString(int indent_size) const {
   return ss.str();
 }
 
-int LaunchKernel::getIndex() const {
+int64_t LaunchKernel::getIndex() const {
   return hic_executor_index_;
 }
 
@@ -152,10 +152,6 @@ std::string LaunchKernel::toInlineString(int indent_size) const {
   NVF_CHECK(false, "Can not be printed inline");
 }
 
-bool LaunchKernel::sameAs(const Statement* other) const {
-  return false;
-}
-
 Stream::Stream(IrBuilderPasskey passkey, Val* index)
     : Val(passkey, ValType::Stream), index_(index) {}
 
diff --git a/csrc/host_ir/host_ir.h b/csrc/host_ir/host_ir.h
index 640183c38e5..479b7ac2ef6 100644
--- a/csrc/host_ir/host_ir.h
+++ b/csrc/host_ir/host_ir.h
@@ -120,7 +120,9 @@ class LaunchKernel : public Expr {
   using Expr::Expr;
   LaunchKernel(
       IrBuilderPasskey passkey,
-      int hic_executor_index, // TODO
+      int64_t hic_executor_index, // Index into the HostIrContainer's vector of
+                                  // KernelExecutors--i.e., the kernel this IR
+                                  // should launch
       std::vector<Val*> inputs,
       std::vector<Val*> outputs);
 
@@ -137,15 +139,9 @@ class LaunchKernel : public Expr {
     return "hir::LaunchKernel";
   }
 
-  int getIndex() const;
+  int64_t getIndex() const;
 
-  bool sameAs(const Statement* other) const override;
-
-  Expr* hostOpToPost() const {
-    return attributes_.at(0)->as<Expr>();
-  }
-
-  int hic_executor_index_;
+  int64_t hic_executor_index_;
 };
 
 class Stream : public Val {
diff --git a/tests/cpp/test_multidevice_host_ir_integration.cpp b/tests/cpp/test_host_ir_integration.cpp
similarity index 63%
rename from tests/cpp/test_multidevice_host_ir_integration.cpp
rename to tests/cpp/test_host_ir_integration.cpp
index 16c21b92461..d17a5d8f2b9 100644
--- a/tests/cpp/test_multidevice_host_ir_integration.cpp
+++ b/tests/cpp/test_host_ir_integration.cpp
@@ -1,6 +1,6 @@
 // clang-format off
 /*
-* SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+* SPDX-FileCopyrightText: Copyright (c) 2025-present NVIDIA CORPORATION & AFFILIATES.
 * All rights reserved.
 * SPDX-License-Identifier: BSD-3-Clause
 */
@@ -10,13 +10,15 @@
 #include <host_ir/executor.h>
 #include <ir/all_nodes.h>
 #include <ops/all_ops.h>
-#include <tests/cpp/multidevice.h>
+#include <tests/cpp/utils.h>
 
 namespace nvfuser {
 
 namespace hir {
 
-TEST_F(MultiDeviceTest, LaunchKernel) {
+using HostIrIntegrationTest = NVFuserTest;
+
+TEST_F(HostIrIntegrationTest, LaunchKernel) {
   Fusion fusion;
   FusionGuard fg(&fusion);
   TensorView* tv0 = makeSymbolicTensor(2);
@@ -40,26 +42,23 @@ TEST_F(MultiDeviceTest, LaunchKernel) {
   auto tv2 = ir_cloner.clone(tv0);
   auto tv3 = ir_cloner.clone(tv1);
 
-  std::vector<Val*> lk_inputs = {tv2};
-  std::vector<Val*> lk_outputs = {tv3};
+  std::vector<Val*> launch_kernel_inputs = {tv2};
+  std::vector<Val*> launch_kernel_outputs = {tv3};
 
-  hic->addInput(lk_inputs.back());
-  hic->addOutput(lk_outputs.back());
+  hic->addInput(launch_kernel_inputs.back());
+  hic->addOutput(launch_kernel_outputs.back());
 
-  auto launch_kernel =
-      IrBuilder::create<LaunchKernel>(0, lk_inputs, lk_outputs);
+  auto launch_kernel = IrBuilder::create<LaunchKernel>(
+      0, launch_kernel_inputs, launch_kernel_outputs);
 
   hic->pushBackTopLevelExprs(launch_kernel);
 
-  HostIrEvaluatorParams params;
-  params.use_fusion_executor_cache = false;
-  HostIrEvaluator hie(std::move(hic), communicator_, params);
+  HostIrEvaluator hie(std::move(hic));
 
   at::Tensor output = at::empty({32, 32}, options);
-  auto outputs =
-      hie.runWithInput({{lk_inputs.back(), t0}, {lk_outputs.back(), output}});
+  auto outputs = hie.runWithInput({{tv2, t0}, {tv3, output}});
 
-  ASSERT_TRUE(outputs[0].equal(t0));
+  EXPECT_TRUE(outputs[0].equal(t0));
 }
 
 } // namespace hir