Commit 63b10ba

committed
clean up
Signed-off-by: zhongboz <[email protected]>
1 parent 7116a07 commit 63b10ba

8 files changed: +112 −201 lines

benchmarks/linear/benchmark_grouped_linear.py

Lines changed: 14 additions & 18 deletions

@@ -8,17 +8,18 @@
 from transformer_engine.common.recipe import Float8BlockScaling
 from transformer_engine.pytorch.fp8 import fp8_autocast
 from contextlib import nullcontext
+
 RECIPES = {
     "bf16": None,
     "fp8_sub_channel": Float8BlockScaling(),
 }


-def run_linear_multiple_steps(
-    layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None
-):
+def run_linear_multiple_steps(layer, x, m_splits, mode, gradient, run_num_steps=1, recipe=None):
     assert mode in ["fwd_only", "fwd_bwd"]
-    fp8_context = fp8_autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
+    fp8_context = (
+        fp8_autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
+    )
     # print(f"fp8_context: {fp8_context} and is it nullcontext? {isinstance(fp8_context, nullcontext)}")

     if mode == "fwd_only":
@@ -67,13 +68,11 @@ def benchmark_linear(
     num_gemms=4,
 ):
     params_dtype = torch.bfloat16
-    recipe =RECIPES[recipe_name]
+    recipe = RECIPES[recipe_name]

     in_features = x.shape[1]
     out_features = ws[0].shape[0]
-    gradient = torch.ones(
-        (x.shape[0], out_features), dtype=torch.bfloat16, device=x.device
-    )
+    gradient = torch.ones((x.shape[0], out_features), dtype=torch.bfloat16, device=x.device)

     layer = GroupedLinear(
         num_gemms,
@@ -97,7 +96,10 @@ def benchmark_linear(
     label = f"{recipe_name}_{'grouped'}"
     torch.cuda.nvtx.range_push(label)
     timing = benchmark.Timer(
-        stmt="run_linear_multiple_steps(layer, x, m_splits, mode, gradient, num_microbatches, recipe)",
+        stmt=(
+            "run_linear_multiple_steps(layer, x, m_splits, mode, gradient, num_microbatches,"
+            " recipe)"
+        ),
         globals={
             "run_linear_multiple_steps": run_linear_multiple_steps,
             "layer": layer,
@@ -116,20 +118,15 @@ def benchmark_linear(
     return timing_ms


-def run_benchmark_linear(
-    mkns, recipe_name, use_bias, num_gemms=4
-):
+def run_benchmark_linear(mkns, recipe_name, use_bias, num_gemms=4):
     data = []
     assert not use_bias, "Bias is not supported for GroupedLinear benchmark"

     print(f"========== Benchmarking {recipe_name} ==========")
     for m, k, n in mkns:
         device = "cuda"
         x = torch.randn((m, k), dtype=torch.bfloat16, device=device, requires_grad=True)
-        ws = [
-            torch.randn((n, k), dtype=torch.bfloat16, device=device)
-            for _ in range(num_gemms)
-        ]
+        ws = [torch.randn((n, k), dtype=torch.bfloat16, device=device) for _ in range(num_gemms)]
         assert m % num_gemms == 0
         m_splits = [m // num_gemms] * num_gemms
         # Bias is not supported for GroupedLinear benchmark
@@ -192,7 +189,7 @@ def run_benchmark_linear(
 # Set the MKN values to benchmark
 mkns = []
 for m in [1024]:
-# for m in [4096, 8192, 16384]:
+    # for m in [4096, 8192, 16384]:
     # for n in [1024, 2048, 4096, 8192, 16384]:
     for n in [3072]:
         for k in [4096]:
@@ -236,6 +233,5 @@ def run_benchmark_linear(

 print(df_linears)

-
 if args.profile:
     torch.autograd.profiler.emit_nvtx().__exit__(None, None, None)
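
The detail worth noting in the first hunk is the recipe-gated FP8 context: when recipe is None (the bf16 baseline), the benchmark enters nullcontext() instead of fp8_autocast, so both precisions share a single call path. Below is a minimal sketch of that pattern outside the benchmark harness. It assumes TE's GroupedLinear(num_gemms, in_features, out_features) constructor and the layer(x, m_splits) call convention the benchmark uses; the shapes mirror the benchmark defaults (m=1024, k=4096, n=3072, num_gemms=4).

from contextlib import nullcontext

import torch
from transformer_engine.common.recipe import Float8BlockScaling
from transformer_engine.pytorch import GroupedLinear
from transformer_engine.pytorch.fp8 import fp8_autocast

def forward_with_optional_fp8(layer, x, m_splits, recipe=None):
    # Enter fp8_autocast only when a recipe is supplied; nullcontext() lets
    # the bf16 baseline run through the identical code path.
    ctx = fp8_autocast(enabled=True, fp8_recipe=recipe) if recipe is not None else nullcontext()
    with ctx:
        return layer(x, m_splits)

layer = GroupedLinear(4, 4096, 3072, params_dtype=torch.bfloat16, device="cuda")
x = torch.randn(1024, 4096, dtype=torch.bfloat16, device="cuda")
out = forward_with_optional_fp8(layer, x, m_splits=[256] * 4, recipe=Float8BlockScaling())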

transformer_engine/pytorch/csrc/common.h

Lines changed: 2 additions & 1 deletion

@@ -195,7 +195,8 @@ class Float8BlockQuantizer : public Quantizer {
                              const std::vector<size_t>& shape, DType dtype,
                              std::optional<at::Tensor> rowwise_data = std::nullopt) const override;

-  std::pair<size_t, size_t> get_scale_shape(const std::vector<size_t>& shape, bool columnwise) const;
+  std::pair<size_t, size_t> get_scale_shape(const std::vector<size_t>& shape,
+                                            bool columnwise) const;
 };

 class MXFP8Quantizer : public Quantizer {
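
get_scale_shape is what the bulk-allocation path in transpose.cpp calls to size per-split scale-inverse buffers before any output tensor exists, which is why it appears on the class here. As a rough illustration only: block scaling stores one fp32 scale per block, so the scale shape is essentially a ceil-division of the (possibly transposed) data shape. A hypothetical Python rendering under that assumption; the real C++ implementation derives 1D vs 2D from the quantizer's block_scaling_dim and may pad or align the result differently:

import math

def scale_shape(rows, cols, columnwise, block=128, block_scaling_dim=2):
    # Hypothetical sketch: 2D scaling keeps one scale per block x block tile;
    # 1D keeps one scale per block-wide segment of each row. Columnwise usage
    # sizes the scales for the transposed data.
    if columnwise:
        rows, cols = cols, rows
    if block_scaling_dim == 2:
        return (math.ceil(rows / block), math.ceil(cols / block))
    return (rows, math.ceil(cols / block))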

transformer_engine/pytorch/csrc/extensions.h

Lines changed: 2 additions & 4 deletions

@@ -107,10 +107,8 @@ namespace transformer_engine::pytorch {
  * Transpose
  **************************************************************************************************/

-std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor inpput_view, std::vector<int> m_splits,
-                                                 std::vector<py::handle> quantizer_list);
-
-py::object simple_sanity_check(at::Tensor input, py::handle quantizer);
+std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor inpput_view, std::vector<int> m_splits,
+                                                 std::vector<py::handle> quantizer_list);

 std::vector<py::object> fused_multi_quantize(std::vector<at::Tensor> input_list,
                                              std::optional<std::vector<py::object>> output_list,

transformer_engine/pytorch/csrc/extensions/pybind.cpp

Lines changed: 4 additions & 6 deletions

@@ -12,14 +12,13 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>

+#include <iostream>
 #include <stdexcept>

 #include "../common.h"
 #include "../extensions.h"
 #include "common.h"

-#include <iostream>
-
 namespace transformer_engine::pytorch {

 PyTypeObject *Float8TensorPythonClass = nullptr;  /// TODO Remove
@@ -201,10 +200,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
         py::arg("ln_out"), py::arg("quantizer"), py::arg("otype"), py::arg("sm_margin"),
         py::arg("zero_centered_gamma"));
   m.def("rmsnorm_bwd", &rmsnorm_bwd, "Backward of RMSNorm");
-  m.def("fused_bulk_alloc_outputs", &transformer_engine::pytorch::fused_bulk_alloc_outputs, "Fused Bulk Alloc Outputs",
-        py::arg("input_view"), py::arg("m_splits"), py::arg("quantizer_list"));
-  m.def("simple_sanity_check", &transformer_engine::pytorch::simple_sanity_check, "foo",
-        py::arg("input"), py::arg("quantizer"));
+  m.def("fused_bulk_alloc_outputs", &transformer_engine::pytorch::fused_bulk_alloc_outputs,
+        "Fused Bulk Alloc Outputs", py::arg("input_view"), py::arg("m_splits"),
+        py::arg("quantizer_list"));
   m.def("fused_multi_quantize", &transformer_engine::pytorch::fused_multi_quantize,
         "Fused Multi-tensor Cast + Transpose", py::arg("input_list"), py::arg("output_list"),
         py::arg("quantizer_list"), py::arg("otype"));
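
After this hunk the extension exposes fused_bulk_alloc_outputs with three named arguments, and the throwaway simple_sanity_check binding (docstring "foo") is gone. A hedged sketch of the Python call site follows; the diff confirms the binding and its argument names, while the module import name and the surrounding setup are assumptions for illustration:

import transformer_engine_torch as tex  # assumed import name for the extension module

def bulk_alloc_fp8_outputs(input_view, m_splits, quantizer_list):
    # input_view: flat (sum(m_splits), hidden) activations; quantizer_list
    # holds one blockwise quantizer per split, built elsewhere.
    outputs = tex.fused_bulk_alloc_outputs(
        input_view=input_view, m_splits=m_splits, quantizer_list=quantizer_list
    )
    # transpose.cpp appends one trailing object that owns the two flat backing
    # buffers, so the per-split quantized tensors are everything but the last.
    return outputs[:-1], outputs[-1]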

transformer_engine/pytorch/csrc/extensions/quantizer.cpp

Lines changed: 3 additions & 2 deletions

@@ -387,7 +387,8 @@ std::pair<TensorWrapper, py::object> Float8BlockQuantizer::create_tensor(
   return {std::move(tensor), std::move(ret)};
 }

-std::pair<size_t, size_t> Float8BlockQuantizer::get_scale_shape(const std::vector<size_t>& shape, bool columnwise) const {
+std::pair<size_t, size_t> Float8BlockQuantizer::get_scale_shape(const std::vector<size_t>& shape,
+                                                                bool columnwise) const {
   using namespace pybind11::literals;
   std::vector<int64_t> torch_shape;
   size_t numel = 1;
@@ -418,7 +419,7 @@ std::pair<size_t, size_t> Float8BlockQuantizer::get_scale_shape(const std::vecto
                 block_scaling_dim);
     }
     scale_shape = {sinv0, sinv1};
-  }else {
+  } else {
     size_t sinv0 = 0;
     size_t sinv1 = 0;
     if (block_scaling_dim == 2) {

transformer_engine/pytorch/csrc/extensions/transpose.cpp

Lines changed: 73 additions & 63 deletions

@@ -4,18 +4,18 @@
  * See LICENSE for license information.
  ************************************************************************/

-#include <optional>
 #include <pybind.h>

+#include <iostream>
+#include <optional>
+
 #include "extensions.h"
 #include "pybind.h"

-#include <iostream>
-
 namespace transformer_engine::pytorch {

-std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor input_view, std::vector<int> m_splits,
-                                                 std::vector<py::handle> quantizer_list) {
+std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor input_view, std::vector<int> m_splits,
+                                                 std::vector<py::handle> quantizer_list) {
   init_extension();
   using namespace pybind11::literals;  // For operator""_a

@@ -58,25 +58,38 @@ std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor input_view, std::vec
   std::vector<size_t> columnwise_data_sizes;
   std::vector<size_t> columnwise_scale_sizes;
   for (int i = 0; i < num_splits; i++) {
-    std::pair<size_t, size_t> input_view_i_shape = std::make_pair((size_t)m_splits[i], (size_t)hidden_dim);
+    std::pair<size_t, size_t> input_view_i_shape =
+        std::make_pair((size_t)m_splits[i], (size_t)hidden_dim);
     if (rowwise_usage) {
       rowwise_data_shapes.emplace_back(input_view_i_shape);
-      rowwise_scale_shapes.emplace_back(blockwise_quantizers[i]->get_scale_shape({input_view_i_shape.first, input_view_i_shape.second}, false));
-      rowwise_data_sizes.emplace_back(input_view_i_shape.first * input_view_i_shape.second * fp8_elem_size);
-      rowwise_scale_sizes.emplace_back(rowwise_scale_shapes.back().first * rowwise_scale_shapes.back().second * scale_elem_size);
+      rowwise_scale_shapes.emplace_back(blockwise_quantizers[i]->get_scale_shape(
+          {input_view_i_shape.first, input_view_i_shape.second}, false));
+      rowwise_data_sizes.emplace_back(input_view_i_shape.first * input_view_i_shape.second *
+                                      fp8_elem_size);
+      rowwise_scale_sizes.emplace_back(rowwise_scale_shapes.back().first *
+                                       rowwise_scale_shapes.back().second * scale_elem_size);
     }
     if (columnwise_usage) {
-      columnwise_data_shapes.emplace_back(std::make_pair(input_view_i_shape.second, input_view_i_shape.first));
-      columnwise_scale_shapes.emplace_back(blockwise_quantizers[i]->get_scale_shape({input_view_i_shape.first, input_view_i_shape.second}, true));
-      columnwise_data_sizes.emplace_back(input_view_i_shape.first * input_view_i_shape.second * fp8_elem_size);
-      columnwise_scale_sizes.emplace_back(columnwise_scale_shapes.back().first * columnwise_scale_shapes.back().second * scale_elem_size);
+      columnwise_data_shapes.emplace_back(
+          std::make_pair(input_view_i_shape.second, input_view_i_shape.first));
+      columnwise_scale_shapes.emplace_back(blockwise_quantizers[i]->get_scale_shape(
+          {input_view_i_shape.first, input_view_i_shape.second}, true));
+      columnwise_data_sizes.emplace_back(input_view_i_shape.first * input_view_i_shape.second *
+                                         fp8_elem_size);
+      columnwise_scale_sizes.emplace_back(columnwise_scale_shapes.back().first *
+                                          columnwise_scale_shapes.back().second *
+                                          scale_elem_size);
     }
   }

-  size_t total_size_rowwise_data = std::accumulate(rowwise_data_sizes.begin(), rowwise_data_sizes.end(), 0);
-  size_t total_size_rowwise_scale = std::accumulate(rowwise_scale_sizes.begin(), rowwise_scale_sizes.end(), 0);
-  size_t total_size_columnwise_data = std::accumulate(columnwise_data_sizes.begin(), columnwise_data_sizes.end(), 0);
-  size_t total_size_columnwise_scale = std::accumulate(columnwise_scale_sizes.begin(), columnwise_scale_sizes.end(), 0);
+  size_t total_size_rowwise_data =
+      std::accumulate(rowwise_data_sizes.begin(), rowwise_data_sizes.end(), 0);
+  size_t total_size_rowwise_scale =
+      std::accumulate(rowwise_scale_sizes.begin(), rowwise_scale_sizes.end(), 0);
+  size_t total_size_columnwise_data =
+      std::accumulate(columnwise_data_sizes.begin(), columnwise_data_sizes.end(), 0);
+  size_t total_size_columnwise_scale =
+      std::accumulate(columnwise_scale_sizes.begin(), columnwise_scale_sizes.end(), 0);

   size_t total_size_rowwise = total_size_rowwise_data + total_size_rowwise_scale;
   size_t total_size_columnwise = total_size_columnwise_data + total_size_columnwise_scale;
@@ -90,49 +103,67 @@ std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor input_view, std::vec
   at::Tensor columnwise_full_tensor;

   if (rowwise_usage) {
-    rowwise_full_tensor = at::empty({(int64_t)total_size_rowwise}, at::device(input_view.device()).dtype(torch::kUInt8));
+    rowwise_full_tensor = at::empty({(int64_t)total_size_rowwise},
+                                    at::device(input_view.device()).dtype(torch::kUInt8));
     // use raw pointer math + from blob, avoid torch slice to reduce cpu overhead
     uint8_t* rowwise_data_ptr = rowwise_full_tensor.data_ptr<uint8_t>();
-    uint8_t* rowwise_scale_ptr = rowwise_full_tensor.data_ptr<uint8_t>() + total_size_rowwise_data;
+    uint8_t* rowwise_scale_ptr =
+        rowwise_full_tensor.data_ptr<uint8_t>() + total_size_rowwise_data;
     // use from_blob to construct rowwise_data_list and rowwise_scale_list
     for (int i = 0; i < num_splits; i++) {
-      rowwise_data_list.emplace_back(at::from_blob(rowwise_data_ptr, {static_cast<int64_t>(rowwise_data_shapes[i].first), static_cast<int64_t>(rowwise_data_shapes[i].second)}, at::device(input_view.device()).dtype(torch::kUInt8)));
-      rowwise_scale_list.emplace_back(at::from_blob(rowwise_scale_ptr, {static_cast<int64_t>(rowwise_scale_shapes[i].first), static_cast<int64_t>(rowwise_scale_shapes[i].second)}, at::device(input_view.device()).dtype(torch::kFloat32)));
+      rowwise_data_list.emplace_back(
+          at::from_blob(rowwise_data_ptr,
+                        {static_cast<int64_t>(rowwise_data_shapes[i].first),
+                         static_cast<int64_t>(rowwise_data_shapes[i].second)},
+                        at::device(input_view.device()).dtype(torch::kUInt8)));
+      rowwise_scale_list.emplace_back(
+          at::from_blob(rowwise_scale_ptr,
+                        {static_cast<int64_t>(rowwise_scale_shapes[i].first),
+                         static_cast<int64_t>(rowwise_scale_shapes[i].second)},
+                        at::device(input_view.device()).dtype(torch::kFloat32)));
       rowwise_data_ptr += rowwise_data_sizes[i];
       rowwise_scale_ptr += rowwise_scale_sizes[i];
     }
   }

   if (columnwise_usage) {
-    columnwise_full_tensor = at::empty({(int64_t)total_size_columnwise}, at::device(input_view.device()).dtype(torch::kUInt8));
+    columnwise_full_tensor = at::empty({(int64_t)total_size_columnwise},
+                                       at::device(input_view.device()).dtype(torch::kUInt8));
     uint8_t* columnwise_data_ptr = columnwise_full_tensor.data_ptr<uint8_t>();
-    uint8_t* columnwise_scale_ptr = columnwise_full_tensor.data_ptr<uint8_t>() + total_size_columnwise_data;
+    uint8_t* columnwise_scale_ptr =
+        columnwise_full_tensor.data_ptr<uint8_t>() + total_size_columnwise_data;
     for (int i = 0; i < num_splits; i++) {
-      columnwise_data_list.emplace_back(at::from_blob(columnwise_data_ptr, {static_cast<int64_t>(columnwise_data_shapes[i].first), static_cast<int64_t>(columnwise_data_shapes[i].second)}, at::device(input_view.device()).dtype(torch::kUInt8)));
-      columnwise_scale_list.emplace_back(at::from_blob(columnwise_scale_ptr, {static_cast<int64_t>(columnwise_scale_shapes[i].first), static_cast<int64_t>(columnwise_scale_shapes[i].second)}, at::device(input_view.device()).dtype(torch::kFloat32)));
+      columnwise_data_list.emplace_back(
+          at::from_blob(columnwise_data_ptr,
+                        {static_cast<int64_t>(columnwise_data_shapes[i].first),
+                         static_cast<int64_t>(columnwise_data_shapes[i].second)},
+                        at::device(input_view.device()).dtype(torch::kUInt8)));
+      columnwise_scale_list.emplace_back(
+          at::from_blob(columnwise_scale_ptr,
+                        {static_cast<int64_t>(columnwise_scale_shapes[i].first),
+                         static_cast<int64_t>(columnwise_scale_shapes[i].second)},
+                        at::device(input_view.device()).dtype(torch::kFloat32)));
       columnwise_data_ptr += columnwise_data_sizes[i];
       columnwise_scale_ptr += columnwise_scale_sizes[i];
     }
   }
-
-  for (int i = 0; i < num_splits; i++) {

+  for (int i = 0; i < num_splits; i++) {
     py::handle Float8BlockwiseQTensorClass(
-      reinterpret_cast<PyObject*>(Float8BlockwiseQTensorBasePythonClass));
+        reinterpret_cast<PyObject*>(Float8BlockwiseQTensorBasePythonClass));

     // Create the tensor object with proper reference counting
     py::object rowwise_data = rowwise_usage ? py::cast(rowwise_data_list[i]) : py::none();
-    py::object columnwise_data = columnwise_usage ? py::cast(columnwise_data_list[i]) : py::none();
+    py::object columnwise_data =
+        columnwise_usage ? py::cast(columnwise_data_list[i]) : py::none();
     py::object rowwise_scale = rowwise_usage ? py::cast(rowwise_scale_list[i]) : py::none();
-    py::object columnwise_scale = columnwise_usage ? py::cast(columnwise_scale_list[i]) : py::none();
+    py::object columnwise_scale =
+        columnwise_usage ? py::cast(columnwise_scale_list[i]) : py::none();

     py::object ret = Float8BlockwiseQTensorClass(
-        "rowwise_data"_a = rowwise_data,
-        "columnwise_data"_a = columnwise_data,
-        "rowwise_scale_inv"_a = rowwise_scale,
-        "columnwise_scale_inv"_a = columnwise_scale,
-        "fp8_dtype"_a = fp8_dtype,
-        "quantizer"_a = quantizer_list[i],
+        "rowwise_data"_a = rowwise_data, "columnwise_data"_a = columnwise_data,
+        "rowwise_scale_inv"_a = rowwise_scale, "columnwise_scale_inv"_a = columnwise_scale,
+        "fp8_dtype"_a = fp8_dtype, "quantizer"_a = quantizer_list[i],
        "is_2D_scaled"_a = is_2D_scaled);

     output_list.emplace_back(std::move(ret));
@@ -143,41 +174,20 @@ std::vector<py::object> fused_bulk_alloc_outputs(at::Tensor input_view, std::vec

     // put the two full tensor into a python class to maintain their life cycle
     py::object ret = Float8BlockwiseQTensorClass(
-        "rowwise_data"_a = rowwise_full_tensor,
-        "columnwise_data"_a = columnwise_full_tensor,
-        "rowwise_scale_inv"_a = py::none(),
-        "columnwise_scale_inv"_a = py::none(),
-        "fp8_dtype"_a = transformer_engine::DType::kFloat8E4M3, "quantizer"_a = py::none(), "is_2D_scaled"_a = true);
-
+        "rowwise_data"_a = rowwise_full_tensor, "columnwise_data"_a = columnwise_full_tensor,
+        "rowwise_scale_inv"_a = py::none(), "columnwise_scale_inv"_a = py::none(),
+        "fp8_dtype"_a = transformer_engine::DType::kFloat8E4M3, "quantizer"_a = py::none(),
+        "is_2D_scaled"_a = true);
+
     output_list.emplace_back(std::move(ret));

-  }else{
+  } else {
     NVTE_ERROR("Fused bulk alloc is not supported for this quantizer type");
   }

   return output_list;
 }

-py::object simple_sanity_check(at::Tensor input, py::handle quantizer){
-  init_extension();
-  using namespace pybind11::literals;  // For operator""_a
-  py::handle Float8BlockwiseQTensorClass(
-      reinterpret_cast<PyObject*>(Float8BlockwiseQTensorBasePythonClass));
-
-  py::object ret = Float8BlockwiseQTensorClass(
-      "rowwise_data"_a = input,
-      "columnwise_data"_a = input,
-      "rowwise_scale_inv"_a = input,
-      "columnwise_scale_inv"_a = input,
-      "fp8_dtype"_a = transformer_engine::DType::kFloat8E4M3, "quantizer"_a = quantizer, "is_2D_scaled"_a = true);
-
-  // py::handle Float8TensorClass(reinterpret_cast<PyObject*>(Float8TensorBasePythonClass));
-  // py::object ret = Float8TensorClass("data"_a = py::none(), "fp8_scale_inv"_a = py::none(),
-  //     "fp8_dtype"_a = transformer_engine::DType::kFloat8E4M3, "data_transpose"_a = py::none(),
-  //     "quantizer"_a = py::none());
-  return ret;
-}
-
 std::vector<py::object> fused_multi_quantize(std::vector<at::Tensor> input_list,
                                              std::optional<std::vector<py::object>> output_list,
                                              std::vector<py::handle> quantizer_list,
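
The cleanup does not change what this file does: for blockwise quantizers it computes every split's data and scale byte counts up front, makes one flat uint8 allocation per direction (all data bytes, then all scale bytes), carves the per-split tensors out of it with raw pointer offsets plus at::from_blob, and finally appends a Python-side object that keeps the backing buffers alive. A rough Python analogue of the rowwise layout, for illustration only (the C++ code deliberately avoids torch slicing because of its CPU overhead, and the scale shapes passed in below are made-up examples):

import torch

def bulk_alloc_rowwise(m_splits, hidden_dim, scale_shapes, device="cuda"):
    # One byte per fp8 element, four bytes per fp32 scale.
    data_sizes = [m * hidden_dim for m in m_splits]
    scale_sizes = [4 * r * c for r, c in scale_shapes]
    full = torch.empty(sum(data_sizes) + sum(scale_sizes), dtype=torch.uint8, device=device)

    data_views, off = [], 0
    for m, size in zip(m_splits, data_sizes):
        data_views.append(full[off:off + size].view(m, hidden_dim))
        off += size
    scale_views = []
    for (r, c), size in zip(scale_shapes, scale_sizes):
        # Reinterpreting uint8 bytes as fp32 needs a 4-byte-aligned offset,
        # which holds whenever the data region size is a multiple of 4.
        assert off % 4 == 0
        scale_views.append(full[off:off + size].view(torch.float32).view(r, c))
        off += size
    # `full` must stay alive alongside the views; this is the role the trailing
    # Float8BlockwiseQTensor plays in the C++ code.
    return full, data_views, scale_views

full, data, scales = bulk_alloc_rowwise([256] * 4, 4096, [(2, 32)] * 4)

For four splits this turns eight rowwise allocations (four data, four scale) into a single one per direction, which is where the CPU-overhead saving noted in the in-code comments comes from.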
