Support tensor stride
Summary:
X-link: facebookresearch/param#126

Support tensor strides in the execution trace.

Test Plan: buck2 test mode/opt caffe2/test:test_profiler_cuda profiler.test_execution_trace.TestExecutionTrace

Differential Revision: D58900476
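
For orientation, here is a hypothetical node as the observer would emit it under the bumped 1.1.1-chakra.0.0.4 schema. All ids and values below are invented for illustration and the attrs list is abbreviated; the point is the new "strides" entries alongside "shapes" and "types" in both "inputs" and "outputs". Each tensor value is encoded as [tensor_id, storage_id, offset, numel, itemsize, "device"], so a contiguous 2x3 float32 CUDA tensor has shape [2, 3] and strides [3, 1]:

{
"id": 42, "name": "aten::add", "ctrl_deps": 41,
"inputs": {"values": [[3, 2, 0, 6, 4, "cuda:0"]], "shapes": [[2, 3]], "types": ["Tensor(float)"], "strides": [[3, 1]]},
"outputs": {"values": [[5, 4, 0, 6, 4, "cuda:0"]], "shapes": [[2, 3]], "types": ["Tensor(float)"], "strides": [[3, 1]]},
"attrs": [{"name": "rf_id", "type": "uint64", "value": 7}]
}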
shengfukevin authored and facebook-github-bot committed Jun 22, 2024
1 parent 5b14943 commit 7d121ef
Showing 1 changed file with 94 additions and 95 deletions.

torch/csrc/profiler/standalone/execution_trace_observer.cpp
@@ -64,71 +64,6 @@ std::string json_str_escape(const std::string& str);
constexpr size_t maxNumElements = 4096;
constexpr size_t maxStrLength = 8192;

inline std::string getValueType(
const c10::IValue& val,
const bool baseType = true,
const size_t maxArrayLen = maxNumElements) {
std::string type = val.tagKind();

if (val.isTensor()) {
// Add tensor element data type.
type += fmt::format("({})", std::string(val.toTensor().dtype().name()));
} else if (val.isTuple()) {
const auto& val_container = val.toTupleRef().elements();
std::vector<std::string> str_array;
for (const auto& t : val_container) {
str_array.emplace_back(getValueType(t, false));
}
type += vectorToString(str_array);
} else if (val.isList()) {
const auto& val_list = val.toList();
std::vector<std::string> str_array;
str_array.reserve(val_list.size());
for (const auto j : c10::irange(val_list.size())) {
str_array.push_back(getValueType(val_list.get(j), false));
if (j >= maxArrayLen) {
LOG(WARNING) << "list size=" << val_list.size()
<< " exceeded maxArrayLen=" << maxArrayLen;
break;
}
}
type += vectorToString(str_array);
}
return baseType ? fmt::format("\"{}\"", type) : type;
}

inline std::string getValueShape(
const c10::IValue& val,
const size_t maxArrayLen = maxNumElements) {
if (val.isTensor()) {
auto& tensor = val.toTensor();
if (tensor.defined() &&
!tensor.unsafeGetTensorImpl()->has_symbolic_sizes_strides()) {
return vectorToString(tensor.sizes().vec());
}
} else if (val.isTuple()) {
const auto& val_container = val.toTupleRef().elements();
std::vector<std::string> str_array;
for (const auto& t : val_container) {
str_array.push_back(getValueShape(t));
}
return vectorToString(str_array);
} else if (val.isList()) {
const auto& val_list = val.toList();
std::vector<std::string> str_array;
str_array.reserve(val_list.size());
for (const auto j : c10::irange(val_list.size())) {
str_array.push_back(getValueShape(val_list.get(j)));
if (j >= maxArrayLen) {
LOG(WARNING) << "list size=" << val_list.size()
<< " exceeded maxArrayLen=" << maxArrayLen;
break;
}
}
return vectorToString(str_array);
}
return "[]";
}

inline std::string getScalarValue(const c10::IValue& val) {
if (val.isDouble()) {
@@ -259,6 +194,7 @@ struct FunctionCallContext : public ObserverContext {
ExecutionTraceObserver::ID parent_id{uninitialized_id};
ExecutionTraceObserver::ID fw_parent_id{uninitialized_id};
std::vector<std::string> input_types;
std::vector<std::string> input_strides;
std::vector<std::string> input_shapes;
std::vector<std::string> input_values;
};
@@ -303,9 +239,11 @@ static void writeJsonNode(
const uint64_t fw_tid,
const std::string& inputs = "[]",
const std::string& input_shapes = "[]",
const std::string& input_strides = "[]",
const std::string& input_types = "[]",
const std::string& outputs = "[]",
const std::string& output_shapes = "[]",
const std::string& output_strides = "[]",
const std::string& output_types = "[]",
const std::string& operator_schema = "",
const std::string& kernel_backend = "",
@@ -315,8 +253,8 @@
R"JSON(
{{
"id": {}, "name": "{}", "ctrl_deps": {},
"inputs": {{"values": {}, "shapes": {}, "types": {}}},
"outputs": {{"values": {}, "shapes": {}, "types": {}}},
"inputs": {{"values": {}, "shapes": {}, "types": {}, "strides": {}}},
"outputs": {{"values": {}, "shapes": {}, "types": {}, "strides": {}}},
"attrs": [{{"name": "rf_id", "type": "uint64", "value": {}}},{{"name": "fw_parent", "type": "uint64", "value": {}}},{{"name": "seq_id", "type": "int64", "value": {}}},{{"name": "scope", "type": "uint64", "value": {}}},{{"name": "tid", "type": "uint64", "value": {}}},{{"name": "fw_tid", "type": "uint64", "value": {}}},{{"name": "op_schema", "type": "string", "value": "{}"}},{{"name": "kernel_backend", "type": "string", "value": "{}"}},{{"name": "kernel_file", "type": "string", "value": "{}"}}{}]
}})JSON",
id,
@@ -325,9 +263,11 @@
inputs,
input_shapes,
input_types,
input_strides,
outputs,
output_shapes,
output_types,
output_strides,
rf_id,
fw_parent,
seq_id,
@@ -366,7 +306,7 @@ static bool initExecutionTraceStart(ExecutionTraceObserver& ob) {

ob.out << fmt::format(
R"JSON({{
"schema": "1.1.0-chakra.0.0.4", "pid": {}, "time": "{}", "start_ts": {},
"schema": "1.1.1-chakra.0.0.4", "pid": {}, "time": "{}", "start_ts": {},
"nodes": [)JSON",
ob.pid,
ob.record_time,
@@ -417,69 +357,125 @@ inline ExecutionTraceObserver::ID getObjectID(
return iter->second;
}

inline std::string convertIValue(

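// Returns JSON fragments for val as a (shape, stride, type, value) tuple.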
inline std::tuple<std::string, std::string, std::string, std::string> convertIValue(
ExecutionTraceObserver& ob,
const c10::IValue& val,
const bool baseType = true,
const size_t maxArrayLen = maxNumElements) {
std::string type = val.tagKind();
if (val.isTensor()) {
const auto t = val.toTensor().unsafeGetTensorImpl();
ExecutionTraceObserver::ID tensor_id = getObjectID(ob, t);
std::string tensor_shape, tensor_stride, tensor_type, tensor_value;

const auto tensor = val.toTensor();
const auto tensor_impl = tensor.unsafeGetTensorImpl();
if (tensor.defined() && !tensor_impl->has_symbolic_sizes_strides()) {
// tensor shape
tensor_shape = vectorToString(tensor.sizes().vec());
// tensor strides
tensor_stride = vectorToString(tensor.strides().vec());
} else {
tensor_shape = "[]";
tensor_stride = "[]";
}
// tensor dtype
type += fmt::format("({})", std::string(tensor.dtype().name()));
tensor_type = baseType ? fmt::format("\"{}\"", type) : type;

ExecutionTraceObserver::ID tensor_id = getObjectID(ob, tensor_impl);
ExecutionTraceObserver::ID storage_id = 0;
size_t offset = 0;
size_t numel = 0;
size_t itemsize = 0;
std::string device_str = "";
// symbolic sizes/strides implies storage_offset() will fail
if (t->has_storage() && !t->has_symbolic_sizes_strides()) {
auto& t_storage = t->storage();
if (tensor_impl->has_storage() && !tensor_impl->has_symbolic_sizes_strides()) {
auto& t_storage = tensor_impl->storage();
storage_id = getObjectID(ob, t_storage.data());
offset = t->storage_offset();
numel = t->numel();
itemsize = t->itemsize();
device_str = t->device().str();
offset = tensor_impl->storage_offset();
numel = tensor_impl->numel();
itemsize = tensor_impl->itemsize();
device_str = tensor_impl->device().str();
}
return fmt::format(
tensor_value = fmt::format(
"[{},{},{},{},{},\"{}\"]",
tensor_id,
storage_id,
offset,
numel,
itemsize,
device_str);
return std::make_tuple(tensor_shape, tensor_stride, tensor_type, tensor_value);
} else if (val.isTuple()) {
std::vector<std::string> str_array;
const auto& val_tuple = val.toTupleRef().elements();
for (const auto j : c10::irange(val_tuple.size())) {
str_array.push_back(convertIValue(ob, val_tuple[j]));
size_t tuple_size = val_tuple.size();
std::vector<std::string> shape_array;
std::vector<std::string> stride_array;
std::vector<std::string> type_array;
std::vector<std::string> value_array;
for (const auto j : c10::irange(tuple_size)) {
auto tuple = convertIValue(ob, val_tuple[j], false);
shape_array.push_back(std::get<0>(tuple));
stride_array.push_back(std::get<1>(tuple));
type_array.push_back(std::get<2>(tuple));
value_array.push_back(std::get<3>(tuple));
}
return vectorToString(str_array);
type += vectorToString(type_array);
std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type;
return std::make_tuple(vectorToString(shape_array),
vectorToString(stride_array),
tensor_type,
vectorToString(value_array));
} else if (val.isList()) {
const auto& val_list = val.toList();
std::vector<std::string> str_array;
str_array.reserve(val_list.size());
for (const auto j : c10::irange(val_list.size())) {
str_array.push_back(convertIValue(ob, val_list.get(j)));
size_t list_size = val_list.size();
std::vector<std::string> shape_array;
std::vector<std::string> stride_array;
std::vector<std::string> type_array;
std::vector<std::string> value_array;
for (const auto j : c10::irange(list_size)) {
auto tuple = convertIValue(ob, val_list.get(j), false);
shape_array.push_back(std::get<0>(tuple));
stride_array.push_back(std::get<1>(tuple));
type_array.push_back(std::get<2>(tuple));
value_array.push_back(std::get<3>(tuple));
if (j >= maxArrayLen) {
LOG(WARNING) << "list size=" << val_list.size()
<< " exceeded maxArrayLen=" << maxArrayLen;
break;
}
}
return vectorToString(str_array);
type += vectorToString(type_array);
std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type;
return std::make_tuple(vectorToString(shape_array),
vectorToString(stride_array),
tensor_type,
vectorToString(value_array));
} else {
return getScalarValue(val);
std::string tensor_shape = "[]";
std::string tensor_stride = "[]";
std::string tensor_type = baseType ? fmt::format("\"{}\"", type) : type;
std::string tensor_value = getScalarValue(val);

return std::make_tuple(tensor_shape, tensor_stride, tensor_type, tensor_value);
}
}

inline void appendValueInfo(
ExecutionTraceObserver& ob,
const c10::IValue& val,
std::vector<std::string>& values,
std::vector<std::string>& shapes,
std::vector<std::string>& strides,
std::vector<std::string>& types,
std::vector<std::string>& shapes) {
values.push_back(convertIValue(ob, val));
types.push_back(getValueType(val));
shapes.push_back(getValueShape(val));
std::vector<std::string>& values) {
auto tuple = convertIValue(ob, val, true);

shapes.push_back(std::get<0>(tuple));
strides.push_back(std::get<1>(tuple));
types.push_back(std::get<2>(tuple));
values.push_back(std::get<3>(tuple));
}

inline void handleKernelBackendInfo(
@@ -617,7 +613,7 @@ static void recordOperatorStart(

for (const auto i : c10::irange(input_start, inputs.size())) {
appendValueInfo(
ob, inputs[i], fc.input_values, fc.input_types, fc.input_shapes);
ob, inputs[i], fc.input_shapes, fc.input_strides, fc.input_types, fc.input_values);
}

handleKernelBackendInfo(fc, fn);
@@ -711,6 +707,7 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) {
size_t output_start = outputs.size() - num_outputs;

std::vector<std::string> output_types;
std::vector<std::string> output_strides;
std::vector<std::string> output_shapes;
std::vector<std::string> output_values;
try {
@@ -720,7 +717,7 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) {
ob->op_stack[fn.threadId()].pop();
for (const auto i : c10::irange(output_start, outputs.size())) {
appendValueInfo(
*ob, outputs[i], output_values, output_types, output_shapes);
*ob, outputs[i], output_shapes, output_strides, output_types, output_values);
}

std::string op_schema_str{};
@@ -745,9 +742,11 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) {
fn.forwardThreadId(),
vectorToString(fc.input_values),
vectorToString(fc.input_shapes),
vectorToString(fc.input_strides),
vectorToString(fc.input_types),
vectorToString(output_values),
vectorToString(output_shapes),
vectorToString(output_strides),
vectorToString(output_types),
op_schema_str,
fc.kernel_backend,
