[Pytorch Edge] Enable kineto profiler on mobile via EdgeKinetoProfiler (pytorch#62419)

Summary:
Pull Request resolved: pytorch#62419

This diff adds support for a CPU-only Kineto profiler on mobile, enabling
Chrome trace generation on mobile. This brings the C++ API for mobile
profiling on par with TorchScript.
This is done via:
1. Utilizing debug handle annotations in KinetoEvent.
2. Adding post-processing capability, via callbacks, to
KinetoThreadLocalState.
3. Creating a new RAII-style profiler, KinetoEdgeCPUProfiler, which can be
used in the surrounding scope of model execution. This will write the
Chrome trace to the location specified in the profiler constructor.
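
As an illustration of the new API, here is a minimal usage sketch of
KinetoEdgeCPUProfiler. The model path and input shape are placeholders, and
the constructor (declared in torch/csrc/jit/mobile/profiler_edge.h) takes
further boolean options that are left at their defaults here:

#include <torch/csrc/jit/mobile/import.h>
#include <torch/csrc/jit/mobile/profiler_edge.h>
#include <torch/torch.h>

void profile_one_inference() {
  // Placeholder path: any lite-interpreter (.ptl) module works here.
  auto module = torch::jit::_load_for_mobile("model.ptl");
  {
    // RAII: profiling covers this scope; when the profiler is destroyed
    // it writes a chrome trace to the path given in the constructor.
    torch::jit::mobile::KinetoEdgeCPUProfiler profiler(
        module, "/tmp/trace.json");
    module.forward({torch::ones({1, 3, 224, 224})});
  }
  // /tmp/trace.json can now be opened in chrome://tracing.
}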

Test Plan:
MobileProfiler.ModuleHierarchy

Imported from OSS

Reviewed By: raziel

Differential Revision: D29993660

fbshipit-source-id: 0b44f52f9e9c5f5aff81ebbd9273c254c3c03299
kimishpatel authored and facebook-github-bot committed Aug 14, 2021
1 parent 77a6436 commit 38c1851
Showing 24 changed files with 406 additions and 72 deletions.
1 change: 1 addition & 0 deletions .jenkins/pytorch/build.sh
@@ -130,6 +130,7 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
if [[ "${BUILD_ENVIRONMENT}" == *vulkan* ]]; then
build_args+=("-DUSE_VULKAN=ON")
fi
build_args+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")
exec ./scripts/build_android.sh "${build_args[@]}" "$@"
fi

5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -266,6 +266,7 @@ if(NOT DEFINED USE_VULKAN)
endif()

option(USE_SOURCE_DEBUG_ON_MOBILE "Enable source-level debug info in mobile builds" ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable the lite interpreter profiler" ON)
option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF)
option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF)
option(USE_VULKAN_SHADERC_RUNTIME "Vulkan - Use runtime shader compilation as opposed to build-time (needs libshaderc)" OFF)
@@ -687,6 +688,10 @@ if(USE_SOURCE_DEBUG_ON_MOBILE)
string(APPEND CMAKE_CXX_FLAGS " -DSYMBOLICATE_MOBILE_DEBUG_HANDLE")
endif()

if(USE_LITE_INTERPRETER_PROFILER)
string(APPEND CMAKE_CXX_FLAGS " -DEDGE_PROFILER_USE_KINETO")
endif()

# ---[ Allowlist file if allowlist is specified
include(cmake/Allowlist.cmake)

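The flag above only appends a compile definition; C++ code then guards the
profiler behind it. A short sketch of that pattern (the wrapper function is
illustrative, not part of this diff):

#include <torch/csrc/jit/mobile/module.h>
#if defined(EDGE_PROFILER_USE_KINETO)
#include <torch/csrc/jit/mobile/profiler_edge.h>
#endif

void run_with_optional_profiling(torch::jit::mobile::Module& module) {
#if defined(EDGE_PROFILER_USE_KINETO)
  // Compiled in only when USE_LITE_INTERPRETER_PROFILER=ON defined
  // EDGE_PROFILER_USE_KINETO.
  torch::jit::mobile::KinetoEdgeCPUProfiler profiler(module, "/tmp/trace.json");
#endif
  module.forward({});  // placeholder: a module taking no inputs
}
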
5 changes: 3 additions & 2 deletions android/common.sh
@@ -29,7 +29,7 @@ check_gradle() {
}

parse_abis_list() {
ABIS_LIST="armeabi-v7a,arm64-v8a,x86,x86_64"
ABIS_LIST="x86"
CUSTOM_ABIS_LIST=false
if [ $# -gt 0 ]; then
ABIS_LIST=$1
@@ -59,7 +59,8 @@ build_android() {
ANDROID_ABI="$abi" \
BUILD_ROOT="$ANDROID_BUILD_ROOT" \
"$PYTORCH_DIR/scripts/build_android.sh" \
-DANDROID_CCACHE="$(which ccache)"
-DANDROID_CCACHE="$(which ccache)" \
-DUSE_LITE_INTERPRETER_PROFILER="OFF"

echo "$abi build output lib,include at $ANDROID_BUILD_ROOT/install"
ln -s "$ANDROID_BUILD_ROOT/install/lib" "$LIB_DIR/$abi"
4 changes: 2 additions & 2 deletions android/pytorch_android/build.gradle
@@ -18,9 +18,9 @@ android {
externalNativeBuild {
cmake {
if(System.env.BUILD_LITE_INTERPRETER == '0') {
arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF"
arguments "-DANDROID_STL=c++_shared", "-DBUILD_LITE_INTERPRETER=OFF", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
} else {
arguments "-DANDROID_STL=c++_shared"
arguments "-DANDROID_STL=c++_shared", "-DUSE_LITE_INTERPRETER_PROFILER=OFF"
}
}
}
12 changes: 7 additions & 5 deletions aten/src/ATen/record_function.h
@@ -27,6 +27,8 @@ enum class C10_API_ENUM RecordScope : uint8_t {
TORCHSCRIPT_FUNCTION,
// Kernel Function dtype Tag
KERNEL_FUNCTION_DTYPE,
// Scope for lite interpreter runtime events
LITE_INTERPRETER,
// User defined scope (e.g. with record_function())
USER_SCOPE,
NUM_SCOPES, // must be the last in the list
@@ -502,11 +504,11 @@ class TORCH_API RecordFunctionCallback {
} \
}

// Helper macros to record user_scope events with debug handles
#define RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
fn, debug_handle, inputs) \
RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \
at::RecordScope::USER_SCOPE, fn, debug_handle, inputs)
// Helper macros to record LITE INTERPRETER scope events with debug handles
#define RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS( \
fn, debug_handle, inputs) \
RECORD_WITH_SCOPE_DEBUG_HANDLE_AND_INPUTS( \
at::RecordScope::LITE_INTERPRETER, fn, debug_handle, inputs)

// Notes:
// - two types of callbacks are provided: thread local and global
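
For reference, the renamed macro is exercised later in this diff
(test/cpp/jit/test_misc.cpp); the event name and debug handle are arbitrary
test values:

// Emits a LITE_INTERPRETER-scope event named "my_function" with debug
// handle 42 and no inputs; the handle can later be read back from the
// matching KinetoEvent via debugHandle().
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
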
7 changes: 7 additions & 0 deletions caffe2/CMakeLists.txt
@@ -485,10 +485,17 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
endif()
endif()

list(APPEND LITE_PROFILER_SRCS "")
if(USE_LITE_INTERPRETER_PROFILER)
append_filelist("libtorch_edge_profiler_sources" LITE_PROFILER_SRCS)
endif()

# Switch between the full jit interpreter and lite interpreter
if(BUILD_LITE_INTERPRETER)
append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS)
list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS})
list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS})
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
else()
append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS)

14 changes: 13 additions & 1 deletion cmake/Dependencies.cmake
@@ -1568,6 +1568,11 @@ endif()
# --[ ATen checks
set(USE_LAPACK 0)

# We need to build all targets with PIC so they can be linked together
if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER)
set(CMAKE_POSITION_INDEPENDENT_CODE TRUE)
endif()

if(NOT INTERN_BUILD_MOBILE)
set(TORCH_CUDA_ARCH_LIST $ENV{TORCH_CUDA_ARCH_LIST})
set(TORCH_NVCC_FLAGS $ENV{TORCH_NVCC_FLAGS})
@@ -1876,11 +1881,17 @@ list(APPEND Caffe2_DEPENDENCY_LIBS fmt::fmt-header-only)
set(BUILD_SHARED_LIBS ${TEMP_BUILD_SHARED_LIBS} CACHE BOOL "Build shared libs" FORCE)

# ---[ Kineto
if(USE_KINETO AND INTERN_BUILD_MOBILE)
# The edge profiler depends on the Kineto profiler but only does CPU
# profiling, so USE_CUDA/USE_ROCM are not needed.
if(USE_KINETO AND INTERN_BUILD_MOBILE AND NOT (BUILD_LITE_INTERPRETER AND USE_LITE_INTERPRETER_PROFILER))
message(STATUS "Not using libkineto in a mobile build.")
set(USE_KINETO OFF)
endif()

if(USE_KINETO AND INTERN_BUILD_MOBILE AND USE_LITE_INTERPRETER_PROFILER AND (USE_CUDA OR USE_ROCM))
message(FATAL_ERROR "Mobile build with profiler does not support CUDA or ROCM")
endif()

if(USE_KINETO)
if((NOT USE_CUDA) OR MSVC)
set(LIBKINETO_NOCUPTI ON CACHE STRING "" FORCE)
@@ -1956,6 +1967,7 @@ if(USE_KINETO)

if(NOT TARGET kineto)
add_subdirectory("${KINETO_SOURCE_DIR}")
set_property(TARGET kineto PROPERTY POSITION_INDEPENDENT_CODE ON)
endif()
list(APPEND Caffe2_DEPENDENCY_LIBS kineto)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_KINETO")
1 change: 1 addition & 0 deletions scripts/build_ios.sh
@@ -83,6 +83,7 @@ if [ "${BUILD_LITE_INTERPRETER}" == 0 ]; then
else
CMAKE_ARGS+=("-DBUILD_LITE_INTERPRETER=ON")
fi
CMAKE_ARGS+=("-DUSE_LITE_INTERPRETER_PROFILER=OFF")

# Don't build binaries or tests (only the library)
CMAKE_ARGS+=("-DBUILD_TEST=OFF")
30 changes: 15 additions & 15 deletions test/cpp/jit/test_backend.cpp
@@ -338,16 +338,16 @@ TEST(BackendTestDebugInfo, TestCompiler) {
lm._save_for_mobile(ss, ExtraFilesMap(), true);
auto mlm = _load_for_mobile(ss);
std::string error_pattern = R"(
Module hierarchy:top(m).aten::add
Module hierarchy:top(m)::<unknown>.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 5, in FunctionName_UNKNOWN
File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, h, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, h):
return x + h
@@ -392,16 +392,16 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithModuleHierarchy) {
lm._save_for_mobile(ss, ExtraFilesMap(), true);
auto mlm = _load_for_mobile(ss);
std::string error_pattern = R"(
Module hierarchy:top(C).A0(A).aten::add
Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 5, in FunctionName_UNKNOWN
File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, y, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
@@ -485,16 +485,16 @@ TEST(
*
*/
std::string error_pattern = R"(
Module hierarchy:top(C).B0(B).A0(A).aten::add
Module hierarchy:top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 5, in FunctionName_UNKNOWN
File "<string>", line 5, in <unknown>
typed_inputs: List[Any] = [x, y, ]
if self.__backend.is_available() :
_0, = self.__backend.execute(self.__handles["forward"], typed_inputs)
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.B0.forward(x, y) + 3
@@ -572,9 +572,9 @@ TEST(BackendTestDebugInfo, TestExceptionStackForCompilerWithLoweredSubModule) {
c._save_for_mobile(ss, ExtraFilesMap(), true);
auto c_loaded = _load_for_mobile(ss);
std::string error_pattern = R"(
Module hierarchy:top(C).A0(A).aten::add
Module hierarchy:top(C)::<unknown>.A0(A)::forward.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
@@ -587,7 +587,7 @@ Traceback of TorchScript (most recent call last):
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return x + y
@@ -693,9 +693,9 @@ TEST(
*
* */
std::string error_pattern = R"(
Module hierarchy:top(C).A0(A).AA0(AA).aten::add
Module hierarchy:top(C)::<unknown>.A0(A)::forward.AA0(AA)::forward.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.A0.forward(x, y) + self.B0.forward(x)
@@ -708,7 +708,7 @@ Traceback of TorchScript (most recent call last):
~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
assert isinstance(_0, Tensor)
return _0
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.AA0.forward(x, y) + 3
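
The updated expectations reflect the new module-hierarchy string format:
each frame is instance(Type)::method, and <unknown> marks a method name that
was not recorded. An annotated reading of one expectation:

// top(C)::<unknown> . B0(B)::forward . A0(A)::forward . aten::add
//   root module C      call into B0     nested call      failing op
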
46 changes: 28 additions & 18 deletions test/cpp/jit/test_lite_interpreter.cpp
@@ -482,7 +482,7 @@ TEST(LiteInterpreterTest, ModuleInfoBasic) {
}
}

AT_ASSERT(module_debug_info_set.count("top(M).aten::mul"));
AT_ASSERT(module_debug_info_set.count("top(M)::<unknown>.aten::mul"));
}

TEST(LiteInterpreterTest, NotSaveModuleInfo) {
@@ -542,9 +542,11 @@ TEST(LiteInterpreterTest, OneSubmoduleModuleInfo) {
}
}

AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::mul"));
AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(B)::<unknown>.A0(A)::forward.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(B)::<unknown>.A0(A)::forward.aten::mul"));
}

TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
@@ -585,9 +587,11 @@ TEST(LiteInterpreterTest, TwoSubmodulesModuleInfo) {
}
}

AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.A0(A)::forward.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.B0(B)::forward.aten::add"));
}

TEST(LiteInterpreterTest, GetRuntimeByteCodeVersion) {
@@ -854,9 +858,11 @@ TEST(LiteInterpreterTest, SequentialModuleInfo) {
// def forward(self, x):
// return self.A0.forward(self.B0.forward(x))

AT_ASSERT(module_debug_info_set.count("top(C).prim::Return"));
AT_ASSERT(module_debug_info_set.count("top(C).A0(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.prim::Return"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.A0(A)::forward.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.B0(B)::forward.aten::add"));
}

TEST(LiteInterpreterTest, HierarchyModuleInfo) {
@@ -901,9 +907,11 @@ TEST(LiteInterpreterTest, HierarchyModuleInfo) {
// "top(C).forward": for the add operator in top.
// "top(C).B0(B).forward": for the add operator in B0.
// "top(C).B0(B).forward.A0(A).forward": for the add operator in A0.
AT_ASSERT(module_debug_info_set.count("top(C).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C).B0(B).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C).B0(B).A0(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(C)::<unknown>.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.B0(B)::forward.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(C)::<unknown>.B0(B)::forward.A0(A)::forward.aten::add"));
}

TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
@@ -960,9 +968,11 @@ TEST(LiteInterpreterTest, DuplicatedClassTypeModuleInfo) {
// "top(B).A0(A).forward": for the add operator in A0.
// "top(B).A1(A).forward": for the add operator in A1.

AT_ASSERT(module_debug_info_set.count("top(B).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(B).A0(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(B).A1(A).aten::add"));
AT_ASSERT(module_debug_info_set.count("top(B)::<unknown>.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(B)::<unknown>.A0(A)::forward.aten::add"));
AT_ASSERT(module_debug_info_set.count(
"top(B)::<unknown>.A1(A)::forward.aten::add"));
}
#endif // !defined(FB_XPLAT_BUILD)

@@ -1371,9 +1381,9 @@ TEST(LiteInterpreterTest, TestExceptionStackWithTwoLevelModuleHierarchy) {
c._save_for_mobile(ss, ExtraFilesMap(), true);
auto lite_m = _load_for_mobile(ss);
std::string error_pattern = R"(
Module hierarchy:top(C).B0(B).A0(A).aten::add
Module hierarchy:top(C)::<unknown>.B0(B)::foo.A0(A)::bar.aten::add
Traceback of TorchScript (most recent call last):
File "<string>", line 3, in FunctionName_UNKNOWN
File "<string>", line 3, in <unknown>
def forward(self, x, y):
return self.B0.foo(x, y) + 3
12 changes: 5 additions & 7 deletions test/cpp/jit/test_misc.cpp
@@ -2481,7 +2481,7 @@ TEST(RecordDebugHandles, Basic) {
torch::autograd::profiler::ProfilerState::KINETO, false, false),
activities);
{
RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
float x{5.9999}, y{2.1212};
float z = x / y;
}
@@ -2533,7 +2533,7 @@ TEST(RecordDebugHandles, ScopedCallbacks) {
torch::autograd::profiler::ProfilerConfig(
torch::autograd::profiler::ProfilerState::KINETO, false, false),
{torch::autograd::profiler::ActivityType::CPU},
{at::RecordScope::USER_SCOPE});
{at::RecordScope::LITE_INTERPRETER});
{
auto a = torch::rand({128, 128});
auto b = torch::rand({128, 128});
@@ -2550,9 +2550,9 @@
torch::autograd::profiler::ProfilerConfig(
torch::autograd::profiler::ProfilerState::KINETO, false, false),
{torch::autograd::profiler::ActivityType::CPU},
{at::RecordScope::USER_SCOPE});
{at::RecordScope::LITE_INTERPRETER});
{
RECORD_USER_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS("my_function", 42, {});
auto a = torch::rand({128, 128});
auto b = torch::rand({128, 128});
auto c = a + b;
@@ -2568,11 +2568,9 @@
for (const auto& e : kineto_events) {
if (e.name() == "my_function") {
ASSERT_EQ(e.debugHandle(), 42);
} else if (e.name() == "not_my_function") {
ASSERT_EQ(e.debugHandle(), -1);
}
}
ASSERT_TRUE(profiler_results_ptr->events().size() == 2);
ASSERT_TRUE(profiler_results_ptr->events().size() == 1);
}

TEST(IValueKWargsTest, Basic) {
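
Tying it together, here is a sketch, patterned on the test above, of reading
debug handles back out of the profiler results; the names follow the diff's
own test code:

auto profiler_results_ptr = torch::autograd::profiler::disableProfiler();
const auto& kineto_events = profiler_results_ptr->events();
for (const auto& e : kineto_events) {
  // Events recorded via RECORD_EDGE_SCOPE_WITH_DEBUG_HANDLE_AND_INPUTS
  // carry their debug handle; the edge profiler's post-processing callback
  // (registered on KinetoThreadLocalState) maps handles back to module
  // hierarchy and source information.
  if (e.name() == "my_function") {
    TORCH_CHECK(e.debugHandle() == 42);
  }
}
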
1 change: 1 addition & 0 deletions test/cpp/lite_interpreter_runtime/CMakeLists.txt
@@ -4,6 +4,7 @@ set(
set(LITE_INTERPRETER_RUNTIME_TEST_DIR
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/main.cpp
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_lite_interpreter_runtime.cpp
${TORCH_ROOT}/test/cpp/lite_interpreter_runtime/test_mobile_profiler.cpp
)

add_library(backend_with_compiler_runtime SHARED