Fix Python Cuda loading issues #7939

Merged: 16 commits on Jun 25, 2021
46 changes: 26 additions & 20 deletions onnxruntime/core/framework/provider_bridge_ort.cc
@@ -117,9 +117,8 @@ using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef;

namespace onnxruntime {

//ProviderHost* g_host{};

ProviderInfo_CUDA* GetProviderInfo_CUDA();
ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
ProviderInfo_CUDA& GetProviderInfo_CUDA();

struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator {
TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator<const onnx::TensorShapeProto_Dimension>&& v) : v_{std::move(v)} {}
@@ -195,15 +194,15 @@ struct ProviderHostImpl : ProviderHost {
void CPUAllocator__Free(CPUAllocator* p, void* allocation) override { return p->CPUAllocator::Free(allocation); }

#ifdef USE_CUDA
std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAAllocator(device_id, name); }
std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA()->CreateCUDAPinnedAllocator(device_id, name); }
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA()->CreateGPUDataTransfer(stream); }
std::unique_ptr<IAllocator> CreateCUDAAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAAllocator(device_id, name); }
std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_CUDA().CreateCUDAPinnedAllocator(device_id, name); }
std::unique_ptr<IDataTransfer> CreateGPUDataTransfer(void* stream) override { return GetProviderInfo_CUDA().CreateGPUDataTransfer(stream); }

void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA()->cuda__Impl_Cast(stream, input_data, output_data, count); }
void cuda__Impl_Cast(void* stream, const int64_t* input_data, int32_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }
void cuda__Impl_Cast(void* stream, const int32_t* input_data, int64_t* output_data, size_t count) override { return GetProviderInfo_CUDA().cuda__Impl_Cast(stream, input_data, output_data, count); }

bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_false(retCode, exprString, libName, successCode, msg); }
bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA()->CudaCall_true(retCode, exprString, libName, successCode, msg); }
bool CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_false(retCode, exprString, libName, successCode, msg); }
bool CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg) override { return GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg); }
#endif

std::string GetEnvironmentVar(const std::string& var_name) override { return Env::Default().GetEnvironmentVar(var_name); }
@@ -1048,7 +1047,7 @@ void UnloadSharedProviders() {

// Used by test code
std::unique_ptr<IAllocator> CreateCUDAPinnedAllocator(int16_t device_id, const char* name) {
if (auto* info = onnxruntime::GetProviderInfo_CUDA())
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
return info->CreateCUDAPinnedAllocator(device_id, name);

return nullptr;
@@ -1095,10 +1094,17 @@ ProviderInfo_OpenVINO* GetProviderInfo_OpenVINO() {
return nullptr;
}

ProviderInfo_CUDA* GetProviderInfo_CUDA() {
ProviderInfo_CUDA* TryGetProviderInfo_CUDA() {
if (auto* provider = s_library_cuda.Get())
return reinterpret_cast<ProviderInfo_CUDA*>(provider->GetInfo());
LOGS_DEFAULT(WARNING) << "GetProviderInfo_CUDA called, returning nullptr";

return nullptr;
}

ProviderInfo_CUDA& GetProviderInfo_CUDA() {
Contributor:
Is there a better error message we can give to users? Even if the user enables the CUDA provider, this might fail, and the error message does not make it clear why. The user cannot take any action other than filing a GitHub issue, which increases support costs.

Member:
Second this. We need to somehow propagate the actual error.

Member Author:
I worked on improving the error messages. It used to show this on failure:

"2021-06-09 18:24:21.8575539 [E:onnxruntime:Default, provider_bridge_ort.cc:1000 onnxruntime::ProviderLibrary::Get] Failed to load library, error code: 126"

With these changes, it now shows this:

2021-06-09 18:54:17.7950730 [E:onnxruntime:Default, provider_bridge_ort.cc:1000 onnxruntime::ProviderLibrary::Get] LoadLibrary failed with error 126 "The specified module could not be found." when trying to load "C:\Code\Github\onnxruntime\build\Windows\Debug\Debug\onnxruntime\capi\onnxruntime_providers_cuda.dll"

So on Windows the error code is now translated into readable text instead of just a number.
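For context, a minimal sketch of how a Win32 error code such as 126 can be turned into readable text (an illustration of the mechanism only, assuming the standard FormatMessageA API; this is not the exact code added by this PR):

```cpp
#include <windows.h>
#include <string>

// Translate a Win32 error code (e.g. 126 from LoadLibrary) into a readable
// message such as "The specified module could not be found."
std::string Win32ErrorMessage(DWORD error_code) {
  char* buffer = nullptr;
  const DWORD len = ::FormatMessageA(
      FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
      nullptr, error_code, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
      reinterpret_cast<LPSTR>(&buffer), 0, nullptr);
  std::string message = (len != 0) ? std::string(buffer, len) : "unknown error";
  if (buffer != nullptr) ::LocalFree(buffer);
  return message;
}
```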

if(auto* info = TryGetProviderInfo_CUDA())
return *info;

ORT_THROW("CUDA Provider not available, can't get interface for it");
}
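A hedged usage sketch of the two lookup styles introduced above (illustrative only; the device id and allocator name below are placeholders, not taken from this PR):

```cpp
// Probe-style caller: CUDA is optional, so check for nullptr and fall back.
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA()) {
  auto pinned_allocator = info->CreateCUDAPinnedAllocator(/*device_id=*/0, "CudaPinned");
  // ... use pinned_allocator ...
}

// Hard-requirement caller: the reference-returning variant throws via ORT_THROW
// when the CUDA provider library cannot be loaded.
onnxruntime::ProviderInfo_CUDA& cuda_info = onnxruntime::GetProviderInfo_CUDA();
```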

@@ -1108,21 +1114,21 @@ void CopyGpuToCpu(
const size_t size,
const OrtMemoryInfo& dst_location,
const OrtMemoryInfo& src_location) {
if (auto* info = onnxruntime::GetProviderInfo_CUDA())
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
return info->CopyGpuToCpu(dst_ptr, src_ptr, size, dst_location, src_location);
ORT_THROW("GPU-to-CPU copy is not implemented.");
}

void cudaMemcpy_HostToDevice(void* dst, const void* src, size_t count) {
if (auto* info = onnxruntime::GetProviderInfo_CUDA())
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
return info->cudaMemcpy_HostToDevice(dst, src, count);
ORT_THROW("cudaMemcpy_HostToDevice is not implemented.");
}

#if defined(USE_CUDA) && defined(ORT_USE_NCCL) && defined(USE_NCCL_P2P)
namespace cuda {
INcclService& INcclService::GetInstance() {
return GetProviderInfo_CUDA()->GetINcclService();
return GetProviderInfo_CUDA().GetINcclService();
}
} // namespace cuda
#endif
@@ -1192,17 +1198,17 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessi

ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) {
API_IMPL_BEGIN
if (auto* info = onnxruntime::GetProviderInfo_CUDA())
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
return info->SetCurrentGpuDeviceId(device_id);
return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
API_IMPL_END
}

ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) {
API_IMPL_BEGIN
if (auto* info = onnxruntime::GetProviderInfo_CUDA())
if (auto* info = onnxruntime::TryGetProviderInfo_CUDA())
return info->GetCurrentGpuDeviceId(device_id);
return CreateStatus(ORT_FAIL, "CUDA execution provider is not enabled.");
return CreateStatus(ORT_FAIL, "CUDA execution provider is either not enabled or not available.");
API_IMPL_END
}
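A caller-side sketch of how the updated status message surfaces through the C API (illustrative only; assumes the standard OrtApi bootstrap and is not code from this PR):

```cpp
#include <cstdio>
#include "onnxruntime_c_api.h"

int main() {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
  // Without the CUDA provider enabled, or when its shared library fails to load,
  // this now reports "CUDA execution provider is either not enabled or not available."
  if (OrtStatus* status = api->SetCurrentGpuDeviceId(0)) {
    std::fprintf(stderr, "%s\n", api->GetErrorMessage(status));
    api->ReleaseStatus(status);
    return 1;
  }
  return 0;
}
```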

1 change: 1 addition & 0 deletions onnxruntime/core/platform/windows/env.cc
@@ -40,6 +40,7 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase;
namespace onnxruntime {

namespace {

class WindowsThread : public EnvThread {
private:
struct Param {
46 changes: 0 additions & 46 deletions onnxruntime/python/_pybind_state.py
@@ -7,58 +7,12 @@
"""
import os
import platform
import sys

from . import _ld_preload # noqa: F401

if platform.system() == "Windows":
from . import version_info

if version_info.use_cuda:
cuda_version_major, cuda_version_minor = version_info.cuda_version.split(".")
if int(cuda_version_major) < 11:
# Prior to CUDA 11 both major and minor version at build time/runtime have to match.
cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{cuda_version_minor}"
if cuda_env_variable not in os.environ:
raise ImportError(f"CUDA Toolkit {version_info.cuda_version} not installed on the machine.")
else:
# With CUDA 11 and newer only the major version at build time/runtime has to match.
# Use the most recent minor version available.
cuda_env_variable = None
for i in range(9, -1, -1):
if f"CUDA_PATH_V{cuda_version_major}_{i}" in os.environ:
cuda_env_variable = f"CUDA_PATH_V{cuda_version_major}_{i}"
break
if not cuda_env_variable:
raise ImportError(f"CUDA Toolkit {cuda_version_major}.x not installed on the machine.")

cuda_bin_dir = os.path.join(os.environ[cuda_env_variable], "bin")

# prefer CUDNN_HOME if set. fallback to the CUDA install directory (would have required user to manually
# copy the cudnn dll there
cudnn_path = os.environ["CUDNN_HOME"] if "CUDNN_HOME" in os.environ else os.environ[cuda_env_variable]
cudnn_bin_dir = os.path.join(cudnn_path, "bin")

if not os.path.isfile(os.path.join(cudnn_bin_dir, f"cudnn64_{version_info.cudnn_version}.dll")):
raise ImportError(f"cuDNN {version_info.cudnn_version} not installed in {cudnn_bin_dir}. "
f"Set the CUDNN_HOME environment variable to the path of the 'cuda' directory "
f"in your CUDNN installation if necessary.")

if sys.version_info >= (3, 8):
# Python 3.8 (and later) doesn't search system PATH when loading DLLs, so the CUDA location needs to be
# specified explicitly using the new API introduced in Python 3.8.
os.add_dll_directory(cuda_bin_dir)
cuda_root = os.path.join(cuda_bin_dir, "..", "..")
for root, _, files in os.walk(cuda_root):
for f in files:
if f == "cupti.lib":
os.add_dll_directory(root)
else:
# Python 3.7 (and earlier) searches directories listed in PATH variable.
# Make sure that the target CUDA version is at the beginning (important if multiple CUDA versions are
# installed on the machine.)
os.environ["PATH"] = cuda_bin_dir + os.pathsep + os.environ["PATH"]

if version_info.vs2019 and platform.architecture()[0] == "64bit":
if not os.path.isfile("C:\\Windows\\System32\\vcruntime140_1.dll"):
raise ImportError(
8 changes: 4 additions & 4 deletions onnxruntime/python/onnxruntime_pybind_mlvalue.cc
@@ -67,11 +67,11 @@ void CpuToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {

#ifdef USE_CUDA
void CpuToCudaMemCpy(void* dst, const void* src, size_t num_bytes) {
GetProviderInfo_CUDA()->cudaMemcpy_HostToDevice(dst, src, num_bytes);
GetProviderInfo_CUDA().cudaMemcpy_HostToDevice(dst, src, num_bytes);
}

void CudaToCpuMemCpy(void* dst, const void* src, size_t num_bytes) {
GetProviderInfo_CUDA()->cudaMemcpy_DeviceToHost(dst, src, num_bytes);
GetProviderInfo_CUDA().cudaMemcpy_DeviceToHost(dst, src, num_bytes);
}

const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpyFunction() {
@@ -82,7 +82,7 @@ const std::unordered_map<OrtDevice::DeviceType, MemCpyFunc>* GetCudaToHostMemCpy
}

bool IsCudaDeviceIdValid(const onnxruntime::logging::Logger& logger, int id) {
int num_devices = GetProviderInfo_CUDA()->cudaGetDeviceCount();
int num_devices = GetProviderInfo_CUDA().cudaGetDeviceCount();

if (0 == num_devices) {
LOGS(logger, WARNING) << "your system does not have a CUDA capable device.";
@@ -105,7 +105,7 @@ AllocatorPtr GetCudaAllocator(OrtDevice::DeviceId id) {

if (id_to_allocator_map->find(id) == id_to_allocator_map->end()) {
// TODO: Expose knobs so that users can set fields associated with OrtArenaCfg so that we can pass it to the following method
id_to_allocator_map->insert({id, GetProviderInfo_CUDA()->CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
id_to_allocator_map->insert({id, GetProviderInfo_CUDA().CreateCudaAllocator(id, gpu_mem_limit, arena_extend_strategy, external_allocator_info, nullptr)});
}

return (*id_to_allocator_map)[id];
43 changes: 26 additions & 17 deletions onnxruntime/python/onnxruntime_pybind_state.cc
@@ -470,24 +470,33 @@ static void RegisterExecutionProviders(InferenceSession* sess, const std::vector
#endif
} else if (type == kCudaExecutionProvider) {
#ifdef USE_CUDA
const auto it = provider_options_map.find(type);
CUDAExecutionProviderInfo info{};
if (it != provider_options_map.end())
GetProviderInfo_CUDA()->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
else {
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.cudnn_conv_algo_search = cudnn_conv_algo_search;
info.do_copy_in_default_stream = do_copy_in_default_stream;
info.external_allocator_info = external_allocator_info;
}
if(auto* cuda_provider_info = TryGetProviderInfo_CUDA())
{
const auto it = provider_options_map.find(type);
CUDAExecutionProviderInfo info{};
if (it != provider_options_map.end())
cuda_provider_info->CUDAExecutionProviderInfo__FromProviderOptions(it->second, info);
else {
info.device_id = cuda_device_id;
info.gpu_mem_limit = gpu_mem_limit;
info.arena_extend_strategy = arena_extend_strategy;
info.cudnn_conv_algo_search = cudnn_conv_algo_search;
info.do_copy_in_default_stream = do_copy_in_default_stream;
info.external_allocator_info = external_allocator_info;
}

// This variable is never initialized because the APIs by which it should be initialized are deprecated, however they still
// exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
// since FromProviderOptions might contain an external CUDA allocator.
external_allocator_info = info.external_allocator_info;
RegisterExecutionProvider(sess, *GetProviderInfo_CUDA()->CreateExecutionProviderFactory(info));
// This variable is never initialized because the APIs by which it should be initialized are deprecated, however they still
// exist and are in use. Nevertheless, it is used to return CUDAAllocator, hence we must try to initialize it here if we can
// since FromProviderOptions might contain an external CUDA allocator.
external_allocator_info = info.external_allocator_info;
RegisterExecutionProvider(sess, *cuda_provider_info->CreateExecutionProviderFactory(info));
}
else
{
if(!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) {
Contributor:
Can you please add a note here saying why we're checking for this? Something like: we want to allow the flexibility to use a GPU package on a CPU-only machine unless the user intended to run inferencing on the GPU.

Member:
It would be nice to add something else to the message for the user to check, like making sure they have all the correct version prerequisites and that they are in their PATH.

Member Author:
Good idea, error message updated.

ORT_THROW("CUDA_PATH is set but CUDA wasn't able to be loaded. Please install the correct version of CUDA and cuDNN as mentioned in the GPU requirements page, make sure they're in the PATH, and that your GPU is supported.");
}
}
#endif
} else if (type == kRocmExecutionProvider) {
#ifdef USE_ROCM
3 changes: 2 additions & 1 deletion onnxruntime/python/onnxruntime_pybind_state_common.h
@@ -161,7 +161,8 @@ extern std::string nuphar_settings;
#if defined(USE_CUDA) || defined(USE_ROCM)
#ifdef USE_CUDA
namespace onnxruntime {
ProviderInfo_CUDA* GetProviderInfo_CUDA();
ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
ProviderInfo_CUDA& GetProviderInfo_CUDA();
namespace python {
// TODO remove deprecated global config
extern OrtCudnnConvAlgoSearch cudnn_conv_algo_search;
4 changes: 2 additions & 2 deletions onnxruntime/test/framework/inference_session_test.cc
@@ -68,7 +68,7 @@ struct KernelRegistryAndStatus {
namespace onnxruntime {

#ifdef USE_CUDA
ProviderInfo_CUDA* GetProviderInfo_CUDA();
ProviderInfo_CUDA& GetProviderInfo_CUDA();
#endif

class FuseAdd : public OpKernel {
@@ -353,7 +353,7 @@ void RunModelWithBindingMatMul(InferenceSession& session_object,
cpu_allocator);
#ifdef USE_CUDA
cudaStream_t stream = static_cast<cudaStream_t>(gpu_provider->GetComputeStream());
st = GetProviderInfo_CUDA()->CreateGPUDataTransfer(stream)->CopyTensor(rtensor, *cpu_tensor.get(), 0);
st = GetProviderInfo_CUDA().CreateGPUDataTransfer(stream)->CopyTensor(rtensor, *cpu_tensor.get(), 0);
#elif USE_ROCM
hipStream_t stream = static_cast<hipStream_t>(gpu_provider->GetComputeStream());
st = GPUDataTransfer(stream).CopyTensor(rtensor, *cpu_tensor.get(), 0);